aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit_watch.c85
-rw-r--r--kernel/cgroup.c54
-rw-r--r--kernel/compat.c136
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/futex.c147
-rw-r--r--kernel/hrtimer.c90
-rw-r--r--kernel/irq/Kconfig39
-rw-r--r--kernel/irq/autoprobe.c54
-rw-r--r--kernel/irq/chip.c483
-rw-r--r--kernel/irq/compat.h72
-rw-r--r--kernel/irq/debug.h40
-rw-r--r--kernel/irq/handle.c144
-rw-r--r--kernel/irq/internals.h167
-rw-r--r--kernel/irq/irqdesc.c68
-rw-r--r--kernel/irq/manage.c602
-rw-r--r--kernel/irq/migration.c38
-rw-r--r--kernel/irq/pm.c30
-rw-r--r--kernel/irq/proc.c70
-rw-r--r--kernel/irq/resend.c17
-rw-r--r--kernel/irq/settings.h138
-rw-r--r--kernel/irq/spurious.c163
-rw-r--r--kernel/perf_event.c1004
-rw-r--r--kernel/posix-cpu-timers.c110
-rw-r--r--kernel/posix-timers.c342
-rw-r--r--kernel/rcupdate.c10
-rw-r--r--kernel/rcutiny_plugin.h2
-rw-r--r--kernel/rcutorture.c1
-rw-r--r--kernel/rtmutex-debug.c1
-rw-r--r--kernel/rtmutex-tester.c40
-rw-r--r--kernel/rtmutex.c318
-rw-r--r--kernel/rtmutex_common.h16
-rw-r--r--kernel/sched.c338
-rw-r--r--kernel/sched_autogroup.c15
-rw-r--r--kernel/sched_autogroup.h5
-rw-r--r--kernel/sched_debug.c2
-rw-r--r--kernel/sched_fair.c397
-rw-r--r--kernel/sched_idletask.c26
-rw-r--r--kernel/sched_rt.c19
-rw-r--r--kernel/sched_stoptask.c7
-rw-r--r--kernel/softirq.c24
-rw-r--r--kernel/sys_ni.c5
-rw-r--r--kernel/sysctl.c11
-rw-r--r--kernel/sysctl_binary.c19
-rw-r--r--kernel/time.c35
-rw-r--r--kernel/time/Makefile3
-rw-r--r--kernel/time/clockevents.c1
-rw-r--r--kernel/time/jiffies.c20
-rw-r--r--kernel/time/ntp.c13
-rw-r--r--kernel/time/posix-clock.c451
-rw-r--r--kernel/time/tick-broadcast.c1
-rw-r--r--kernel/time/tick-common.c1
-rw-r--r--kernel/time/tick-internal.h9
-rw-r--r--kernel/time/tick-oneshot.c1
-rw-r--r--kernel/time/tick-sched.c1
-rw-r--r--kernel/time/timekeeping.c141
-rw-r--r--kernel/timer.c42
-rw-r--r--kernel/trace/ftrace.c52
-rw-r--r--kernel/trace/ring_buffer.c24
-rw-r--r--kernel/trace/trace.c38
-rw-r--r--kernel/trace/trace.h41
-rw-r--r--kernel/trace/trace_entries.h6
-rw-r--r--kernel/trace/trace_events.c2
-rw-r--r--kernel/trace/trace_events_filter.c885
-rw-r--r--kernel/trace/trace_kprobe.c111
-rw-r--r--kernel/trace/trace_output.c36
-rw-r--r--kernel/trace/trace_sched_switch.c48
-rw-r--r--kernel/trace/trace_syscalls.c42
-rw-r--r--kernel/workqueue.c6
68 files changed, 5161 insertions, 2200 deletions
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d2e3c7866460..e683869365d9 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
144} 144}
145 145
146/* Initialize a parent watch entry. */ 146/* Initialize a parent watch entry. */
147static struct audit_parent *audit_init_parent(struct nameidata *ndp) 147static struct audit_parent *audit_init_parent(struct path *path)
148{ 148{
149 struct inode *inode = ndp->path.dentry->d_inode; 149 struct inode *inode = path->dentry->d_inode;
150 struct audit_parent *parent; 150 struct audit_parent *parent;
151 int ret; 151 int ret;
152 152
@@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
353} 353}
354 354
355/* Get path information necessary for adding watches. */ 355/* Get path information necessary for adding watches. */
356static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) 356static int audit_get_nd(struct audit_watch *watch, struct path *parent)
357{ 357{
358 struct nameidata *ndparent, *ndwatch; 358 struct nameidata nd;
359 struct dentry *d;
359 int err; 360 int err;
360 361
361 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); 362 err = kern_path_parent(watch->path, &nd);
362 if (unlikely(!ndparent)) 363 if (err)
363 return -ENOMEM; 364 return err;
364 365
365 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); 366 if (nd.last_type != LAST_NORM) {
366 if (unlikely(!ndwatch)) { 367 path_put(&nd.path);
367 kfree(ndparent); 368 return -EINVAL;
368 return -ENOMEM;
369 } 369 }
370 370
371 err = path_lookup(path, LOOKUP_PARENT, ndparent); 371 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
372 if (err) { 372 d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
373 kfree(ndparent); 373 if (IS_ERR(d)) {
374 kfree(ndwatch); 374 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
375 return err; 375 path_put(&nd.path);
376 return PTR_ERR(d);
376 } 377 }
377 378 if (d->d_inode) {
378 err = path_lookup(path, 0, ndwatch); 379 /* update watch filter fields */
379 if (err) { 380 watch->dev = d->d_inode->i_sb->s_dev;
380 kfree(ndwatch); 381 watch->ino = d->d_inode->i_ino;
381 ndwatch = NULL;
382 } 382 }
383 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
383 384
384 *ndp = ndparent; 385 *parent = nd.path;
385 *ndw = ndwatch; 386 dput(d);
386
387 return 0; 387 return 0;
388} 388}
389 389
390/* Release resources used for watch path information. */
391static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
392{
393 if (ndp) {
394 path_put(&ndp->path);
395 kfree(ndp);
396 }
397 if (ndw) {
398 path_put(&ndw->path);
399 kfree(ndw);
400 }
401}
402
403/* Associate the given rule with an existing parent. 390/* Associate the given rule with an existing parent.
404 * Caller must hold audit_filter_mutex. */ 391 * Caller must hold audit_filter_mutex. */
405static void audit_add_to_parent(struct audit_krule *krule, 392static void audit_add_to_parent(struct audit_krule *krule,
@@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
440{ 427{
441 struct audit_watch *watch = krule->watch; 428 struct audit_watch *watch = krule->watch;
442 struct audit_parent *parent; 429 struct audit_parent *parent;
443 struct nameidata *ndp = NULL, *ndw = NULL; 430 struct path parent_path;
444 int h, ret = 0; 431 int h, ret = 0;
445 432
446 mutex_unlock(&audit_filter_mutex); 433 mutex_unlock(&audit_filter_mutex);
447 434
448 /* Avoid calling path_lookup under audit_filter_mutex. */ 435 /* Avoid calling path_lookup under audit_filter_mutex. */
449 ret = audit_get_nd(watch->path, &ndp, &ndw); 436 ret = audit_get_nd(watch, &parent_path);
450 if (ret) {
451 /* caller expects mutex locked */
452 mutex_lock(&audit_filter_mutex);
453 goto error;
454 }
455 437
438 /* caller expects mutex locked */
456 mutex_lock(&audit_filter_mutex); 439 mutex_lock(&audit_filter_mutex);
457 440
458 /* update watch filter fields */ 441 if (ret)
459 if (ndw) { 442 return ret;
460 watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
461 watch->ino = ndw->path.dentry->d_inode->i_ino;
462 }
463 443
464 /* either find an old parent or attach a new one */ 444 /* either find an old parent or attach a new one */
465 parent = audit_find_parent(ndp->path.dentry->d_inode); 445 parent = audit_find_parent(parent_path.dentry->d_inode);
466 if (!parent) { 446 if (!parent) {
467 parent = audit_init_parent(ndp); 447 parent = audit_init_parent(&parent_path);
468 if (IS_ERR(parent)) { 448 if (IS_ERR(parent)) {
469 ret = PTR_ERR(parent); 449 ret = PTR_ERR(parent);
470 goto error; 450 goto error;
@@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
479 h = audit_hash_ino((u32)watch->ino); 459 h = audit_hash_ino((u32)watch->ino);
480 *list = &audit_inode_hash[h]; 460 *list = &audit_inode_hash[h];
481error: 461error:
482 audit_put_nd(ndp, ndw); /* NULL args OK */ 462 path_put(&parent_path);
483 return ret; 463 return ret;
484
485} 464}
486 465
487void audit_remove_watch_rule(struct audit_krule *krule) 466void audit_remove_watch_rule(struct audit_krule *krule)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b24d7027b83c..95362d15128c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child)
4230 */ 4230 */
4231void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4231void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4232{ 4232{
4233 int i;
4234 struct css_set *cg; 4233 struct css_set *cg;
4235 4234 int i;
4236 if (run_callbacks && need_forkexit_callback) {
4237 /*
4238 * modular subsystems can't use callbacks, so no need to lock
4239 * the subsys array
4240 */
4241 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4242 struct cgroup_subsys *ss = subsys[i];
4243 if (ss->exit)
4244 ss->exit(ss, tsk);
4245 }
4246 }
4247 4235
4248 /* 4236 /*
4249 * Unlink from the css_set task list if necessary. 4237 * Unlink from the css_set task list if necessary.
@@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4261 task_lock(tsk); 4249 task_lock(tsk);
4262 cg = tsk->cgroups; 4250 cg = tsk->cgroups;
4263 tsk->cgroups = &init_css_set; 4251 tsk->cgroups = &init_css_set;
4252
4253 if (run_callbacks && need_forkexit_callback) {
4254 /*
4255 * modular subsystems can't use callbacks, so no need to lock
4256 * the subsys array
4257 */
4258 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4259 struct cgroup_subsys *ss = subsys[i];
4260 if (ss->exit) {
4261 struct cgroup *old_cgrp =
4262 rcu_dereference_raw(cg->subsys[i])->cgroup;
4263 struct cgroup *cgrp = task_cgroup(tsk, i);
4264 ss->exit(ss, cgrp, old_cgrp, tsk);
4265 }
4266 }
4267 }
4264 task_unlock(tsk); 4268 task_unlock(tsk);
4269
4265 if (cg) 4270 if (cg)
4266 put_css_set_taskexit(cg); 4271 put_css_set_taskexit(cg);
4267} 4272}
@@ -4813,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
4813 return ret; 4818 return ret;
4814} 4819}
4815 4820
4821/*
4822 * get corresponding css from file open on cgroupfs directory
4823 */
4824struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
4825{
4826 struct cgroup *cgrp;
4827 struct inode *inode;
4828 struct cgroup_subsys_state *css;
4829
4830 inode = f->f_dentry->d_inode;
4831 /* check in cgroup filesystem dir */
4832 if (inode->i_op != &cgroup_dir_inode_operations)
4833 return ERR_PTR(-EBADF);
4834
4835 if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
4836 return ERR_PTR(-EINVAL);
4837
4838 /* get cgroup */
4839 cgrp = __d_cgrp(f->f_dentry);
4840 css = cgrp->subsys[id];
4841 return css ? css : ERR_PTR(-ENOENT);
4842}
4843
4816#ifdef CONFIG_CGROUP_DEBUG 4844#ifdef CONFIG_CGROUP_DEBUG
4817static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 4845static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4818 struct cgroup *cont) 4846 struct cgroup *cont)
diff --git a/kernel/compat.c b/kernel/compat.c
index c9e2ec0b34a8..38b1d2c1cbe8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o,
52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; 52 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
53} 53}
54 54
55static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
56{
57 memset(txc, 0, sizeof(struct timex));
58
59 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
60 __get_user(txc->modes, &utp->modes) ||
61 __get_user(txc->offset, &utp->offset) ||
62 __get_user(txc->freq, &utp->freq) ||
63 __get_user(txc->maxerror, &utp->maxerror) ||
64 __get_user(txc->esterror, &utp->esterror) ||
65 __get_user(txc->status, &utp->status) ||
66 __get_user(txc->constant, &utp->constant) ||
67 __get_user(txc->precision, &utp->precision) ||
68 __get_user(txc->tolerance, &utp->tolerance) ||
69 __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
70 __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
71 __get_user(txc->tick, &utp->tick) ||
72 __get_user(txc->ppsfreq, &utp->ppsfreq) ||
73 __get_user(txc->jitter, &utp->jitter) ||
74 __get_user(txc->shift, &utp->shift) ||
75 __get_user(txc->stabil, &utp->stabil) ||
76 __get_user(txc->jitcnt, &utp->jitcnt) ||
77 __get_user(txc->calcnt, &utp->calcnt) ||
78 __get_user(txc->errcnt, &utp->errcnt) ||
79 __get_user(txc->stbcnt, &utp->stbcnt))
80 return -EFAULT;
81
82 return 0;
83}
84
85static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
86{
87 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
88 __put_user(txc->modes, &utp->modes) ||
89 __put_user(txc->offset, &utp->offset) ||
90 __put_user(txc->freq, &utp->freq) ||
91 __put_user(txc->maxerror, &utp->maxerror) ||
92 __put_user(txc->esterror, &utp->esterror) ||
93 __put_user(txc->status, &utp->status) ||
94 __put_user(txc->constant, &utp->constant) ||
95 __put_user(txc->precision, &utp->precision) ||
96 __put_user(txc->tolerance, &utp->tolerance) ||
97 __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
98 __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
99 __put_user(txc->tick, &utp->tick) ||
100 __put_user(txc->ppsfreq, &utp->ppsfreq) ||
101 __put_user(txc->jitter, &utp->jitter) ||
102 __put_user(txc->shift, &utp->shift) ||
103 __put_user(txc->stabil, &utp->stabil) ||
104 __put_user(txc->jitcnt, &utp->jitcnt) ||
105 __put_user(txc->calcnt, &utp->calcnt) ||
106 __put_user(txc->errcnt, &utp->errcnt) ||
107 __put_user(txc->stbcnt, &utp->stbcnt) ||
108 __put_user(txc->tai, &utp->tai))
109 return -EFAULT;
110 return 0;
111}
112
55asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, 113asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
56 struct timezone __user *tz) 114 struct timezone __user *tz)
57{ 115{
@@ -617,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock,
617 return err; 675 return err;
618} 676}
619 677
678long compat_sys_clock_adjtime(clockid_t which_clock,
679 struct compat_timex __user *utp)
680{
681 struct timex txc;
682 mm_segment_t oldfs;
683 int err, ret;
684
685 err = compat_get_timex(&txc, utp);
686 if (err)
687 return err;
688
689 oldfs = get_fs();
690 set_fs(KERNEL_DS);
691 ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
692 set_fs(oldfs);
693
694 err = compat_put_timex(utp, &txc);
695 if (err)
696 return err;
697
698 return ret;
699}
700
620long compat_sys_clock_getres(clockid_t which_clock, 701long compat_sys_clock_getres(clockid_t which_clock,
621 struct compat_timespec __user *tp) 702 struct compat_timespec __user *tp)
622{ 703{
@@ -951,58 +1032,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
951asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) 1032asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
952{ 1033{
953 struct timex txc; 1034 struct timex txc;
954 int ret; 1035 int err, ret;
955
956 memset(&txc, 0, sizeof(struct timex));
957 1036
958 if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || 1037 err = compat_get_timex(&txc, utp);
959 __get_user(txc.modes, &utp->modes) || 1038 if (err)
960 __get_user(txc.offset, &utp->offset) || 1039 return err;
961 __get_user(txc.freq, &utp->freq) ||
962 __get_user(txc.maxerror, &utp->maxerror) ||
963 __get_user(txc.esterror, &utp->esterror) ||
964 __get_user(txc.status, &utp->status) ||
965 __get_user(txc.constant, &utp->constant) ||
966 __get_user(txc.precision, &utp->precision) ||
967 __get_user(txc.tolerance, &utp->tolerance) ||
968 __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
969 __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
970 __get_user(txc.tick, &utp->tick) ||
971 __get_user(txc.ppsfreq, &utp->ppsfreq) ||
972 __get_user(txc.jitter, &utp->jitter) ||
973 __get_user(txc.shift, &utp->shift) ||
974 __get_user(txc.stabil, &utp->stabil) ||
975 __get_user(txc.jitcnt, &utp->jitcnt) ||
976 __get_user(txc.calcnt, &utp->calcnt) ||
977 __get_user(txc.errcnt, &utp->errcnt) ||
978 __get_user(txc.stbcnt, &utp->stbcnt))
979 return -EFAULT;
980 1040
981 ret = do_adjtimex(&txc); 1041 ret = do_adjtimex(&txc);
982 1042
983 if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || 1043 err = compat_put_timex(utp, &txc);
984 __put_user(txc.modes, &utp->modes) || 1044 if (err)
985 __put_user(txc.offset, &utp->offset) || 1045 return err;
986 __put_user(txc.freq, &utp->freq) ||
987 __put_user(txc.maxerror, &utp->maxerror) ||
988 __put_user(txc.esterror, &utp->esterror) ||
989 __put_user(txc.status, &utp->status) ||
990 __put_user(txc.constant, &utp->constant) ||
991 __put_user(txc.precision, &utp->precision) ||
992 __put_user(txc.tolerance, &utp->tolerance) ||
993 __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
994 __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
995 __put_user(txc.tick, &utp->tick) ||
996 __put_user(txc.ppsfreq, &utp->ppsfreq) ||
997 __put_user(txc.jitter, &utp->jitter) ||
998 __put_user(txc.shift, &utp->shift) ||
999 __put_user(txc.stabil, &utp->stabil) ||
1000 __put_user(txc.jitcnt, &utp->jitcnt) ||
1001 __put_user(txc.calcnt, &utp->calcnt) ||
1002 __put_user(txc.errcnt, &utp->errcnt) ||
1003 __put_user(txc.stbcnt, &utp->stbcnt) ||
1004 __put_user(txc.tai, &utp->tai))
1005 ret = -EFAULT;
1006 1046
1007 return ret; 1047 return ret;
1008} 1048}
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a9d6dd53a6c..2343c132c5a7 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
35static struct thread_group_cred init_tgcred = { 35static struct thread_group_cred init_tgcred = {
36 .usage = ATOMIC_INIT(2), 36 .usage = ATOMIC_INIT(2),
37 .tgid = 0, 37 .tgid = 0,
38 .lock = SPIN_LOCK_UNLOCKED, 38 .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
39}; 39};
40#endif 40#endif
41 41
diff --git a/kernel/futex.c b/kernel/futex.c
index b766d28accd6..bda415715382 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
381 return NULL; 381 return NULL;
382} 382}
383 383
384static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 384static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
385 u32 uval, u32 newval)
385{ 386{
386 u32 curval; 387 int ret;
387 388
388 pagefault_disable(); 389 pagefault_disable();
389 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 390 ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
390 pagefault_enable(); 391 pagefault_enable();
391 392
392 return curval; 393 return ret;
393} 394}
394 395
395static int get_futex_value_locked(u32 *dest, u32 __user *from) 396static int get_futex_value_locked(u32 *dest, u32 __user *from)
@@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
674 struct task_struct *task, int set_waiters) 675 struct task_struct *task, int set_waiters)
675{ 676{
676 int lock_taken, ret, ownerdied = 0; 677 int lock_taken, ret, ownerdied = 0;
677 u32 uval, newval, curval; 678 u32 uval, newval, curval, vpid = task_pid_vnr(task);
678 679
679retry: 680retry:
680 ret = lock_taken = 0; 681 ret = lock_taken = 0;
@@ -684,19 +685,17 @@ retry:
684 * (by doing a 0 -> TID atomic cmpxchg), while holding all 685 * (by doing a 0 -> TID atomic cmpxchg), while holding all
685 * the locks. It will most likely not succeed. 686 * the locks. It will most likely not succeed.
686 */ 687 */
687 newval = task_pid_vnr(task); 688 newval = vpid;
688 if (set_waiters) 689 if (set_waiters)
689 newval |= FUTEX_WAITERS; 690 newval |= FUTEX_WAITERS;
690 691
691 curval = cmpxchg_futex_value_locked(uaddr, 0, newval); 692 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
692
693 if (unlikely(curval == -EFAULT))
694 return -EFAULT; 693 return -EFAULT;
695 694
696 /* 695 /*
697 * Detect deadlocks. 696 * Detect deadlocks.
698 */ 697 */
699 if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) 698 if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
700 return -EDEADLK; 699 return -EDEADLK;
701 700
702 /* 701 /*
@@ -723,14 +722,12 @@ retry:
723 */ 722 */
724 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { 723 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
725 /* Keep the OWNER_DIED bit */ 724 /* Keep the OWNER_DIED bit */
726 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); 725 newval = (curval & ~FUTEX_TID_MASK) | vpid;
727 ownerdied = 0; 726 ownerdied = 0;
728 lock_taken = 1; 727 lock_taken = 1;
729 } 728 }
730 729
731 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 730 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
732
733 if (unlikely(curval == -EFAULT))
734 return -EFAULT; 731 return -EFAULT;
735 if (unlikely(curval != uval)) 732 if (unlikely(curval != uval))
736 goto retry; 733 goto retry;
@@ -775,6 +772,24 @@ retry:
775 return ret; 772 return ret;
776} 773}
777 774
775/**
776 * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
777 * @q: The futex_q to unqueue
778 *
779 * The q->lock_ptr must not be NULL and must be held by the caller.
780 */
781static void __unqueue_futex(struct futex_q *q)
782{
783 struct futex_hash_bucket *hb;
784
785 if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr)
786 || plist_node_empty(&q->list)))
787 return;
788
789 hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
790 plist_del(&q->list, &hb->chain);
791}
792
778/* 793/*
779 * The hash bucket lock must be held when this is called. 794 * The hash bucket lock must be held when this is called.
780 * Afterwards, the futex_q must not be accessed. 795 * Afterwards, the futex_q must not be accessed.
@@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q)
792 */ 807 */
793 get_task_struct(p); 808 get_task_struct(p);
794 809
795 plist_del(&q->list, &q->list.plist); 810 __unqueue_futex(q);
796 /* 811 /*
797 * The waiting task can free the futex_q as soon as 812 * The waiting task can free the futex_q as soon as
798 * q->lock_ptr = NULL is written, without taking any locks. A 813 * q->lock_ptr = NULL is written, without taking any locks. A
@@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
843 858
844 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 859 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
845 860
846 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 861 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
847
848 if (curval == -EFAULT)
849 ret = -EFAULT; 862 ret = -EFAULT;
850 else if (curval != uval) 863 else if (curval != uval)
851 ret = -EINVAL; 864 ret = -EINVAL;
@@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
880 * There is no waiter, so we unlock the futex. The owner died 893 * There is no waiter, so we unlock the futex. The owner died
881 * bit has not to be preserved here. We are the owner: 894 * bit has not to be preserved here. We are the owner:
882 */ 895 */
883 oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); 896 if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
884 897 return -EFAULT;
885 if (oldval == -EFAULT)
886 return oldval;
887 if (oldval != uval) 898 if (oldval != uval)
888 return -EAGAIN; 899 return -EAGAIN;
889 900
@@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1071 plist_del(&q->list, &hb1->chain); 1082 plist_del(&q->list, &hb1->chain);
1072 plist_add(&q->list, &hb2->chain); 1083 plist_add(&q->list, &hb2->chain);
1073 q->lock_ptr = &hb2->lock; 1084 q->lock_ptr = &hb2->lock;
1074#ifdef CONFIG_DEBUG_PI_LIST
1075 q->list.plist.spinlock = &hb2->lock;
1076#endif
1077 } 1085 }
1078 get_futex_key_refs(key2); 1086 get_futex_key_refs(key2);
1079 q->key = *key2; 1087 q->key = *key2;
@@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1100 get_futex_key_refs(key); 1108 get_futex_key_refs(key);
1101 q->key = *key; 1109 q->key = *key;
1102 1110
1103 WARN_ON(plist_node_empty(&q->list)); 1111 __unqueue_futex(q);
1104 plist_del(&q->list, &q->list.plist);
1105 1112
1106 WARN_ON(!q->rt_waiter); 1113 WARN_ON(!q->rt_waiter);
1107 q->rt_waiter = NULL; 1114 q->rt_waiter = NULL;
1108 1115
1109 q->lock_ptr = &hb->lock; 1116 q->lock_ptr = &hb->lock;
1110#ifdef CONFIG_DEBUG_PI_LIST
1111 q->list.plist.spinlock = &hb->lock;
1112#endif
1113 1117
1114 wake_up_state(q->task, TASK_NORMAL); 1118 wake_up_state(q->task, TASK_NORMAL);
1115} 1119}
@@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1457 prio = min(current->normal_prio, MAX_RT_PRIO); 1461 prio = min(current->normal_prio, MAX_RT_PRIO);
1458 1462
1459 plist_node_init(&q->list, prio); 1463 plist_node_init(&q->list, prio);
1460#ifdef CONFIG_DEBUG_PI_LIST
1461 q->list.plist.spinlock = &hb->lock;
1462#endif
1463 plist_add(&q->list, &hb->chain); 1464 plist_add(&q->list, &hb->chain);
1464 q->task = current; 1465 q->task = current;
1465 spin_unlock(&hb->lock); 1466 spin_unlock(&hb->lock);
@@ -1504,8 +1505,7 @@ retry:
1504 spin_unlock(lock_ptr); 1505 spin_unlock(lock_ptr);
1505 goto retry; 1506 goto retry;
1506 } 1507 }
1507 WARN_ON(plist_node_empty(&q->list)); 1508 __unqueue_futex(q);
1508 plist_del(&q->list, &q->list.plist);
1509 1509
1510 BUG_ON(q->pi_state); 1510 BUG_ON(q->pi_state);
1511 1511
@@ -1525,8 +1525,7 @@ retry:
1525static void unqueue_me_pi(struct futex_q *q) 1525static void unqueue_me_pi(struct futex_q *q)
1526 __releases(q->lock_ptr) 1526 __releases(q->lock_ptr)
1527{ 1527{
1528 WARN_ON(plist_node_empty(&q->list)); 1528 __unqueue_futex(q);
1529 plist_del(&q->list, &q->list.plist);
1530 1529
1531 BUG_ON(!q->pi_state); 1530 BUG_ON(!q->pi_state);
1532 free_pi_state(q->pi_state); 1531 free_pi_state(q->pi_state);
@@ -1556,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1556 1555
1557 /* 1556 /*
1558 * We are here either because we stole the rtmutex from the 1557 * We are here either because we stole the rtmutex from the
1559 * pending owner or we are the pending owner which failed to 1558 * previous highest priority waiter or we are the highest priority
1560 * get the rtmutex. We have to replace the pending owner TID 1559 * waiter but failed to get the rtmutex the first time.
1561 * in the user space variable. This must be atomic as we have 1560 * We have to replace the newowner TID in the user space variable.
1562 * to preserve the owner died bit here. 1561 * This must be atomic as we have to preserve the owner died bit here.
1563 * 1562 *
1564 * Note: We write the user space value _before_ changing the pi_state 1563 * Note: We write the user space value _before_ changing the pi_state
1565 * because we can fault here. Imagine swapped out pages or a fork 1564 * because we can fault here. Imagine swapped out pages or a fork
@@ -1578,9 +1577,7 @@ retry:
1578 while (1) { 1577 while (1) {
1579 newval = (uval & FUTEX_OWNER_DIED) | newtid; 1578 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1580 1579
1581 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 1580 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1582
1583 if (curval == -EFAULT)
1584 goto handle_fault; 1581 goto handle_fault;
1585 if (curval == uval) 1582 if (curval == uval)
1586 break; 1583 break;
@@ -1608,8 +1605,8 @@ retry:
1608 1605
1609 /* 1606 /*
1610 * To handle the page fault we need to drop the hash bucket 1607 * To handle the page fault we need to drop the hash bucket
1611 * lock here. That gives the other task (either the pending 1608 * lock here. That gives the other task (either the highest priority
1612 * owner itself or the task which stole the rtmutex) the 1609 * waiter itself or the task which stole the rtmutex) the
1613 * chance to try the fixup of the pi_state. So once we are 1610 * chance to try the fixup of the pi_state. So once we are
1614 * back from handling the fault we need to check the pi_state 1611 * back from handling the fault we need to check the pi_state
1615 * after reacquiring the hash bucket lock and before trying to 1612 * after reacquiring the hash bucket lock and before trying to
@@ -1685,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1685 /* 1682 /*
1686 * pi_state is incorrect, some other task did a lock steal and 1683 * pi_state is incorrect, some other task did a lock steal and
1687 * we returned due to timeout or signal without taking the 1684 * we returned due to timeout or signal without taking the
1688 * rt_mutex. Too late. We can access the rt_mutex_owner without 1685 * rt_mutex. Too late.
1689 * locking, as the other task is now blocked on the hash bucket
1690 * lock. Fix the state up.
1691 */ 1686 */
1687 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1692 owner = rt_mutex_owner(&q->pi_state->pi_mutex); 1688 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1689 if (!owner)
1690 owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1691 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1693 ret = fixup_pi_state_owner(uaddr, q, owner); 1692 ret = fixup_pi_state_owner(uaddr, q, owner);
1694 goto out; 1693 goto out;
1695 } 1694 }
1696 1695
1697 /* 1696 /*
1698 * Paranoia check. If we did not take the lock, then we should not be 1697 * Paranoia check. If we did not take the lock, then we should not be
1699 * the owner, nor the pending owner, of the rt_mutex. 1698 * the owner of the rt_mutex.
1700 */ 1699 */
1701 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) 1700 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1702 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " 1701 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
@@ -1781,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1781 * 1780 *
1782 * The basic logical guarantee of a futex is that it blocks ONLY 1781 * The basic logical guarantee of a futex is that it blocks ONLY
1783 * if cond(var) is known to be true at the time of blocking, for 1782 * if cond(var) is known to be true at the time of blocking, for
1784 * any cond. If we queued after testing *uaddr, that would open 1783 * any cond. If we locked the hash-bucket after testing *uaddr, that
1785 * a race condition where we could block indefinitely with 1784 * would open a race condition where we could block indefinitely with
1786 * cond(var) false, which would violate the guarantee. 1785 * cond(var) false, which would violate the guarantee.
1787 * 1786 *
1788 * A consequence is that futex_wait() can return zero and absorb 1787 * On the other hand, we insert q and release the hash-bucket only
1789 * a wakeup when *uaddr != val on entry to the syscall. This is 1788 * after testing *uaddr. This guarantees that futex_wait() will NOT
1790 * rare, but normal. 1789 * absorb a wakeup if *uaddr does not match the desired values
1790 * while the syscall executes.
1791 */ 1791 */
1792retry: 1792retry:
1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); 1793 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
@@ -2046,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2046{ 2046{
2047 struct futex_hash_bucket *hb; 2047 struct futex_hash_bucket *hb;
2048 struct futex_q *this, *next; 2048 struct futex_q *this, *next;
2049 u32 uval;
2050 struct plist_head *head; 2049 struct plist_head *head;
2051 union futex_key key = FUTEX_KEY_INIT; 2050 union futex_key key = FUTEX_KEY_INIT;
2051 u32 uval, vpid = task_pid_vnr(current);
2052 int ret; 2052 int ret;
2053 2053
2054retry: 2054retry:
@@ -2057,7 +2057,7 @@ retry:
2057 /* 2057 /*
2058 * We release only a lock we actually own: 2058 * We release only a lock we actually own:
2059 */ 2059 */
2060 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2060 if ((uval & FUTEX_TID_MASK) != vpid)
2061 return -EPERM; 2061 return -EPERM;
2062 2062
2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); 2063 ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
@@ -2072,17 +2072,14 @@ retry:
2072 * again. If it succeeds then we can return without waking 2072 * again. If it succeeds then we can return without waking
2073 * anyone else up: 2073 * anyone else up:
2074 */ 2074 */
2075 if (!(uval & FUTEX_OWNER_DIED)) 2075 if (!(uval & FUTEX_OWNER_DIED) &&
2076 uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); 2076 cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2077
2078
2079 if (unlikely(uval == -EFAULT))
2080 goto pi_faulted; 2077 goto pi_faulted;
2081 /* 2078 /*
2082 * Rare case: we managed to release the lock atomically, 2079 * Rare case: we managed to release the lock atomically,
2083 * no need to wake anyone else up: 2080 * no need to wake anyone else up:
2084 */ 2081 */
2085 if (unlikely(uval == task_pid_vnr(current))) 2082 if (unlikely(uval == vpid))
2086 goto out_unlock; 2083 goto out_unlock;
2087 2084
2088 /* 2085 /*
@@ -2167,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2167 * We were woken prior to requeue by a timeout or a signal. 2164 * We were woken prior to requeue by a timeout or a signal.
2168 * Unqueue the futex_q and determine which it was. 2165 * Unqueue the futex_q and determine which it was.
2169 */ 2166 */
2170 plist_del(&q->list, &q->list.plist); 2167 plist_del(&q->list, &hb->chain);
2171 2168
2172 /* Handle spurious wakeups gracefully */ 2169 /* Handle spurious wakeups gracefully */
2173 ret = -EWOULDBLOCK; 2170 ret = -EWOULDBLOCK;
@@ -2463,11 +2460,20 @@ retry:
2463 * userspace. 2460 * userspace.
2464 */ 2461 */
2465 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2462 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2466 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); 2463 /*
2467 2464 * We are not holding a lock here, but we want to have
2468 if (nval == -EFAULT) 2465 * the pagefault_disable/enable() protection because
2469 return -1; 2466 * we want to handle the fault gracefully. If the
2470 2467 * access fails we try to fault in the futex with R/W
2468 * verification via get_user_pages. get_user() above
2469 * does not guarantee R/W access. If that fails we
2470 * give up and leave the futex locked.
2471 */
2472 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2473 if (fault_in_user_writeable(uaddr))
2474 return -1;
2475 goto retry;
2476 }
2471 if (nval != uval) 2477 if (nval != uval)
2472 goto retry; 2478 goto retry;
2473 2479
@@ -2678,8 +2684,7 @@ static int __init futex_init(void)
2678 * implementation, the non-functional ones will return 2684 * implementation, the non-functional ones will return
2679 * -ENOSYS. 2685 * -ENOSYS.
2680 */ 2686 */
2681 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2687 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2682 if (curval == -EFAULT)
2683 futex_cmpxchg_enabled = 1; 2688 futex_cmpxchg_enabled = 1;
2684 2689
2685 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2690 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0c8d7c048615..9017478c5d4c 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -53,11 +53,10 @@
53/* 53/*
54 * The timer bases: 54 * The timer bases:
55 * 55 *
56 * Note: If we want to add new timer bases, we have to skip the two 56 * There are more clockids then hrtimer bases. Thus, we index
57 * clock ids captured by the cpu-timers. We do this by holding empty 57 * into the timer bases by the hrtimer_base_type enum. When trying
58 * entries rather than doing math adjustment of the clock ids. 58 * to reach a base using a clockid, hrtimer_clockid_to_base()
59 * This ensures that we capture erroneous accesses to these clock ids 59 * is used to convert from clockid to the proper hrtimer_base_type.
60 * rather than moving them into the range of valid clock id's.
61 */ 60 */
62DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 61DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
63{ 62{
@@ -74,30 +73,39 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
74 .get_time = &ktime_get, 73 .get_time = &ktime_get,
75 .resolution = KTIME_LOW_RES, 74 .resolution = KTIME_LOW_RES,
76 }, 75 },
76 {
77 .index = CLOCK_BOOTTIME,
78 .get_time = &ktime_get_boottime,
79 .resolution = KTIME_LOW_RES,
80 },
77 } 81 }
78}; 82};
79 83
84static int hrtimer_clock_to_base_table[MAX_CLOCKS];
85
86static inline int hrtimer_clockid_to_base(clockid_t clock_id)
87{
88 return hrtimer_clock_to_base_table[clock_id];
89}
90
91
80/* 92/*
81 * Get the coarse grained time at the softirq based on xtime and 93 * Get the coarse grained time at the softirq based on xtime and
82 * wall_to_monotonic. 94 * wall_to_monotonic.
83 */ 95 */
84static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) 96static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
85{ 97{
86 ktime_t xtim, tomono; 98 ktime_t xtim, mono, boot;
87 struct timespec xts, tom; 99 struct timespec xts, tom, slp;
88 unsigned long seq;
89 100
90 do { 101 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
91 seq = read_seqbegin(&xtime_lock);
92 xts = __current_kernel_time();
93 tom = __get_wall_to_monotonic();
94 } while (read_seqretry(&xtime_lock, seq));
95 102
96 xtim = timespec_to_ktime(xts); 103 xtim = timespec_to_ktime(xts);
97 tomono = timespec_to_ktime(tom); 104 mono = ktime_add(xtim, timespec_to_ktime(tom));
98 base->clock_base[CLOCK_REALTIME].softirq_time = xtim; 105 boot = ktime_add(mono, timespec_to_ktime(slp));
99 base->clock_base[CLOCK_MONOTONIC].softirq_time = 106 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
100 ktime_add(xtim, tomono); 107 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
108 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
101} 109}
102 110
103/* 111/*
@@ -184,10 +192,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
184 struct hrtimer_cpu_base *new_cpu_base; 192 struct hrtimer_cpu_base *new_cpu_base;
185 int this_cpu = smp_processor_id(); 193 int this_cpu = smp_processor_id();
186 int cpu = hrtimer_get_target(this_cpu, pinned); 194 int cpu = hrtimer_get_target(this_cpu, pinned);
195 int basenum = hrtimer_clockid_to_base(base->index);
187 196
188again: 197again:
189 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 198 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
190 new_base = &new_cpu_base->clock_base[base->index]; 199 new_base = &new_cpu_base->clock_base[basenum];
191 200
192 if (base != new_base) { 201 if (base != new_base) {
193 /* 202 /*
@@ -334,6 +343,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe);
334 343
335static struct debug_obj_descr hrtimer_debug_descr; 344static struct debug_obj_descr hrtimer_debug_descr;
336 345
346static void *hrtimer_debug_hint(void *addr)
347{
348 return ((struct hrtimer *) addr)->function;
349}
350
337/* 351/*
338 * fixup_init is called when: 352 * fixup_init is called when:
339 * - an active object is initialized 353 * - an active object is initialized
@@ -393,6 +407,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
393 407
394static struct debug_obj_descr hrtimer_debug_descr = { 408static struct debug_obj_descr hrtimer_debug_descr = {
395 .name = "hrtimer", 409 .name = "hrtimer",
410 .debug_hint = hrtimer_debug_hint,
396 .fixup_init = hrtimer_fixup_init, 411 .fixup_init = hrtimer_fixup_init,
397 .fixup_activate = hrtimer_fixup_activate, 412 .fixup_activate = hrtimer_fixup_activate,
398 .fixup_free = hrtimer_fixup_free, 413 .fixup_free = hrtimer_fixup_free,
@@ -611,24 +626,23 @@ static int hrtimer_reprogram(struct hrtimer *timer,
611static void retrigger_next_event(void *arg) 626static void retrigger_next_event(void *arg)
612{ 627{
613 struct hrtimer_cpu_base *base; 628 struct hrtimer_cpu_base *base;
614 struct timespec realtime_offset, wtm; 629 struct timespec realtime_offset, wtm, sleep;
615 unsigned long seq;
616 630
617 if (!hrtimer_hres_active()) 631 if (!hrtimer_hres_active())
618 return; 632 return;
619 633
620 do { 634 get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm,
621 seq = read_seqbegin(&xtime_lock); 635 &sleep);
622 wtm = __get_wall_to_monotonic();
623 } while (read_seqretry(&xtime_lock, seq));
624 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); 636 set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
625 637
626 base = &__get_cpu_var(hrtimer_bases); 638 base = &__get_cpu_var(hrtimer_bases);
627 639
628 /* Adjust CLOCK_REALTIME offset */ 640 /* Adjust CLOCK_REALTIME offset */
629 raw_spin_lock(&base->lock); 641 raw_spin_lock(&base->lock);
630 base->clock_base[CLOCK_REALTIME].offset = 642 base->clock_base[HRTIMER_BASE_REALTIME].offset =
631 timespec_to_ktime(realtime_offset); 643 timespec_to_ktime(realtime_offset);
644 base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
645 timespec_to_ktime(sleep);
632 646
633 hrtimer_force_reprogram(base, 0); 647 hrtimer_force_reprogram(base, 0);
634 raw_spin_unlock(&base->lock); 648 raw_spin_unlock(&base->lock);
@@ -673,14 +687,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
673} 687}
674 688
675/* 689/*
676 * Initialize the high resolution related parts of a hrtimer
677 */
678static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
679{
680}
681
682
683/*
684 * When High resolution timers are active, try to reprogram. Note, that in case 690 * When High resolution timers are active, try to reprogram. Note, that in case
685 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry 691 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
686 * check happens. The timer gets enqueued into the rbtree. The reprogramming 692 * check happens. The timer gets enqueued into the rbtree. The reprogramming
@@ -725,8 +731,9 @@ static int hrtimer_switch_to_hres(void)
725 return 0; 731 return 0;
726 } 732 }
727 base->hres_active = 1; 733 base->hres_active = 1;
728 base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; 734 base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES;
729 base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; 735 base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES;
736 base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES;
730 737
731 tick_setup_sched_timer(); 738 tick_setup_sched_timer();
732 739
@@ -750,7 +757,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
750 return 0; 757 return 0;
751} 758}
752static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 759static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
753static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
754 760
755#endif /* CONFIG_HIGH_RES_TIMERS */ 761#endif /* CONFIG_HIGH_RES_TIMERS */
756 762
@@ -1121,6 +1127,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1121 enum hrtimer_mode mode) 1127 enum hrtimer_mode mode)
1122{ 1128{
1123 struct hrtimer_cpu_base *cpu_base; 1129 struct hrtimer_cpu_base *cpu_base;
1130 int base;
1124 1131
1125 memset(timer, 0, sizeof(struct hrtimer)); 1132 memset(timer, 0, sizeof(struct hrtimer));
1126 1133
@@ -1129,8 +1136,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1129 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) 1136 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
1130 clock_id = CLOCK_MONOTONIC; 1137 clock_id = CLOCK_MONOTONIC;
1131 1138
1132 timer->base = &cpu_base->clock_base[clock_id]; 1139 base = hrtimer_clockid_to_base(clock_id);
1133 hrtimer_init_timer_hres(timer); 1140 timer->base = &cpu_base->clock_base[base];
1134 timerqueue_init(&timer->node); 1141 timerqueue_init(&timer->node);
1135 1142
1136#ifdef CONFIG_TIMER_STATS 1143#ifdef CONFIG_TIMER_STATS
@@ -1165,9 +1172,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
1165int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) 1172int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1166{ 1173{
1167 struct hrtimer_cpu_base *cpu_base; 1174 struct hrtimer_cpu_base *cpu_base;
1175 int base = hrtimer_clockid_to_base(which_clock);
1168 1176
1169 cpu_base = &__raw_get_cpu_var(hrtimer_bases); 1177 cpu_base = &__raw_get_cpu_var(hrtimer_bases);
1170 *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); 1178 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
1171 1179
1172 return 0; 1180 return 0;
1173} 1181}
@@ -1714,6 +1722,10 @@ static struct notifier_block __cpuinitdata hrtimers_nb = {
1714 1722
1715void __init hrtimers_init(void) 1723void __init hrtimers_init(void)
1716{ 1724{
1725 hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME;
1726 hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC;
1727 hrtimer_clock_to_base_table[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME;
1728
1717 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1729 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
1718 (void *)(long)smp_processor_id()); 1730 (void *)(long)smp_processor_id());
1719 register_cpu_notifier(&hrtimers_nb); 1731 register_cpu_notifier(&hrtimers_nb);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 8e42fec7686d..09bef82d74cb 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,5 +1,6 @@
1# Select this to activate the generic irq options below
1config HAVE_GENERIC_HARDIRQS 2config HAVE_GENERIC_HARDIRQS
2 def_bool n 3 bool
3 4
4if HAVE_GENERIC_HARDIRQS 5if HAVE_GENERIC_HARDIRQS
5menu "IRQ subsystem" 6menu "IRQ subsystem"
@@ -11,26 +12,44 @@ config GENERIC_HARDIRQS
11 12
12# Select this to disable the deprecated stuff 13# Select this to disable the deprecated stuff
13config GENERIC_HARDIRQS_NO_DEPRECATED 14config GENERIC_HARDIRQS_NO_DEPRECATED
14 def_bool n 15 bool
16
17config GENERIC_HARDIRQS_NO_COMPAT
18 bool
15 19
16# Options selectable by the architecture code 20# Options selectable by the architecture code
21
22# Make sparse irq Kconfig switch below available
17config HAVE_SPARSE_IRQ 23config HAVE_SPARSE_IRQ
18 def_bool n 24 bool
19 25
26# Enable the generic irq autoprobe mechanism
20config GENERIC_IRQ_PROBE 27config GENERIC_IRQ_PROBE
21 def_bool n 28 bool
29
30# Use the generic /proc/interrupts implementation
31config GENERIC_IRQ_SHOW
32 bool
22 33
34# Support for delayed migration from interrupt context
23config GENERIC_PENDING_IRQ 35config GENERIC_PENDING_IRQ
24 def_bool n 36 bool
25 37
38# Alpha specific irq affinity mechanism
26config AUTO_IRQ_AFFINITY 39config AUTO_IRQ_AFFINITY
27 def_bool n 40 bool
28
29config IRQ_PER_CPU
30 def_bool n
31 41
42# Tasklet based software resend for pending interrupts on enable_irq()
32config HARDIRQS_SW_RESEND 43config HARDIRQS_SW_RESEND
33 def_bool n 44 bool
45
46# Preflow handler support for fasteoi (sparc64)
47config IRQ_PREFLOW_FASTEOI
48 bool
49
50# Support forced irq threading
51config IRQ_FORCED_THREADING
52 bool
34 53
35config SPARSE_IRQ 54config SPARSE_IRQ
36 bool "Support sparse irq numbering" 55 bool "Support sparse irq numbering"
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 505798f86c36..394784c57060 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -17,7 +17,7 @@
17/* 17/*
18 * Autodetection depends on the fact that any interrupt that 18 * Autodetection depends on the fact that any interrupt that
19 * comes in on to an unassigned handler will get stuck with 19 * comes in on to an unassigned handler will get stuck with
20 * "IRQ_WAITING" cleared and the interrupt disabled. 20 * "IRQS_WAITING" cleared and the interrupt disabled.
21 */ 21 */
22static DEFINE_MUTEX(probing_active); 22static DEFINE_MUTEX(probing_active);
23 23
@@ -32,7 +32,6 @@ unsigned long probe_irq_on(void)
32{ 32{
33 struct irq_desc *desc; 33 struct irq_desc *desc;
34 unsigned long mask = 0; 34 unsigned long mask = 0;
35 unsigned int status;
36 int i; 35 int i;
37 36
38 /* 37 /*
@@ -46,13 +45,7 @@ unsigned long probe_irq_on(void)
46 */ 45 */
47 for_each_irq_desc_reverse(i, desc) { 46 for_each_irq_desc_reverse(i, desc) {
48 raw_spin_lock_irq(&desc->lock); 47 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 48 if (!desc->action && irq_settings_can_probe(desc)) {
50 /*
51 * An old-style architecture might still have
52 * the handle_bad_irq handler there:
53 */
54 compat_irq_chip_set_default_handler(desc);
55
56 /* 49 /*
57 * Some chips need to know about probing in 50 * Some chips need to know about probing in
58 * progress: 51 * progress:
@@ -60,7 +53,7 @@ unsigned long probe_irq_on(void)
60 if (desc->irq_data.chip->irq_set_type) 53 if (desc->irq_data.chip->irq_set_type)
61 desc->irq_data.chip->irq_set_type(&desc->irq_data, 54 desc->irq_data.chip->irq_set_type(&desc->irq_data,
62 IRQ_TYPE_PROBE); 55 IRQ_TYPE_PROBE);
63 desc->irq_data.chip->irq_startup(&desc->irq_data); 56 irq_startup(desc);
64 } 57 }
65 raw_spin_unlock_irq(&desc->lock); 58 raw_spin_unlock_irq(&desc->lock);
66 } 59 }
@@ -75,10 +68,12 @@ unsigned long probe_irq_on(void)
75 */ 68 */
76 for_each_irq_desc_reverse(i, desc) { 69 for_each_irq_desc_reverse(i, desc) {
77 raw_spin_lock_irq(&desc->lock); 70 raw_spin_lock_irq(&desc->lock);
78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 71 if (!desc->action && irq_settings_can_probe(desc)) {
79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
80 if (desc->irq_data.chip->irq_startup(&desc->irq_data)) 73 if (irq_startup(desc)) {
81 desc->status |= IRQ_PENDING; 74 irq_compat_set_pending(desc);
75 desc->istate |= IRQS_PENDING;
76 }
82 } 77 }
83 raw_spin_unlock_irq(&desc->lock); 78 raw_spin_unlock_irq(&desc->lock);
84 } 79 }
@@ -93,13 +88,12 @@ unsigned long probe_irq_on(void)
93 */ 88 */
94 for_each_irq_desc(i, desc) { 89 for_each_irq_desc(i, desc) {
95 raw_spin_lock_irq(&desc->lock); 90 raw_spin_lock_irq(&desc->lock);
96 status = desc->status;
97 91
98 if (status & IRQ_AUTODETECT) { 92 if (desc->istate & IRQS_AUTODETECT) {
99 /* It triggered already - consider it spurious. */ 93 /* It triggered already - consider it spurious. */
100 if (!(status & IRQ_WAITING)) { 94 if (!(desc->istate & IRQS_WAITING)) {
101 desc->status = status & ~IRQ_AUTODETECT; 95 desc->istate &= ~IRQS_AUTODETECT;
102 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 96 irq_shutdown(desc);
103 } else 97 } else
104 if (i < 32) 98 if (i < 32)
105 mask |= 1 << i; 99 mask |= 1 << i;
@@ -125,20 +119,18 @@ EXPORT_SYMBOL(probe_irq_on);
125 */ 119 */
126unsigned int probe_irq_mask(unsigned long val) 120unsigned int probe_irq_mask(unsigned long val)
127{ 121{
128 unsigned int status, mask = 0; 122 unsigned int mask = 0;
129 struct irq_desc *desc; 123 struct irq_desc *desc;
130 int i; 124 int i;
131 125
132 for_each_irq_desc(i, desc) { 126 for_each_irq_desc(i, desc) {
133 raw_spin_lock_irq(&desc->lock); 127 raw_spin_lock_irq(&desc->lock);
134 status = desc->status; 128 if (desc->istate & IRQS_AUTODETECT) {
135 129 if (i < 16 && !(desc->istate & IRQS_WAITING))
136 if (status & IRQ_AUTODETECT) {
137 if (i < 16 && !(status & IRQ_WAITING))
138 mask |= 1 << i; 130 mask |= 1 << i;
139 131
140 desc->status = status & ~IRQ_AUTODETECT; 132 desc->istate &= ~IRQS_AUTODETECT;
141 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 133 irq_shutdown(desc);
142 } 134 }
143 raw_spin_unlock_irq(&desc->lock); 135 raw_spin_unlock_irq(&desc->lock);
144 } 136 }
@@ -169,20 +161,18 @@ int probe_irq_off(unsigned long val)
169{ 161{
170 int i, irq_found = 0, nr_of_irqs = 0; 162 int i, irq_found = 0, nr_of_irqs = 0;
171 struct irq_desc *desc; 163 struct irq_desc *desc;
172 unsigned int status;
173 164
174 for_each_irq_desc(i, desc) { 165 for_each_irq_desc(i, desc) {
175 raw_spin_lock_irq(&desc->lock); 166 raw_spin_lock_irq(&desc->lock);
176 status = desc->status;
177 167
178 if (status & IRQ_AUTODETECT) { 168 if (desc->istate & IRQS_AUTODETECT) {
179 if (!(status & IRQ_WAITING)) { 169 if (!(desc->istate & IRQS_WAITING)) {
180 if (!nr_of_irqs) 170 if (!nr_of_irqs)
181 irq_found = i; 171 irq_found = i;
182 nr_of_irqs++; 172 nr_of_irqs++;
183 } 173 }
184 desc->status = status & ~IRQ_AUTODETECT; 174 desc->istate &= ~IRQS_AUTODETECT;
185 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 175 irq_shutdown(desc);
186 } 176 }
187 raw_spin_unlock_irq(&desc->lock); 177 raw_spin_unlock_irq(&desc->lock);
188 } 178 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index baa5c4acad83..c9c0601f0615 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -19,140 +19,110 @@
19#include "internals.h" 19#include "internals.h"
20 20
21/** 21/**
22 * set_irq_chip - set the irq chip for an irq 22 * irq_set_chip - set the irq chip for an irq
23 * @irq: irq number 23 * @irq: irq number
24 * @chip: pointer to irq chip description structure 24 * @chip: pointer to irq chip description structure
25 */ 25 */
26int set_irq_chip(unsigned int irq, struct irq_chip *chip) 26int irq_set_chip(unsigned int irq, struct irq_chip *chip)
27{ 27{
28 struct irq_desc *desc = irq_to_desc(irq);
29 unsigned long flags; 28 unsigned long flags;
29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
30 30
31 if (!desc) { 31 if (!desc)
32 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
33 return -EINVAL; 32 return -EINVAL;
34 }
35 33
36 if (!chip) 34 if (!chip)
37 chip = &no_irq_chip; 35 chip = &no_irq_chip;
38 36
39 raw_spin_lock_irqsave(&desc->lock, flags);
40 irq_chip_set_defaults(chip); 37 irq_chip_set_defaults(chip);
41 desc->irq_data.chip = chip; 38 desc->irq_data.chip = chip;
42 raw_spin_unlock_irqrestore(&desc->lock, flags); 39 irq_put_desc_unlock(desc, flags);
43
44 return 0; 40 return 0;
45} 41}
46EXPORT_SYMBOL(set_irq_chip); 42EXPORT_SYMBOL(irq_set_chip);
47 43
48/** 44/**
49 * set_irq_type - set the irq trigger type for an irq 45 * irq_set_type - set the irq trigger type for an irq
50 * @irq: irq number 46 * @irq: irq number
51 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h 47 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
52 */ 48 */
53int set_irq_type(unsigned int irq, unsigned int type) 49int irq_set_irq_type(unsigned int irq, unsigned int type)
54{ 50{
55 struct irq_desc *desc = irq_to_desc(irq);
56 unsigned long flags; 51 unsigned long flags;
57 int ret = -ENXIO; 52 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
53 int ret = 0;
58 54
59 if (!desc) { 55 if (!desc)
60 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); 56 return -EINVAL;
61 return -ENODEV;
62 }
63 57
64 type &= IRQ_TYPE_SENSE_MASK; 58 type &= IRQ_TYPE_SENSE_MASK;
65 if (type == IRQ_TYPE_NONE) 59 if (type != IRQ_TYPE_NONE)
66 return 0; 60 ret = __irq_set_trigger(desc, irq, type);
67 61 irq_put_desc_busunlock(desc, flags);
68 raw_spin_lock_irqsave(&desc->lock, flags);
69 ret = __irq_set_trigger(desc, irq, type);
70 raw_spin_unlock_irqrestore(&desc->lock, flags);
71 return ret; 62 return ret;
72} 63}
73EXPORT_SYMBOL(set_irq_type); 64EXPORT_SYMBOL(irq_set_irq_type);
74 65
75/** 66/**
76 * set_irq_data - set irq type data for an irq 67 * irq_set_handler_data - set irq handler data for an irq
77 * @irq: Interrupt number 68 * @irq: Interrupt number
78 * @data: Pointer to interrupt specific data 69 * @data: Pointer to interrupt specific data
79 * 70 *
80 * Set the hardware irq controller data for an irq 71 * Set the hardware irq controller data for an irq
81 */ 72 */
82int set_irq_data(unsigned int irq, void *data) 73int irq_set_handler_data(unsigned int irq, void *data)
83{ 74{
84 struct irq_desc *desc = irq_to_desc(irq);
85 unsigned long flags; 75 unsigned long flags;
76 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
86 77
87 if (!desc) { 78 if (!desc)
88 printk(KERN_ERR
89 "Trying to install controller data for IRQ%d\n", irq);
90 return -EINVAL; 79 return -EINVAL;
91 }
92
93 raw_spin_lock_irqsave(&desc->lock, flags);
94 desc->irq_data.handler_data = data; 80 desc->irq_data.handler_data = data;
95 raw_spin_unlock_irqrestore(&desc->lock, flags); 81 irq_put_desc_unlock(desc, flags);
96 return 0; 82 return 0;
97} 83}
98EXPORT_SYMBOL(set_irq_data); 84EXPORT_SYMBOL(irq_set_handler_data);
99 85
100/** 86/**
101 * set_irq_msi - set MSI descriptor data for an irq 87 * irq_set_msi_desc - set MSI descriptor data for an irq
102 * @irq: Interrupt number 88 * @irq: Interrupt number
103 * @entry: Pointer to MSI descriptor data 89 * @entry: Pointer to MSI descriptor data
104 * 90 *
105 * Set the MSI descriptor entry for an irq 91 * Set the MSI descriptor entry for an irq
106 */ 92 */
107int set_irq_msi(unsigned int irq, struct msi_desc *entry) 93int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
108{ 94{
109 struct irq_desc *desc = irq_to_desc(irq);
110 unsigned long flags; 95 unsigned long flags;
96 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
111 97
112 if (!desc) { 98 if (!desc)
113 printk(KERN_ERR
114 "Trying to install msi data for IRQ%d\n", irq);
115 return -EINVAL; 99 return -EINVAL;
116 }
117
118 raw_spin_lock_irqsave(&desc->lock, flags);
119 desc->irq_data.msi_desc = entry; 100 desc->irq_data.msi_desc = entry;
120 if (entry) 101 if (entry)
121 entry->irq = irq; 102 entry->irq = irq;
122 raw_spin_unlock_irqrestore(&desc->lock, flags); 103 irq_put_desc_unlock(desc, flags);
123 return 0; 104 return 0;
124} 105}
125 106
126/** 107/**
127 * set_irq_chip_data - set irq chip data for an irq 108 * irq_set_chip_data - set irq chip data for an irq
128 * @irq: Interrupt number 109 * @irq: Interrupt number
129 * @data: Pointer to chip specific data 110 * @data: Pointer to chip specific data
130 * 111 *
131 * Set the hardware irq chip data for an irq 112 * Set the hardware irq chip data for an irq
132 */ 113 */
133int set_irq_chip_data(unsigned int irq, void *data) 114int irq_set_chip_data(unsigned int irq, void *data)
134{ 115{
135 struct irq_desc *desc = irq_to_desc(irq);
136 unsigned long flags; 116 unsigned long flags;
117 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
137 118
138 if (!desc) { 119 if (!desc)
139 printk(KERN_ERR
140 "Trying to install chip data for IRQ%d\n", irq);
141 return -EINVAL;
142 }
143
144 if (!desc->irq_data.chip) {
145 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
146 return -EINVAL; 120 return -EINVAL;
147 }
148
149 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->irq_data.chip_data = data; 121 desc->irq_data.chip_data = data;
151 raw_spin_unlock_irqrestore(&desc->lock, flags); 122 irq_put_desc_unlock(desc, flags);
152
153 return 0; 123 return 0;
154} 124}
155EXPORT_SYMBOL(set_irq_chip_data); 125EXPORT_SYMBOL(irq_set_chip_data);
156 126
157struct irq_data *irq_get_irq_data(unsigned int irq) 127struct irq_data *irq_get_irq_data(unsigned int irq)
158{ 128{
@@ -162,72 +132,75 @@ struct irq_data *irq_get_irq_data(unsigned int irq)
162} 132}
163EXPORT_SYMBOL_GPL(irq_get_irq_data); 133EXPORT_SYMBOL_GPL(irq_get_irq_data);
164 134
165/** 135static void irq_state_clr_disabled(struct irq_desc *desc)
166 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
167 *
168 * @irq: Interrupt number
169 * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
170 *
171 * The IRQ_NESTED_THREAD flag indicates that on
172 * request_threaded_irq() no separate interrupt thread should be
173 * created for the irq as the handler are called nested in the
174 * context of a demultiplexing interrupt handler thread.
175 */
176void set_irq_nested_thread(unsigned int irq, int nest)
177{ 136{
178 struct irq_desc *desc = irq_to_desc(irq); 137 desc->istate &= ~IRQS_DISABLED;
179 unsigned long flags; 138 irq_compat_clr_disabled(desc);
180
181 if (!desc)
182 return;
183
184 raw_spin_lock_irqsave(&desc->lock, flags);
185 if (nest)
186 desc->status |= IRQ_NESTED_THREAD;
187 else
188 desc->status &= ~IRQ_NESTED_THREAD;
189 raw_spin_unlock_irqrestore(&desc->lock, flags);
190} 139}
191EXPORT_SYMBOL_GPL(set_irq_nested_thread);
192 140
193/* 141static void irq_state_set_disabled(struct irq_desc *desc)
194 * default enable function
195 */
196static void default_enable(struct irq_data *data)
197{ 142{
198 struct irq_desc *desc = irq_data_to_desc(data); 143 desc->istate |= IRQS_DISABLED;
144 irq_compat_set_disabled(desc);
145}
199 146
200 desc->irq_data.chip->irq_unmask(&desc->irq_data); 147static void irq_state_clr_masked(struct irq_desc *desc)
201 desc->status &= ~IRQ_MASKED; 148{
149 desc->istate &= ~IRQS_MASKED;
150 irq_compat_clr_masked(desc);
202} 151}
203 152
204/* 153static void irq_state_set_masked(struct irq_desc *desc)
205 * default disable function
206 */
207static void default_disable(struct irq_data *data)
208{ 154{
155 desc->istate |= IRQS_MASKED;
156 irq_compat_set_masked(desc);
209} 157}
210 158
211/* 159int irq_startup(struct irq_desc *desc)
212 * default startup function
213 */
214static unsigned int default_startup(struct irq_data *data)
215{ 160{
216 struct irq_desc *desc = irq_data_to_desc(data); 161 irq_state_clr_disabled(desc);
162 desc->depth = 0;
163
164 if (desc->irq_data.chip->irq_startup) {
165 int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
166 irq_state_clr_masked(desc);
167 return ret;
168 }
217 169
218 desc->irq_data.chip->irq_enable(data); 170 irq_enable(desc);
219 return 0; 171 return 0;
220} 172}
221 173
222/* 174void irq_shutdown(struct irq_desc *desc)
223 * default shutdown function
224 */
225static void default_shutdown(struct irq_data *data)
226{ 175{
227 struct irq_desc *desc = irq_data_to_desc(data); 176 irq_state_set_disabled(desc);
177 desc->depth = 1;
178 if (desc->irq_data.chip->irq_shutdown)
179 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
180 if (desc->irq_data.chip->irq_disable)
181 desc->irq_data.chip->irq_disable(&desc->irq_data);
182 else
183 desc->irq_data.chip->irq_mask(&desc->irq_data);
184 irq_state_set_masked(desc);
185}
228 186
229 desc->irq_data.chip->irq_mask(&desc->irq_data); 187void irq_enable(struct irq_desc *desc)
230 desc->status |= IRQ_MASKED; 188{
189 irq_state_clr_disabled(desc);
190 if (desc->irq_data.chip->irq_enable)
191 desc->irq_data.chip->irq_enable(&desc->irq_data);
192 else
193 desc->irq_data.chip->irq_unmask(&desc->irq_data);
194 irq_state_clr_masked(desc);
195}
196
197void irq_disable(struct irq_desc *desc)
198{
199 irq_state_set_disabled(desc);
200 if (desc->irq_data.chip->irq_disable) {
201 desc->irq_data.chip->irq_disable(&desc->irq_data);
202 irq_state_set_masked(desc);
203 }
231} 204}
232 205
233#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED 206#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
@@ -315,10 +288,6 @@ static void compat_bus_sync_unlock(struct irq_data *data)
315void irq_chip_set_defaults(struct irq_chip *chip) 288void irq_chip_set_defaults(struct irq_chip *chip)
316{ 289{
317#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED 290#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
318 /*
319 * Compat fixup functions need to be before we set the
320 * defaults for enable/disable/startup/shutdown
321 */
322 if (chip->enable) 291 if (chip->enable)
323 chip->irq_enable = compat_irq_enable; 292 chip->irq_enable = compat_irq_enable;
324 if (chip->disable) 293 if (chip->disable)
@@ -327,33 +296,8 @@ void irq_chip_set_defaults(struct irq_chip *chip)
327 chip->irq_shutdown = compat_irq_shutdown; 296 chip->irq_shutdown = compat_irq_shutdown;
328 if (chip->startup) 297 if (chip->startup)
329 chip->irq_startup = compat_irq_startup; 298 chip->irq_startup = compat_irq_startup;
330#endif
331 /*
332 * The real defaults
333 */
334 if (!chip->irq_enable)
335 chip->irq_enable = default_enable;
336 if (!chip->irq_disable)
337 chip->irq_disable = default_disable;
338 if (!chip->irq_startup)
339 chip->irq_startup = default_startup;
340 /*
341 * We use chip->irq_disable, when the user provided its own. When
342 * we have default_disable set for chip->irq_disable, then we need
343 * to use default_shutdown, otherwise the irq line is not
344 * disabled on free_irq():
345 */
346 if (!chip->irq_shutdown)
347 chip->irq_shutdown = chip->irq_disable != default_disable ?
348 chip->irq_disable : default_shutdown;
349
350#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
351 if (!chip->end) 299 if (!chip->end)
352 chip->end = dummy_irq_chip.end; 300 chip->end = dummy_irq_chip.end;
353
354 /*
355 * Now fix up the remaining compat handlers
356 */
357 if (chip->bus_lock) 301 if (chip->bus_lock)
358 chip->irq_bus_lock = compat_bus_lock; 302 chip->irq_bus_lock = compat_bus_lock;
359 if (chip->bus_sync_unlock) 303 if (chip->bus_sync_unlock)
@@ -388,22 +332,22 @@ static inline void mask_ack_irq(struct irq_desc *desc)
388 if (desc->irq_data.chip->irq_ack) 332 if (desc->irq_data.chip->irq_ack)
389 desc->irq_data.chip->irq_ack(&desc->irq_data); 333 desc->irq_data.chip->irq_ack(&desc->irq_data);
390 } 334 }
391 desc->status |= IRQ_MASKED; 335 irq_state_set_masked(desc);
392} 336}
393 337
394static inline void mask_irq(struct irq_desc *desc) 338void mask_irq(struct irq_desc *desc)
395{ 339{
396 if (desc->irq_data.chip->irq_mask) { 340 if (desc->irq_data.chip->irq_mask) {
397 desc->irq_data.chip->irq_mask(&desc->irq_data); 341 desc->irq_data.chip->irq_mask(&desc->irq_data);
398 desc->status |= IRQ_MASKED; 342 irq_state_set_masked(desc);
399 } 343 }
400} 344}
401 345
402static inline void unmask_irq(struct irq_desc *desc) 346void unmask_irq(struct irq_desc *desc)
403{ 347{
404 if (desc->irq_data.chip->irq_unmask) { 348 if (desc->irq_data.chip->irq_unmask) {
405 desc->irq_data.chip->irq_unmask(&desc->irq_data); 349 desc->irq_data.chip->irq_unmask(&desc->irq_data);
406 desc->status &= ~IRQ_MASKED; 350 irq_state_clr_masked(desc);
407 } 351 }
408} 352}
409 353
@@ -428,10 +372,11 @@ void handle_nested_irq(unsigned int irq)
428 kstat_incr_irqs_this_cpu(irq, desc); 372 kstat_incr_irqs_this_cpu(irq, desc);
429 373
430 action = desc->action; 374 action = desc->action;
431 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 375 if (unlikely(!action || (desc->istate & IRQS_DISABLED)))
432 goto out_unlock; 376 goto out_unlock;
433 377
434 desc->status |= IRQ_INPROGRESS; 378 irq_compat_set_progress(desc);
379 desc->istate |= IRQS_INPROGRESS;
435 raw_spin_unlock_irq(&desc->lock); 380 raw_spin_unlock_irq(&desc->lock);
436 381
437 action_ret = action->thread_fn(action->irq, action->dev_id); 382 action_ret = action->thread_fn(action->irq, action->dev_id);
@@ -439,13 +384,21 @@ void handle_nested_irq(unsigned int irq)
439 note_interrupt(irq, desc, action_ret); 384 note_interrupt(irq, desc, action_ret);
440 385
441 raw_spin_lock_irq(&desc->lock); 386 raw_spin_lock_irq(&desc->lock);
442 desc->status &= ~IRQ_INPROGRESS; 387 desc->istate &= ~IRQS_INPROGRESS;
388 irq_compat_clr_progress(desc);
443 389
444out_unlock: 390out_unlock:
445 raw_spin_unlock_irq(&desc->lock); 391 raw_spin_unlock_irq(&desc->lock);
446} 392}
447EXPORT_SYMBOL_GPL(handle_nested_irq); 393EXPORT_SYMBOL_GPL(handle_nested_irq);
448 394
395static bool irq_check_poll(struct irq_desc *desc)
396{
397 if (!(desc->istate & IRQS_POLL_INPROGRESS))
398 return false;
399 return irq_wait_for_poll(desc);
400}
401
449/** 402/**
450 * handle_simple_irq - Simple and software-decoded IRQs. 403 * handle_simple_irq - Simple and software-decoded IRQs.
451 * @irq: the interrupt number 404 * @irq: the interrupt number
@@ -461,29 +414,20 @@ EXPORT_SYMBOL_GPL(handle_nested_irq);
461void 414void
462handle_simple_irq(unsigned int irq, struct irq_desc *desc) 415handle_simple_irq(unsigned int irq, struct irq_desc *desc)
463{ 416{
464 struct irqaction *action;
465 irqreturn_t action_ret;
466
467 raw_spin_lock(&desc->lock); 417 raw_spin_lock(&desc->lock);
468 418
469 if (unlikely(desc->status & IRQ_INPROGRESS)) 419 if (unlikely(desc->istate & IRQS_INPROGRESS))
470 goto out_unlock; 420 if (!irq_check_poll(desc))
471 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 421 goto out_unlock;
422
423 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
472 kstat_incr_irqs_this_cpu(irq, desc); 424 kstat_incr_irqs_this_cpu(irq, desc);
473 425
474 action = desc->action; 426 if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED)))
475 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
476 goto out_unlock; 427 goto out_unlock;
477 428
478 desc->status |= IRQ_INPROGRESS; 429 handle_irq_event(desc);
479 raw_spin_unlock(&desc->lock);
480 430
481 action_ret = handle_IRQ_event(irq, action);
482 if (!noirqdebug)
483 note_interrupt(irq, desc, action_ret);
484
485 raw_spin_lock(&desc->lock);
486 desc->status &= ~IRQ_INPROGRESS;
487out_unlock: 431out_unlock:
488 raw_spin_unlock(&desc->lock); 432 raw_spin_unlock(&desc->lock);
489} 433}
@@ -501,42 +445,42 @@ out_unlock:
501void 445void
502handle_level_irq(unsigned int irq, struct irq_desc *desc) 446handle_level_irq(unsigned int irq, struct irq_desc *desc)
503{ 447{
504 struct irqaction *action;
505 irqreturn_t action_ret;
506
507 raw_spin_lock(&desc->lock); 448 raw_spin_lock(&desc->lock);
508 mask_ack_irq(desc); 449 mask_ack_irq(desc);
509 450
510 if (unlikely(desc->status & IRQ_INPROGRESS)) 451 if (unlikely(desc->istate & IRQS_INPROGRESS))
511 goto out_unlock; 452 if (!irq_check_poll(desc))
512 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 453 goto out_unlock;
454
455 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
513 kstat_incr_irqs_this_cpu(irq, desc); 456 kstat_incr_irqs_this_cpu(irq, desc);
514 457
515 /* 458 /*
516 * If its disabled or no action available 459 * If its disabled or no action available
517 * keep it masked and get out of here 460 * keep it masked and get out of here
518 */ 461 */
519 action = desc->action; 462 if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED)))
520 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
521 goto out_unlock; 463 goto out_unlock;
522 464
523 desc->status |= IRQ_INPROGRESS; 465 handle_irq_event(desc);
524 raw_spin_unlock(&desc->lock);
525
526 action_ret = handle_IRQ_event(irq, action);
527 if (!noirqdebug)
528 note_interrupt(irq, desc, action_ret);
529 466
530 raw_spin_lock(&desc->lock); 467 if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT)))
531 desc->status &= ~IRQ_INPROGRESS;
532
533 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
534 unmask_irq(desc); 468 unmask_irq(desc);
535out_unlock: 469out_unlock:
536 raw_spin_unlock(&desc->lock); 470 raw_spin_unlock(&desc->lock);
537} 471}
538EXPORT_SYMBOL_GPL(handle_level_irq); 472EXPORT_SYMBOL_GPL(handle_level_irq);
539 473
474#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
475static inline void preflow_handler(struct irq_desc *desc)
476{
477 if (desc->preflow_handler)
478 desc->preflow_handler(&desc->irq_data);
479}
480#else
481static inline void preflow_handler(struct irq_desc *desc) { }
482#endif
483
540/** 484/**
541 * handle_fasteoi_irq - irq handler for transparent controllers 485 * handle_fasteoi_irq - irq handler for transparent controllers
542 * @irq: the interrupt number 486 * @irq: the interrupt number
@@ -550,42 +494,41 @@ EXPORT_SYMBOL_GPL(handle_level_irq);
550void 494void
551handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) 495handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
552{ 496{
553 struct irqaction *action;
554 irqreturn_t action_ret;
555
556 raw_spin_lock(&desc->lock); 497 raw_spin_lock(&desc->lock);
557 498
558 if (unlikely(desc->status & IRQ_INPROGRESS)) 499 if (unlikely(desc->istate & IRQS_INPROGRESS))
559 goto out; 500 if (!irq_check_poll(desc))
501 goto out;
560 502
561 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 503 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
562 kstat_incr_irqs_this_cpu(irq, desc); 504 kstat_incr_irqs_this_cpu(irq, desc);
563 505
564 /* 506 /*
565 * If its disabled or no action available 507 * If its disabled or no action available
566 * then mask it and get out of here: 508 * then mask it and get out of here:
567 */ 509 */
568 action = desc->action; 510 if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) {
569 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 511 irq_compat_set_pending(desc);
570 desc->status |= IRQ_PENDING; 512 desc->istate |= IRQS_PENDING;
571 mask_irq(desc); 513 mask_irq(desc);
572 goto out; 514 goto out;
573 } 515 }
574 516
575 desc->status |= IRQ_INPROGRESS; 517 if (desc->istate & IRQS_ONESHOT)
576 desc->status &= ~IRQ_PENDING; 518 mask_irq(desc);
577 raw_spin_unlock(&desc->lock);
578 519
579 action_ret = handle_IRQ_event(irq, action); 520 preflow_handler(desc);
580 if (!noirqdebug) 521 handle_irq_event(desc);
581 note_interrupt(irq, desc, action_ret);
582 522
583 raw_spin_lock(&desc->lock); 523out_eoi:
584 desc->status &= ~IRQ_INPROGRESS;
585out:
586 desc->irq_data.chip->irq_eoi(&desc->irq_data); 524 desc->irq_data.chip->irq_eoi(&desc->irq_data);
587 525out_unlock:
588 raw_spin_unlock(&desc->lock); 526 raw_spin_unlock(&desc->lock);
527 return;
528out:
529 if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED))
530 goto out_eoi;
531 goto out_unlock;
589} 532}
590 533
591/** 534/**
@@ -609,32 +552,28 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
609{ 552{
610 raw_spin_lock(&desc->lock); 553 raw_spin_lock(&desc->lock);
611 554
612 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 555 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
613
614 /* 556 /*
615 * If we're currently running this IRQ, or its disabled, 557 * If we're currently running this IRQ, or its disabled,
616 * we shouldn't process the IRQ. Mark it pending, handle 558 * we shouldn't process the IRQ. Mark it pending, handle
617 * the necessary masking and go out 559 * the necessary masking and go out
618 */ 560 */
619 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || 561 if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) ||
620 !desc->action)) { 562 !desc->action))) {
621 desc->status |= (IRQ_PENDING | IRQ_MASKED); 563 if (!irq_check_poll(desc)) {
622 mask_ack_irq(desc); 564 irq_compat_set_pending(desc);
623 goto out_unlock; 565 desc->istate |= IRQS_PENDING;
566 mask_ack_irq(desc);
567 goto out_unlock;
568 }
624 } 569 }
625 kstat_incr_irqs_this_cpu(irq, desc); 570 kstat_incr_irqs_this_cpu(irq, desc);
626 571
627 /* Start handling the irq */ 572 /* Start handling the irq */
628 desc->irq_data.chip->irq_ack(&desc->irq_data); 573 desc->irq_data.chip->irq_ack(&desc->irq_data);
629 574
630 /* Mark the IRQ currently in progress.*/
631 desc->status |= IRQ_INPROGRESS;
632
633 do { 575 do {
634 struct irqaction *action = desc->action; 576 if (unlikely(!desc->action)) {
635 irqreturn_t action_ret;
636
637 if (unlikely(!action)) {
638 mask_irq(desc); 577 mask_irq(desc);
639 goto out_unlock; 578 goto out_unlock;
640 } 579 }
@@ -644,22 +583,17 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
644 * one, we could have masked the irq. 583 * one, we could have masked the irq.
645 * Renable it, if it was not disabled in meantime. 584 * Renable it, if it was not disabled in meantime.
646 */ 585 */
647 if (unlikely((desc->status & 586 if (unlikely(desc->istate & IRQS_PENDING)) {
648 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 587 if (!(desc->istate & IRQS_DISABLED) &&
649 (IRQ_PENDING | IRQ_MASKED))) { 588 (desc->istate & IRQS_MASKED))
650 unmask_irq(desc); 589 unmask_irq(desc);
651 } 590 }
652 591
653 desc->status &= ~IRQ_PENDING; 592 handle_irq_event(desc);
654 raw_spin_unlock(&desc->lock);
655 action_ret = handle_IRQ_event(irq, action);
656 if (!noirqdebug)
657 note_interrupt(irq, desc, action_ret);
658 raw_spin_lock(&desc->lock);
659 593
660 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 594 } while ((desc->istate & IRQS_PENDING) &&
595 !(desc->istate & IRQS_DISABLED));
661 596
662 desc->status &= ~IRQ_INPROGRESS;
663out_unlock: 597out_unlock:
664 raw_spin_unlock(&desc->lock); 598 raw_spin_unlock(&desc->lock);
665} 599}
@@ -674,103 +608,84 @@ out_unlock:
674void 608void
675handle_percpu_irq(unsigned int irq, struct irq_desc *desc) 609handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
676{ 610{
677 irqreturn_t action_ret; 611 struct irq_chip *chip = irq_desc_get_chip(desc);
678 612
679 kstat_incr_irqs_this_cpu(irq, desc); 613 kstat_incr_irqs_this_cpu(irq, desc);
680 614
681 if (desc->irq_data.chip->irq_ack) 615 if (chip->irq_ack)
682 desc->irq_data.chip->irq_ack(&desc->irq_data); 616 chip->irq_ack(&desc->irq_data);
683 617
684 action_ret = handle_IRQ_event(irq, desc->action); 618 handle_irq_event_percpu(desc, desc->action);
685 if (!noirqdebug)
686 note_interrupt(irq, desc, action_ret);
687 619
688 if (desc->irq_data.chip->irq_eoi) 620 if (chip->irq_eoi)
689 desc->irq_data.chip->irq_eoi(&desc->irq_data); 621 chip->irq_eoi(&desc->irq_data);
690} 622}
691 623
692void 624void
693__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 625__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
694 const char *name) 626 const char *name)
695{ 627{
696 struct irq_desc *desc = irq_to_desc(irq);
697 unsigned long flags; 628 unsigned long flags;
629 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
698 630
699 if (!desc) { 631 if (!desc)
700 printk(KERN_ERR
701 "Trying to install type control for IRQ%d\n", irq);
702 return; 632 return;
703 }
704 633
705 if (!handle) 634 if (!handle) {
706 handle = handle_bad_irq; 635 handle = handle_bad_irq;
707 else if (desc->irq_data.chip == &no_irq_chip) { 636 } else {
708 printk(KERN_WARNING "Trying to install %sinterrupt handler " 637 if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
709 "for IRQ%d\n", is_chained ? "chained " : "", irq); 638 goto out;
710 /*
711 * Some ARM implementations install a handler for really dumb
712 * interrupt hardware without setting an irq_chip. This worked
713 * with the ARM no_irq_chip but the check in setup_irq would
714 * prevent us to setup the interrupt at all. Switch it to
715 * dummy_irq_chip for easy transition.
716 */
717 desc->irq_data.chip = &dummy_irq_chip;
718 } 639 }
719 640
720 chip_bus_lock(desc);
721 raw_spin_lock_irqsave(&desc->lock, flags);
722
723 /* Uninstall? */ 641 /* Uninstall? */
724 if (handle == handle_bad_irq) { 642 if (handle == handle_bad_irq) {
725 if (desc->irq_data.chip != &no_irq_chip) 643 if (desc->irq_data.chip != &no_irq_chip)
726 mask_ack_irq(desc); 644 mask_ack_irq(desc);
727 desc->status |= IRQ_DISABLED; 645 irq_compat_set_disabled(desc);
646 desc->istate |= IRQS_DISABLED;
728 desc->depth = 1; 647 desc->depth = 1;
729 } 648 }
730 desc->handle_irq = handle; 649 desc->handle_irq = handle;
731 desc->name = name; 650 desc->name = name;
732 651
733 if (handle != handle_bad_irq && is_chained) { 652 if (handle != handle_bad_irq && is_chained) {
734 desc->status &= ~IRQ_DISABLED; 653 irq_settings_set_noprobe(desc);
735 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 654 irq_settings_set_norequest(desc);
736 desc->depth = 0; 655 irq_startup(desc);
737 desc->irq_data.chip->irq_startup(&desc->irq_data);
738 } 656 }
739 raw_spin_unlock_irqrestore(&desc->lock, flags); 657out:
740 chip_bus_sync_unlock(desc); 658 irq_put_desc_busunlock(desc, flags);
741}
742EXPORT_SYMBOL_GPL(__set_irq_handler);
743
744void
745set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
746 irq_flow_handler_t handle)
747{
748 set_irq_chip(irq, chip);
749 __set_irq_handler(irq, handle, 0, NULL);
750} 659}
660EXPORT_SYMBOL_GPL(__irq_set_handler);
751 661
752void 662void
753set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, 663irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
754 irq_flow_handler_t handle, const char *name) 664 irq_flow_handler_t handle, const char *name)
755{ 665{
756 set_irq_chip(irq, chip); 666 irq_set_chip(irq, chip);
757 __set_irq_handler(irq, handle, 0, name); 667 __irq_set_handler(irq, handle, 0, name);
758} 668}
759 669
760void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) 670void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
761{ 671{
762 struct irq_desc *desc = irq_to_desc(irq);
763 unsigned long flags; 672 unsigned long flags;
673 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
764 674
765 if (!desc) 675 if (!desc)
766 return; 676 return;
677 irq_settings_clr_and_set(desc, clr, set);
678
679 irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
680 IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
681 if (irq_settings_has_no_balance_set(desc))
682 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
683 if (irq_settings_is_per_cpu(desc))
684 irqd_set(&desc->irq_data, IRQD_PER_CPU);
685 if (irq_settings_can_move_pcntxt(desc))
686 irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
767 687
768 /* Sanitize flags */ 688 irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
769 set &= IRQF_MODIFY_MASK;
770 clr &= IRQF_MODIFY_MASK;
771 689
772 raw_spin_lock_irqsave(&desc->lock, flags); 690 irq_put_desc_unlock(desc, flags);
773 desc->status &= ~clr;
774 desc->status |= set;
775 raw_spin_unlock_irqrestore(&desc->lock, flags);
776} 691}
diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h
new file mode 100644
index 000000000000..6bbaf66aca85
--- /dev/null
+++ b/kernel/irq/compat.h
@@ -0,0 +1,72 @@
1/*
2 * Compat layer for transition period
3 */
4#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
5static inline void irq_compat_set_progress(struct irq_desc *desc)
6{
7 desc->status |= IRQ_INPROGRESS;
8}
9
10static inline void irq_compat_clr_progress(struct irq_desc *desc)
11{
12 desc->status &= ~IRQ_INPROGRESS;
13}
14static inline void irq_compat_set_disabled(struct irq_desc *desc)
15{
16 desc->status |= IRQ_DISABLED;
17}
18static inline void irq_compat_clr_disabled(struct irq_desc *desc)
19{
20 desc->status &= ~IRQ_DISABLED;
21}
22static inline void irq_compat_set_pending(struct irq_desc *desc)
23{
24 desc->status |= IRQ_PENDING;
25}
26
27static inline void irq_compat_clr_pending(struct irq_desc *desc)
28{
29 desc->status &= ~IRQ_PENDING;
30}
31static inline void irq_compat_set_masked(struct irq_desc *desc)
32{
33 desc->status |= IRQ_MASKED;
34}
35
36static inline void irq_compat_clr_masked(struct irq_desc *desc)
37{
38 desc->status &= ~IRQ_MASKED;
39}
40static inline void irq_compat_set_move_pending(struct irq_desc *desc)
41{
42 desc->status |= IRQ_MOVE_PENDING;
43}
44
45static inline void irq_compat_clr_move_pending(struct irq_desc *desc)
46{
47 desc->status &= ~IRQ_MOVE_PENDING;
48}
49static inline void irq_compat_set_affinity(struct irq_desc *desc)
50{
51 desc->status |= IRQ_AFFINITY_SET;
52}
53
54static inline void irq_compat_clr_affinity(struct irq_desc *desc)
55{
56 desc->status &= ~IRQ_AFFINITY_SET;
57}
58#else
59static inline void irq_compat_set_progress(struct irq_desc *desc) { }
60static inline void irq_compat_clr_progress(struct irq_desc *desc) { }
61static inline void irq_compat_set_disabled(struct irq_desc *desc) { }
62static inline void irq_compat_clr_disabled(struct irq_desc *desc) { }
63static inline void irq_compat_set_pending(struct irq_desc *desc) { }
64static inline void irq_compat_clr_pending(struct irq_desc *desc) { }
65static inline void irq_compat_set_masked(struct irq_desc *desc) { }
66static inline void irq_compat_clr_masked(struct irq_desc *desc) { }
67static inline void irq_compat_set_move_pending(struct irq_desc *desc) { }
68static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { }
69static inline void irq_compat_set_affinity(struct irq_desc *desc) { }
70static inline void irq_compat_clr_affinity(struct irq_desc *desc) { }
71#endif
72
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
new file mode 100644
index 000000000000..d1a33b7fa61d
--- /dev/null
+++ b/kernel/irq/debug.h
@@ -0,0 +1,40 @@
1/*
2 * Debugging printout:
3 */
4
5#include <linux/kallsyms.h>
6
7#define P(f) if (desc->status & f) printk("%14s set\n", #f)
8#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
9
10static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
11{
12 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
13 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
14 printk("->handle_irq(): %p, ", desc->handle_irq);
15 print_symbol("%s\n", (unsigned long)desc->handle_irq);
16 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
17 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
18 printk("->action(): %p\n", desc->action);
19 if (desc->action) {
20 printk("->action->handler(): %p, ", desc->action->handler);
21 print_symbol("%s\n", (unsigned long)desc->action->handler);
22 }
23
24 P(IRQ_LEVEL);
25 P(IRQ_PER_CPU);
26 P(IRQ_NOPROBE);
27 P(IRQ_NOREQUEST);
28 P(IRQ_NOAUTOEN);
29
30 PS(IRQS_AUTODETECT);
31 PS(IRQS_INPROGRESS);
32 PS(IRQS_REPLAY);
33 PS(IRQS_WAITING);
34 PS(IRQS_DISABLED);
35 PS(IRQS_PENDING);
36 PS(IRQS_MASKED);
37}
38
39#undef P
40#undef PS
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3540a7190122..517561fc7317 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -51,30 +51,92 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
51 "but no thread function available.", irq, action->name); 51 "but no thread function available.", irq, action->name);
52} 52}
53 53
54/** 54static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
55 * handle_IRQ_event - irq action chain handler 55{
56 * @irq: the interrupt number 56 /*
57 * @action: the interrupt action chain for this irq 57 * Wake up the handler thread for this action. In case the
58 * 58 * thread crashed and was killed we just pretend that we
59 * Handles the action chain of an irq event 59 * handled the interrupt. The hardirq handler has disabled the
60 */ 60 * device interrupt, so no irq storm is lurking. If the
61irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) 61 * RUNTHREAD bit is already set, nothing to do.
62 */
63 if (test_bit(IRQTF_DIED, &action->thread_flags) ||
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
65 return;
66
67 /*
68 * It's safe to OR the mask lockless here. We have only two
69 * places which write to threads_oneshot: This code and the
70 * irq thread.
71 *
72 * This code is the hard irq context and can never run on two
73 * cpus in parallel. If it ever does we have more serious
74 * problems than this bitmask.
75 *
76 * The irq threads of this irq which clear their "running" bit
77 * in threads_oneshot are serialized via desc->lock against
78 * each other and they are serialized against this code by
79 * IRQS_INPROGRESS.
80 *
81 * Hard irq handler:
82 *
83 * spin_lock(desc->lock);
84 * desc->state |= IRQS_INPROGRESS;
85 * spin_unlock(desc->lock);
86 * set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
87 * desc->threads_oneshot |= mask;
88 * spin_lock(desc->lock);
89 * desc->state &= ~IRQS_INPROGRESS;
90 * spin_unlock(desc->lock);
91 *
92 * irq thread:
93 *
94 * again:
95 * spin_lock(desc->lock);
96 * if (desc->state & IRQS_INPROGRESS) {
97 * spin_unlock(desc->lock);
98 * while(desc->state & IRQS_INPROGRESS)
99 * cpu_relax();
100 * goto again;
101 * }
102 * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
103 * desc->threads_oneshot &= ~mask;
104 * spin_unlock(desc->lock);
105 *
106 * So either the thread waits for us to clear IRQS_INPROGRESS
107 * or we are waiting in the flow handler for desc->lock to be
108 * released before we reach this point. The thread also checks
109 * IRQTF_RUNTHREAD under desc->lock. If set it leaves
110 * threads_oneshot untouched and runs the thread another time.
111 */
112 desc->threads_oneshot |= action->thread_mask;
113 wake_up_process(action->thread);
114}
115
116irqreturn_t
117handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
62{ 118{
63 irqreturn_t ret, retval = IRQ_NONE; 119 irqreturn_t retval = IRQ_NONE;
64 unsigned int status = 0; 120 unsigned int random = 0, irq = desc->irq_data.irq;
65 121
66 do { 122 do {
123 irqreturn_t res;
124
67 trace_irq_handler_entry(irq, action); 125 trace_irq_handler_entry(irq, action);
68 ret = action->handler(irq, action->dev_id); 126 res = action->handler(irq, action->dev_id);
69 trace_irq_handler_exit(irq, action, ret); 127 trace_irq_handler_exit(irq, action, res);
70 128
71 switch (ret) { 129 if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
130 irq, action->handler))
131 local_irq_disable();
132
133 switch (res) {
72 case IRQ_WAKE_THREAD: 134 case IRQ_WAKE_THREAD:
73 /* 135 /*
74 * Set result to handled so the spurious check 136 * Set result to handled so the spurious check
75 * does not trigger. 137 * does not trigger.
76 */ 138 */
77 ret = IRQ_HANDLED; 139 res = IRQ_HANDLED;
78 140
79 /* 141 /*
80 * Catch drivers which return WAKE_THREAD but 142 * Catch drivers which return WAKE_THREAD but
@@ -85,36 +147,56 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
85 break; 147 break;
86 } 148 }
87 149
88 /* 150 irq_wake_thread(desc, action);
89 * Wake up the handler thread for this
90 * action. In case the thread crashed and was
91 * killed we just pretend that we handled the
92 * interrupt. The hardirq handler above has
93 * disabled the device interrupt, so no irq
94 * storm is lurking.
95 */
96 if (likely(!test_bit(IRQTF_DIED,
97 &action->thread_flags))) {
98 set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
99 wake_up_process(action->thread);
100 }
101 151
102 /* Fall through to add to randomness */ 152 /* Fall through to add to randomness */
103 case IRQ_HANDLED: 153 case IRQ_HANDLED:
104 status |= action->flags; 154 random |= action->flags;
105 break; 155 break;
106 156
107 default: 157 default:
108 break; 158 break;
109 } 159 }
110 160
111 retval |= ret; 161 retval |= res;
112 action = action->next; 162 action = action->next;
113 } while (action); 163 } while (action);
114 164
115 if (status & IRQF_SAMPLE_RANDOM) 165 if (random & IRQF_SAMPLE_RANDOM)
116 add_interrupt_randomness(irq); 166 add_interrupt_randomness(irq);
117 local_irq_disable();
118 167
168 if (!noirqdebug)
169 note_interrupt(irq, desc, retval);
119 return retval; 170 return retval;
120} 171}
172
173irqreturn_t handle_irq_event(struct irq_desc *desc)
174{
175 struct irqaction *action = desc->action;
176 irqreturn_t ret;
177
178 irq_compat_clr_pending(desc);
179 desc->istate &= ~IRQS_PENDING;
180 irq_compat_set_progress(desc);
181 desc->istate |= IRQS_INPROGRESS;
182 raw_spin_unlock(&desc->lock);
183
184 ret = handle_irq_event_percpu(desc, action);
185
186 raw_spin_lock(&desc->lock);
187 desc->istate &= ~IRQS_INPROGRESS;
188 irq_compat_clr_progress(desc);
189 return ret;
190}
191
192/**
193 * handle_IRQ_event - irq action chain handler
194 * @irq: the interrupt number
195 * @action: the interrupt action chain for this irq
196 *
197 * Handles the action chain of an irq event
198 */
199irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
200{
201 return handle_irq_event_percpu(irq_to_desc(irq), action);
202}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 99c3bc8a6fb4..6c6ec9a49027 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,5 +1,9 @@
1/* 1/*
2 * IRQ subsystem internal functions and variables: 2 * IRQ subsystem internal functions and variables:
3 *
4 * Do not ever include this file from anything else than
5 * kernel/irq/. Do not even think about using any information outside
6 * of this file for your non core code.
3 */ 7 */
4#include <linux/irqdesc.h> 8#include <linux/irqdesc.h>
5 9
@@ -9,25 +13,89 @@
9# define IRQ_BITMAP_BITS NR_IRQS 13# define IRQ_BITMAP_BITS NR_IRQS
10#endif 14#endif
11 15
16#define istate core_internal_state__do_not_mess_with_it
17
18#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT
19# define status status_use_accessors
20#endif
21
12extern int noirqdebug; 22extern int noirqdebug;
13 23
24/*
25 * Bits used by threaded handlers:
26 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
27 * IRQTF_DIED - handler thread died
28 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
29 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
30 * IRQTF_FORCED_THREAD - irq action is force threaded
31 */
32enum {
33 IRQTF_RUNTHREAD,
34 IRQTF_DIED,
35 IRQTF_WARNED,
36 IRQTF_AFFINITY,
37 IRQTF_FORCED_THREAD,
38};
39
40/*
41 * Bit masks for desc->state
42 *
43 * IRQS_AUTODETECT - autodetection in progress
44 * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
45 * detection
46 * IRQS_POLL_INPROGRESS - polling in progress
47 * IRQS_INPROGRESS - Interrupt in progress
48 * IRQS_ONESHOT - irq is not unmasked in primary handler
49 * IRQS_REPLAY - irq is replayed
50 * IRQS_WAITING - irq is waiting
51 * IRQS_DISABLED - irq is disabled
52 * IRQS_PENDING - irq is pending and replayed later
53 * IRQS_MASKED - irq is masked
54 * IRQS_SUSPENDED - irq is suspended
55 */
56enum {
57 IRQS_AUTODETECT = 0x00000001,
58 IRQS_SPURIOUS_DISABLED = 0x00000002,
59 IRQS_POLL_INPROGRESS = 0x00000008,
60 IRQS_INPROGRESS = 0x00000010,
61 IRQS_ONESHOT = 0x00000020,
62 IRQS_REPLAY = 0x00000040,
63 IRQS_WAITING = 0x00000080,
64 IRQS_DISABLED = 0x00000100,
65 IRQS_PENDING = 0x00000200,
66 IRQS_MASKED = 0x00000400,
67 IRQS_SUSPENDED = 0x00000800,
68};
69
70#include "compat.h"
71#include "debug.h"
72#include "settings.h"
73
14#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) 74#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
15 75
16/* Set default functions for irq_chip structures: */ 76/* Set default functions for irq_chip structures: */
17extern void irq_chip_set_defaults(struct irq_chip *chip); 77extern void irq_chip_set_defaults(struct irq_chip *chip);
18 78
19/* Set default handler: */
20extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
21
22extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 79extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
23 unsigned long flags); 80 unsigned long flags);
24extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 81extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
25extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 82extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
26 83
84extern int irq_startup(struct irq_desc *desc);
85extern void irq_shutdown(struct irq_desc *desc);
86extern void irq_enable(struct irq_desc *desc);
87extern void irq_disable(struct irq_desc *desc);
88extern void mask_irq(struct irq_desc *desc);
89extern void unmask_irq(struct irq_desc *desc);
90
27extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 91extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
28 92
93irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
94irqreturn_t handle_irq_event(struct irq_desc *desc);
95
29/* Resending of interrupts :*/ 96/* Resending of interrupts :*/
30void check_irq_resend(struct irq_desc *desc, unsigned int irq); 97void check_irq_resend(struct irq_desc *desc, unsigned int irq);
98bool irq_wait_for_poll(struct irq_desc *desc);
31 99
32#ifdef CONFIG_PROC_FS 100#ifdef CONFIG_PROC_FS
33extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 101extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -43,20 +111,10 @@ static inline void unregister_handler_proc(unsigned int irq,
43 struct irqaction *action) { } 111 struct irqaction *action) { }
44#endif 112#endif
45 113
46extern int irq_select_affinity_usr(unsigned int irq); 114extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
47 115
48extern void irq_set_thread_affinity(struct irq_desc *desc); 116extern void irq_set_thread_affinity(struct irq_desc *desc);
49 117
50#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
51static inline void irq_end(unsigned int irq, struct irq_desc *desc)
52{
53 if (desc->irq_data.chip && desc->irq_data.chip->end)
54 desc->irq_data.chip->end(irq);
55}
56#else
57static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
58#endif
59
60/* Inline functions for support of irq chips on slow busses */ 118/* Inline functions for support of irq chips on slow busses */
61static inline void chip_bus_lock(struct irq_desc *desc) 119static inline void chip_bus_lock(struct irq_desc *desc)
62{ 120{
@@ -70,43 +128,60 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
70 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); 128 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
71} 129}
72 130
131struct irq_desc *
132__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
133void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
134
135static inline struct irq_desc *
136irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
137{
138 return __irq_get_desc_lock(irq, flags, true);
139}
140
141static inline void
142irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
143{
144 __irq_put_desc_unlock(desc, flags, true);
145}
146
147static inline struct irq_desc *
148irq_get_desc_lock(unsigned int irq, unsigned long *flags)
149{
150 return __irq_get_desc_lock(irq, flags, false);
151}
152
153static inline void
154irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
155{
156 __irq_put_desc_unlock(desc, flags, false);
157}
158
73/* 159/*
74 * Debugging printout: 160 * Manipulation functions for irq_data.state
75 */ 161 */
162static inline void irqd_set_move_pending(struct irq_data *d)
163{
164 d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
165 irq_compat_set_move_pending(irq_data_to_desc(d));
166}
76 167
77#include <linux/kallsyms.h> 168static inline void irqd_clr_move_pending(struct irq_data *d)
78 169{
79#define P(f) if (desc->status & f) printk("%14s set\n", #f) 170 d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
171 irq_compat_clr_move_pending(irq_data_to_desc(d));
172}
80 173
81static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) 174static inline void irqd_clear(struct irq_data *d, unsigned int mask)
82{ 175{
83 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", 176 d->state_use_accessors &= ~mask;
84 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
85 printk("->handle_irq(): %p, ", desc->handle_irq);
86 print_symbol("%s\n", (unsigned long)desc->handle_irq);
87 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
88 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
89 printk("->action(): %p\n", desc->action);
90 if (desc->action) {
91 printk("->action->handler(): %p, ", desc->action->handler);
92 print_symbol("%s\n", (unsigned long)desc->action->handler);
93 }
94
95 P(IRQ_INPROGRESS);
96 P(IRQ_DISABLED);
97 P(IRQ_PENDING);
98 P(IRQ_REPLAY);
99 P(IRQ_AUTODETECT);
100 P(IRQ_WAITING);
101 P(IRQ_LEVEL);
102 P(IRQ_MASKED);
103#ifdef CONFIG_IRQ_PER_CPU
104 P(IRQ_PER_CPU);
105#endif
106 P(IRQ_NOPROBE);
107 P(IRQ_NOREQUEST);
108 P(IRQ_NOAUTOEN);
109} 177}
110 178
111#undef P 179static inline void irqd_set(struct irq_data *d, unsigned int mask)
180{
181 d->state_use_accessors |= mask;
182}
112 183
184static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
185{
186 return d->state_use_accessors & mask;
187}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2039bea31bdf..dbccc799407f 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -79,7 +79,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
79 desc->irq_data.chip_data = NULL; 79 desc->irq_data.chip_data = NULL;
80 desc->irq_data.handler_data = NULL; 80 desc->irq_data.handler_data = NULL;
81 desc->irq_data.msi_desc = NULL; 81 desc->irq_data.msi_desc = NULL;
82 desc->status = IRQ_DEFAULT_INIT_FLAGS; 82 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
83 desc->istate = IRQS_DISABLED;
83 desc->handle_irq = handle_bad_irq; 84 desc->handle_irq = handle_bad_irq;
84 desc->depth = 1; 85 desc->depth = 1;
85 desc->irq_count = 0; 86 desc->irq_count = 0;
@@ -206,6 +207,14 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
206 return NULL; 207 return NULL;
207} 208}
208 209
210static int irq_expand_nr_irqs(unsigned int nr)
211{
212 if (nr > IRQ_BITMAP_BITS)
213 return -ENOMEM;
214 nr_irqs = nr;
215 return 0;
216}
217
209int __init early_irq_init(void) 218int __init early_irq_init(void)
210{ 219{
211 int i, initcnt, node = first_online_node; 220 int i, initcnt, node = first_online_node;
@@ -238,7 +247,7 @@ int __init early_irq_init(void)
238 247
239struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 248struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
240 [0 ... NR_IRQS-1] = { 249 [0 ... NR_IRQS-1] = {
241 .status = IRQ_DEFAULT_INIT_FLAGS, 250 .istate = IRQS_DISABLED,
242 .handle_irq = handle_bad_irq, 251 .handle_irq = handle_bad_irq,
243 .depth = 1, 252 .depth = 1,
244 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), 253 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
@@ -260,8 +269,8 @@ int __init early_irq_init(void)
260 for (i = 0; i < count; i++) { 269 for (i = 0; i < count; i++) {
261 desc[i].irq_data.irq = i; 270 desc[i].irq_data.irq = i;
262 desc[i].irq_data.chip = &no_irq_chip; 271 desc[i].irq_data.chip = &no_irq_chip;
263 /* TODO : do this allocation on-demand ... */
264 desc[i].kstat_irqs = alloc_percpu(unsigned int); 272 desc[i].kstat_irqs = alloc_percpu(unsigned int);
273 irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
265 alloc_masks(desc + i, GFP_KERNEL, node); 274 alloc_masks(desc + i, GFP_KERNEL, node);
266 desc_smp_init(desc + i, node); 275 desc_smp_init(desc + i, node);
267 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 276 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -286,24 +295,14 @@ static void free_desc(unsigned int irq)
286 295
287static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 296static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
288{ 297{
289#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
290 struct irq_desc *desc;
291 unsigned int i;
292
293 for (i = 0; i < cnt; i++) {
294 desc = irq_to_desc(start + i);
295 if (desc && !desc->kstat_irqs) {
296 unsigned int __percpu *stats = alloc_percpu(unsigned int);
297
298 if (!stats)
299 return -1;
300 if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
301 free_percpu(stats);
302 }
303 }
304#endif
305 return start; 298 return start;
306} 299}
300
301static int irq_expand_nr_irqs(unsigned int nr)
302{
303 return -ENOMEM;
304}
305
307#endif /* !CONFIG_SPARSE_IRQ */ 306#endif /* !CONFIG_SPARSE_IRQ */
308 307
309/* Dynamic interrupt handling */ 308/* Dynamic interrupt handling */
@@ -347,14 +346,17 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
347 346
348 mutex_lock(&sparse_irq_lock); 347 mutex_lock(&sparse_irq_lock);
349 348
350 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); 349 start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
350 from, cnt, 0);
351 ret = -EEXIST; 351 ret = -EEXIST;
352 if (irq >=0 && start != irq) 352 if (irq >=0 && start != irq)
353 goto err; 353 goto err;
354 354
355 ret = -ENOMEM; 355 if (start + cnt > nr_irqs) {
356 if (start >= nr_irqs) 356 ret = irq_expand_nr_irqs(start + cnt);
357 goto err; 357 if (ret)
358 goto err;
359 }
358 360
359 bitmap_set(allocated_irqs, start, cnt); 361 bitmap_set(allocated_irqs, start, cnt);
360 mutex_unlock(&sparse_irq_lock); 362 mutex_unlock(&sparse_irq_lock);
@@ -401,6 +403,26 @@ unsigned int irq_get_next_irq(unsigned int offset)
401 return find_next_bit(allocated_irqs, nr_irqs, offset); 403 return find_next_bit(allocated_irqs, nr_irqs, offset);
402} 404}
403 405
406struct irq_desc *
407__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
408{
409 struct irq_desc *desc = irq_to_desc(irq);
410
411 if (desc) {
412 if (bus)
413 chip_bus_lock(desc);
414 raw_spin_lock_irqsave(&desc->lock, *flags);
415 }
416 return desc;
417}
418
419void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
420{
421 raw_spin_unlock_irqrestore(&desc->lock, flags);
422 if (bus)
423 chip_bus_sync_unlock(desc);
424}
425
404/** 426/**
405 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 427 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
406 * @irq: irq number to initialize 428 * @irq: irq number to initialize
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9033c1c70828..acd599a43bfb 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,17 @@
17 17
18#include "internals.h" 18#include "internals.h"
19 19
20#ifdef CONFIG_IRQ_FORCED_THREADING
21__read_mostly bool force_irqthreads;
22
23static int __init setup_forced_irqthreads(char *arg)
24{
25 force_irqthreads = true;
26 return 0;
27}
28early_param("threadirqs", setup_forced_irqthreads);
29#endif
30
20/** 31/**
21 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 32 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
22 * @irq: interrupt number to wait for 33 * @irq: interrupt number to wait for
@@ -30,7 +41,7 @@
30void synchronize_irq(unsigned int irq) 41void synchronize_irq(unsigned int irq)
31{ 42{
32 struct irq_desc *desc = irq_to_desc(irq); 43 struct irq_desc *desc = irq_to_desc(irq);
33 unsigned int status; 44 unsigned int state;
34 45
35 if (!desc) 46 if (!desc)
36 return; 47 return;
@@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq)
42 * Wait until we're out of the critical section. This might 53 * Wait until we're out of the critical section. This might
43 * give the wrong answer due to the lack of memory barriers. 54 * give the wrong answer due to the lack of memory barriers.
44 */ 55 */
45 while (desc->status & IRQ_INPROGRESS) 56 while (desc->istate & IRQS_INPROGRESS)
46 cpu_relax(); 57 cpu_relax();
47 58
48 /* Ok, that indicated we're done: double-check carefully. */ 59 /* Ok, that indicated we're done: double-check carefully. */
49 raw_spin_lock_irqsave(&desc->lock, flags); 60 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 61 state = desc->istate;
51 raw_spin_unlock_irqrestore(&desc->lock, flags); 62 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 63
53 /* Oops, that failed? */ 64 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 65 } while (state & IRQS_INPROGRESS);
55 66
56 /* 67 /*
57 * We made sure that no hardirq handler is running. Now verify 68 * We made sure that no hardirq handler is running. Now verify
@@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq)
73{ 84{
74 struct irq_desc *desc = irq_to_desc(irq); 85 struct irq_desc *desc = irq_to_desc(irq);
75 86
76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || 87 if (!desc || !irqd_can_balance(&desc->irq_data) ||
77 !desc->irq_data.chip->irq_set_affinity) 88 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
78 return 0; 89 return 0;
79 90
80 return 1; 91 return 1;
@@ -100,67 +111,169 @@ void irq_set_thread_affinity(struct irq_desc *desc)
100 } 111 }
101} 112}
102 113
114#ifdef CONFIG_GENERIC_PENDING_IRQ
115static inline bool irq_can_move_pcntxt(struct irq_desc *desc)
116{
117 return irq_settings_can_move_pcntxt(desc);
118}
119static inline bool irq_move_pending(struct irq_desc *desc)
120{
121 return irqd_is_setaffinity_pending(&desc->irq_data);
122}
123static inline void
124irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
125{
126 cpumask_copy(desc->pending_mask, mask);
127}
128static inline void
129irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
130{
131 cpumask_copy(mask, desc->pending_mask);
132}
133#else
134static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; }
135static inline bool irq_move_pending(struct irq_desc *desc) { return false; }
136static inline void
137irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
138static inline void
139irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
140#endif
141
103/** 142/**
104 * irq_set_affinity - Set the irq affinity of a given irq 143 * irq_set_affinity - Set the irq affinity of a given irq
105 * @irq: Interrupt to set affinity 144 * @irq: Interrupt to set affinity
106 * @cpumask: cpumask 145 * @cpumask: cpumask
107 * 146 *
108 */ 147 */
109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) 148int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
110{ 149{
111 struct irq_desc *desc = irq_to_desc(irq); 150 struct irq_desc *desc = irq_to_desc(irq);
112 struct irq_chip *chip = desc->irq_data.chip; 151 struct irq_chip *chip = desc->irq_data.chip;
113 unsigned long flags; 152 unsigned long flags;
153 int ret = 0;
114 154
115 if (!chip->irq_set_affinity) 155 if (!chip->irq_set_affinity)
116 return -EINVAL; 156 return -EINVAL;
117 157
118 raw_spin_lock_irqsave(&desc->lock, flags); 158 raw_spin_lock_irqsave(&desc->lock, flags);
119 159
120#ifdef CONFIG_GENERIC_PENDING_IRQ 160 if (irq_can_move_pcntxt(desc)) {
121 if (desc->status & IRQ_MOVE_PCNTXT) { 161 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
122 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { 162 switch (ret) {
123 cpumask_copy(desc->irq_data.affinity, cpumask); 163 case IRQ_SET_MASK_OK:
164 cpumask_copy(desc->irq_data.affinity, mask);
165 case IRQ_SET_MASK_OK_NOCOPY:
124 irq_set_thread_affinity(desc); 166 irq_set_thread_affinity(desc);
167 ret = 0;
125 } 168 }
169 } else {
170 irqd_set_move_pending(&desc->irq_data);
171 irq_copy_pending(desc, mask);
126 } 172 }
127 else { 173
128 desc->status |= IRQ_MOVE_PENDING; 174 if (desc->affinity_notify) {
129 cpumask_copy(desc->pending_mask, cpumask); 175 kref_get(&desc->affinity_notify->kref);
130 } 176 schedule_work(&desc->affinity_notify->work);
131#else
132 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
133 cpumask_copy(desc->irq_data.affinity, cpumask);
134 irq_set_thread_affinity(desc);
135 } 177 }
136#endif 178 irq_compat_set_affinity(desc);
137 desc->status |= IRQ_AFFINITY_SET; 179 irqd_set(&desc->irq_data, IRQD_AFFINITY_SET);
138 raw_spin_unlock_irqrestore(&desc->lock, flags); 180 raw_spin_unlock_irqrestore(&desc->lock, flags);
139 return 0; 181 return ret;
140} 182}
141 183
142int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) 184int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
143{ 185{
186 unsigned long flags;
187 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
188
189 if (!desc)
190 return -EINVAL;
191 desc->affinity_hint = m;
192 irq_put_desc_unlock(desc, flags);
193 return 0;
194}
195EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
196
197static void irq_affinity_notify(struct work_struct *work)
198{
199 struct irq_affinity_notify *notify =
200 container_of(work, struct irq_affinity_notify, work);
201 struct irq_desc *desc = irq_to_desc(notify->irq);
202 cpumask_var_t cpumask;
203 unsigned long flags;
204
205 if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
206 goto out;
207
208 raw_spin_lock_irqsave(&desc->lock, flags);
209 if (irq_move_pending(desc))
210 irq_get_pending(cpumask, desc);
211 else
212 cpumask_copy(cpumask, desc->irq_data.affinity);
213 raw_spin_unlock_irqrestore(&desc->lock, flags);
214
215 notify->notify(notify, cpumask);
216
217 free_cpumask_var(cpumask);
218out:
219 kref_put(&notify->kref, notify->release);
220}
221
222/**
223 * irq_set_affinity_notifier - control notification of IRQ affinity changes
224 * @irq: Interrupt for which to enable/disable notification
225 * @notify: Context for notification, or %NULL to disable
226 * notification. Function pointers must be initialised;
227 * the other fields will be initialised by this function.
228 *
229 * Must be called in process context. Notification may only be enabled
230 * after the IRQ is allocated and must be disabled before the IRQ is
231 * freed using free_irq().
232 */
233int
234irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
235{
144 struct irq_desc *desc = irq_to_desc(irq); 236 struct irq_desc *desc = irq_to_desc(irq);
237 struct irq_affinity_notify *old_notify;
145 unsigned long flags; 238 unsigned long flags;
146 239
240 /* The release function is promised process context */
241 might_sleep();
242
147 if (!desc) 243 if (!desc)
148 return -EINVAL; 244 return -EINVAL;
149 245
246 /* Complete initialisation of *notify */
247 if (notify) {
248 notify->irq = irq;
249 kref_init(&notify->kref);
250 INIT_WORK(&notify->work, irq_affinity_notify);
251 }
252
150 raw_spin_lock_irqsave(&desc->lock, flags); 253 raw_spin_lock_irqsave(&desc->lock, flags);
151 desc->affinity_hint = m; 254 old_notify = desc->affinity_notify;
255 desc->affinity_notify = notify;
152 raw_spin_unlock_irqrestore(&desc->lock, flags); 256 raw_spin_unlock_irqrestore(&desc->lock, flags);
153 257
258 if (old_notify)
259 kref_put(&old_notify->kref, old_notify->release);
260
154 return 0; 261 return 0;
155} 262}
156EXPORT_SYMBOL_GPL(irq_set_affinity_hint); 263EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
157 264
158#ifndef CONFIG_AUTO_IRQ_AFFINITY 265#ifndef CONFIG_AUTO_IRQ_AFFINITY
159/* 266/*
160 * Generic version of the affinity autoselector. 267 * Generic version of the affinity autoselector.
161 */ 268 */
162static int setup_affinity(unsigned int irq, struct irq_desc *desc) 269static int
270setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
163{ 271{
272 struct irq_chip *chip = irq_desc_get_chip(desc);
273 struct cpumask *set = irq_default_affinity;
274 int ret;
275
276 /* Excludes PER_CPU and NO_BALANCE interrupts */
164 if (!irq_can_set_affinity(irq)) 277 if (!irq_can_set_affinity(irq))
165 return 0; 278 return 0;
166 279
@@ -168,22 +281,29 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
168 * Preserve an userspace affinity setup, but make sure that 281 * Preserve an userspace affinity setup, but make sure that
169 * one of the targets is online. 282 * one of the targets is online.
170 */ 283 */
171 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 284 if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
172 if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) 285 if (cpumask_intersects(desc->irq_data.affinity,
173 < nr_cpu_ids) 286 cpu_online_mask))
174 goto set_affinity; 287 set = desc->irq_data.affinity;
175 else 288 else {
176 desc->status &= ~IRQ_AFFINITY_SET; 289 irq_compat_clr_affinity(desc);
290 irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
291 }
177 } 292 }
178 293
179 cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); 294 cpumask_and(mask, cpu_online_mask, set);
180set_affinity: 295 ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
181 desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); 296 switch (ret) {
182 297 case IRQ_SET_MASK_OK:
298 cpumask_copy(desc->irq_data.affinity, mask);
299 case IRQ_SET_MASK_OK_NOCOPY:
300 irq_set_thread_affinity(desc);
301 }
183 return 0; 302 return 0;
184} 303}
185#else 304#else
186static inline int setup_affinity(unsigned int irq, struct irq_desc *d) 305static inline int
306setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
187{ 307{
188 return irq_select_affinity(irq); 308 return irq_select_affinity(irq);
189} 309}
@@ -192,23 +312,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
192/* 312/*
193 * Called when affinity is set via /proc/irq 313 * Called when affinity is set via /proc/irq
194 */ 314 */
195int irq_select_affinity_usr(unsigned int irq) 315int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
196{ 316{
197 struct irq_desc *desc = irq_to_desc(irq); 317 struct irq_desc *desc = irq_to_desc(irq);
198 unsigned long flags; 318 unsigned long flags;
199 int ret; 319 int ret;
200 320
201 raw_spin_lock_irqsave(&desc->lock, flags); 321 raw_spin_lock_irqsave(&desc->lock, flags);
202 ret = setup_affinity(irq, desc); 322 ret = setup_affinity(irq, desc, mask);
203 if (!ret)
204 irq_set_thread_affinity(desc);
205 raw_spin_unlock_irqrestore(&desc->lock, flags); 323 raw_spin_unlock_irqrestore(&desc->lock, flags);
206
207 return ret; 324 return ret;
208} 325}
209 326
210#else 327#else
211static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) 328static inline int
329setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
212{ 330{
213 return 0; 331 return 0;
214} 332}
@@ -219,13 +337,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
219 if (suspend) { 337 if (suspend) {
220 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) 338 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
221 return; 339 return;
222 desc->status |= IRQ_SUSPENDED; 340 desc->istate |= IRQS_SUSPENDED;
223 } 341 }
224 342
225 if (!desc->depth++) { 343 if (!desc->depth++)
226 desc->status |= IRQ_DISABLED; 344 irq_disable(desc);
227 desc->irq_data.chip->irq_disable(&desc->irq_data); 345}
228 } 346
347static int __disable_irq_nosync(unsigned int irq)
348{
349 unsigned long flags;
350 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
351
352 if (!desc)
353 return -EINVAL;
354 __disable_irq(desc, irq, false);
355 irq_put_desc_busunlock(desc, flags);
356 return 0;
229} 357}
230 358
231/** 359/**
@@ -241,17 +369,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
241 */ 369 */
242void disable_irq_nosync(unsigned int irq) 370void disable_irq_nosync(unsigned int irq)
243{ 371{
244 struct irq_desc *desc = irq_to_desc(irq); 372 __disable_irq_nosync(irq);
245 unsigned long flags;
246
247 if (!desc)
248 return;
249
250 chip_bus_lock(desc);
251 raw_spin_lock_irqsave(&desc->lock, flags);
252 __disable_irq(desc, irq, false);
253 raw_spin_unlock_irqrestore(&desc->lock, flags);
254 chip_bus_sync_unlock(desc);
255} 373}
256EXPORT_SYMBOL(disable_irq_nosync); 374EXPORT_SYMBOL(disable_irq_nosync);
257 375
@@ -269,21 +387,24 @@ EXPORT_SYMBOL(disable_irq_nosync);
269 */ 387 */
270void disable_irq(unsigned int irq) 388void disable_irq(unsigned int irq)
271{ 389{
272 struct irq_desc *desc = irq_to_desc(irq); 390 if (!__disable_irq_nosync(irq))
273
274 if (!desc)
275 return;
276
277 disable_irq_nosync(irq);
278 if (desc->action)
279 synchronize_irq(irq); 391 synchronize_irq(irq);
280} 392}
281EXPORT_SYMBOL(disable_irq); 393EXPORT_SYMBOL(disable_irq);
282 394
283void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) 395void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
284{ 396{
285 if (resume) 397 if (resume) {
286 desc->status &= ~IRQ_SUSPENDED; 398 if (!(desc->istate & IRQS_SUSPENDED)) {
399 if (!desc->action)
400 return;
401 if (!(desc->action->flags & IRQF_FORCE_RESUME))
402 return;
403 /* Pretend that it got disabled ! */
404 desc->depth++;
405 }
406 desc->istate &= ~IRQS_SUSPENDED;
407 }
287 408
288 switch (desc->depth) { 409 switch (desc->depth) {
289 case 0: 410 case 0:
@@ -291,12 +412,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
291 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 412 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
292 break; 413 break;
293 case 1: { 414 case 1: {
294 unsigned int status = desc->status & ~IRQ_DISABLED; 415 if (desc->istate & IRQS_SUSPENDED)
295
296 if (desc->status & IRQ_SUSPENDED)
297 goto err_out; 416 goto err_out;
298 /* Prevent probing on this irq: */ 417 /* Prevent probing on this irq: */
299 desc->status = status | IRQ_NOPROBE; 418 irq_settings_set_noprobe(desc);
419 irq_enable(desc);
300 check_irq_resend(desc, irq); 420 check_irq_resend(desc, irq);
301 /* fall-through */ 421 /* fall-through */
302 } 422 }
@@ -318,21 +438,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
318 */ 438 */
319void enable_irq(unsigned int irq) 439void enable_irq(unsigned int irq)
320{ 440{
321 struct irq_desc *desc = irq_to_desc(irq);
322 unsigned long flags; 441 unsigned long flags;
442 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
323 443
324 if (!desc) 444 if (!desc)
325 return; 445 return;
446 if (WARN(!desc->irq_data.chip,
447 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
448 goto out;
326 449
327 if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
328 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
329 return;
330
331 chip_bus_lock(desc);
332 raw_spin_lock_irqsave(&desc->lock, flags);
333 __enable_irq(desc, irq, false); 450 __enable_irq(desc, irq, false);
334 raw_spin_unlock_irqrestore(&desc->lock, flags); 451out:
335 chip_bus_sync_unlock(desc); 452 irq_put_desc_busunlock(desc, flags);
336} 453}
337EXPORT_SYMBOL(enable_irq); 454EXPORT_SYMBOL(enable_irq);
338 455
@@ -348,7 +465,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
348} 465}
349 466
350/** 467/**
351 * set_irq_wake - control irq power management wakeup 468 * irq_set_irq_wake - control irq power management wakeup
352 * @irq: interrupt to control 469 * @irq: interrupt to control
353 * @on: enable/disable power management wakeup 470 * @on: enable/disable power management wakeup
354 * 471 *
@@ -359,23 +476,22 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
359 * Wakeup mode lets this IRQ wake the system from sleep 476 * Wakeup mode lets this IRQ wake the system from sleep
360 * states like "suspend to RAM". 477 * states like "suspend to RAM".
361 */ 478 */
362int set_irq_wake(unsigned int irq, unsigned int on) 479int irq_set_irq_wake(unsigned int irq, unsigned int on)
363{ 480{
364 struct irq_desc *desc = irq_to_desc(irq);
365 unsigned long flags; 481 unsigned long flags;
482 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
366 int ret = 0; 483 int ret = 0;
367 484
368 /* wakeup-capable irqs can be shared between drivers that 485 /* wakeup-capable irqs can be shared between drivers that
369 * don't need to have the same sleep mode behaviors. 486 * don't need to have the same sleep mode behaviors.
370 */ 487 */
371 raw_spin_lock_irqsave(&desc->lock, flags);
372 if (on) { 488 if (on) {
373 if (desc->wake_depth++ == 0) { 489 if (desc->wake_depth++ == 0) {
374 ret = set_irq_wake_real(irq, on); 490 ret = set_irq_wake_real(irq, on);
375 if (ret) 491 if (ret)
376 desc->wake_depth = 0; 492 desc->wake_depth = 0;
377 else 493 else
378 desc->status |= IRQ_WAKEUP; 494 irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
379 } 495 }
380 } else { 496 } else {
381 if (desc->wake_depth == 0) { 497 if (desc->wake_depth == 0) {
@@ -385,14 +501,13 @@ int set_irq_wake(unsigned int irq, unsigned int on)
385 if (ret) 501 if (ret)
386 desc->wake_depth = 1; 502 desc->wake_depth = 1;
387 else 503 else
388 desc->status &= ~IRQ_WAKEUP; 504 irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
389 } 505 }
390 } 506 }
391 507 irq_put_desc_busunlock(desc, flags);
392 raw_spin_unlock_irqrestore(&desc->lock, flags);
393 return ret; 508 return ret;
394} 509}
395EXPORT_SYMBOL(set_irq_wake); 510EXPORT_SYMBOL(irq_set_irq_wake);
396 511
397/* 512/*
398 * Internal function that tells the architecture code whether a 513 * Internal function that tells the architecture code whether a
@@ -401,43 +516,27 @@ EXPORT_SYMBOL(set_irq_wake);
401 */ 516 */
402int can_request_irq(unsigned int irq, unsigned long irqflags) 517int can_request_irq(unsigned int irq, unsigned long irqflags)
403{ 518{
404 struct irq_desc *desc = irq_to_desc(irq);
405 struct irqaction *action;
406 unsigned long flags; 519 unsigned long flags;
520 struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
521 int canrequest = 0;
407 522
408 if (!desc) 523 if (!desc)
409 return 0; 524 return 0;
410 525
411 if (desc->status & IRQ_NOREQUEST) 526 if (irq_settings_can_request(desc)) {
412 return 0; 527 if (desc->action)
413 528 if (irqflags & desc->action->flags & IRQF_SHARED)
414 raw_spin_lock_irqsave(&desc->lock, flags); 529 canrequest =1;
415 action = desc->action; 530 }
416 if (action) 531 irq_put_desc_unlock(desc, flags);
417 if (irqflags & action->flags & IRQF_SHARED) 532 return canrequest;
418 action = NULL;
419
420 raw_spin_unlock_irqrestore(&desc->lock, flags);
421
422 return !action;
423}
424
425void compat_irq_chip_set_default_handler(struct irq_desc *desc)
426{
427 /*
428 * If the architecture still has not overriden
429 * the flow handler then zap the default. This
430 * should catch incorrect flow-type setting.
431 */
432 if (desc->handle_irq == &handle_bad_irq)
433 desc->handle_irq = NULL;
434} 533}
435 534
436int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 535int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
437 unsigned long flags) 536 unsigned long flags)
438{ 537{
439 int ret;
440 struct irq_chip *chip = desc->irq_data.chip; 538 struct irq_chip *chip = desc->irq_data.chip;
539 int ret, unmask = 0;
441 540
442 if (!chip || !chip->irq_set_type) { 541 if (!chip || !chip->irq_set_type) {
443 /* 542 /*
@@ -449,23 +548,43 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
449 return 0; 548 return 0;
450 } 549 }
451 550
551 flags &= IRQ_TYPE_SENSE_MASK;
552
553 if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
554 if (!(desc->istate & IRQS_MASKED))
555 mask_irq(desc);
556 if (!(desc->istate & IRQS_DISABLED))
557 unmask = 1;
558 }
559
452 /* caller masked out all except trigger mode flags */ 560 /* caller masked out all except trigger mode flags */
453 ret = chip->irq_set_type(&desc->irq_data, flags); 561 ret = chip->irq_set_type(&desc->irq_data, flags);
454 562
455 if (ret) 563 switch (ret) {
456 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", 564 case IRQ_SET_MASK_OK:
457 flags, irq, chip->irq_set_type); 565 irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
458 else { 566 irqd_set(&desc->irq_data, flags);
459 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) 567
460 flags |= IRQ_LEVEL; 568 case IRQ_SET_MASK_OK_NOCOPY:
461 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ 569 flags = irqd_get_trigger_type(&desc->irq_data);
462 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); 570 irq_settings_set_trigger_mask(desc, flags);
463 desc->status |= flags; 571 irqd_clear(&desc->irq_data, IRQD_LEVEL);
572 irq_settings_clr_level(desc);
573 if (flags & IRQ_TYPE_LEVEL_MASK) {
574 irq_settings_set_level(desc);
575 irqd_set(&desc->irq_data, IRQD_LEVEL);
576 }
464 577
465 if (chip != desc->irq_data.chip) 578 if (chip != desc->irq_data.chip)
466 irq_chip_set_defaults(desc->irq_data.chip); 579 irq_chip_set_defaults(desc->irq_data.chip);
580 ret = 0;
581 break;
582 default:
583 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
584 flags, irq, chip->irq_set_type);
467 } 585 }
468 586 if (unmask)
587 unmask_irq(desc);
469 return ret; 588 return ret;
470} 589}
471 590
@@ -509,8 +628,11 @@ static int irq_wait_for_interrupt(struct irqaction *action)
509 * handler finished. unmask if the interrupt has not been disabled and 628 * handler finished. unmask if the interrupt has not been disabled and
510 * is marked MASKED. 629 * is marked MASKED.
511 */ 630 */
512static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 631static void irq_finalize_oneshot(struct irq_desc *desc,
632 struct irqaction *action, bool force)
513{ 633{
634 if (!(desc->istate & IRQS_ONESHOT))
635 return;
514again: 636again:
515 chip_bus_lock(desc); 637 chip_bus_lock(desc);
516 raw_spin_lock_irq(&desc->lock); 638 raw_spin_lock_irq(&desc->lock);
@@ -522,26 +644,44 @@ again:
522 * The thread is faster done than the hard interrupt handler 644 * The thread is faster done than the hard interrupt handler
523 * on the other CPU. If we unmask the irq line then the 645 * on the other CPU. If we unmask the irq line then the
524 * interrupt can come in again and masks the line, leaves due 646 * interrupt can come in again and masks the line, leaves due
525 * to IRQ_INPROGRESS and the irq line is masked forever. 647 * to IRQS_INPROGRESS and the irq line is masked forever.
648 *
649 * This also serializes the state of shared oneshot handlers
650 * versus "desc->threads_onehsot |= action->thread_mask;" in
651 * irq_wake_thread(). See the comment there which explains the
652 * serialization.
526 */ 653 */
527 if (unlikely(desc->status & IRQ_INPROGRESS)) { 654 if (unlikely(desc->istate & IRQS_INPROGRESS)) {
528 raw_spin_unlock_irq(&desc->lock); 655 raw_spin_unlock_irq(&desc->lock);
529 chip_bus_sync_unlock(desc); 656 chip_bus_sync_unlock(desc);
530 cpu_relax(); 657 cpu_relax();
531 goto again; 658 goto again;
532 } 659 }
533 660
534 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 661 /*
535 desc->status &= ~IRQ_MASKED; 662 * Now check again, whether the thread should run. Otherwise
663 * we would clear the threads_oneshot bit of this thread which
664 * was just set.
665 */
666 if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
667 goto out_unlock;
668
669 desc->threads_oneshot &= ~action->thread_mask;
670
671 if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) &&
672 (desc->istate & IRQS_MASKED)) {
673 irq_compat_clr_masked(desc);
674 desc->istate &= ~IRQS_MASKED;
536 desc->irq_data.chip->irq_unmask(&desc->irq_data); 675 desc->irq_data.chip->irq_unmask(&desc->irq_data);
537 } 676 }
677out_unlock:
538 raw_spin_unlock_irq(&desc->lock); 678 raw_spin_unlock_irq(&desc->lock);
539 chip_bus_sync_unlock(desc); 679 chip_bus_sync_unlock(desc);
540} 680}
541 681
542#ifdef CONFIG_SMP 682#ifdef CONFIG_SMP
543/* 683/*
544 * Check whether we need to change the affinity of the interrupt thread. 684 * Check whether we need to chasnge the affinity of the interrupt thread.
545 */ 685 */
546static void 686static void
547irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) 687irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
@@ -573,6 +713,32 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
573#endif 713#endif
574 714
575/* 715/*
716 * Interrupts which are not explicitely requested as threaded
717 * interrupts rely on the implicit bh/preempt disable of the hard irq
718 * context. So we need to disable bh here to avoid deadlocks and other
719 * side effects.
720 */
721static void
722irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
723{
724 local_bh_disable();
725 action->thread_fn(action->irq, action->dev_id);
726 irq_finalize_oneshot(desc, action, false);
727 local_bh_enable();
728}
729
730/*
731 * Interrupts explicitely requested as threaded interupts want to be
732 * preemtible - many of them need to sleep and wait for slow busses to
733 * complete.
734 */
735static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
736{
737 action->thread_fn(action->irq, action->dev_id);
738 irq_finalize_oneshot(desc, action, false);
739}
740
741/*
576 * Interrupt handler thread 742 * Interrupt handler thread
577 */ 743 */
578static int irq_thread(void *data) 744static int irq_thread(void *data)
@@ -582,7 +748,14 @@ static int irq_thread(void *data)
582 }; 748 };
583 struct irqaction *action = data; 749 struct irqaction *action = data;
584 struct irq_desc *desc = irq_to_desc(action->irq); 750 struct irq_desc *desc = irq_to_desc(action->irq);
585 int wake, oneshot = desc->status & IRQ_ONESHOT; 751 void (*handler_fn)(struct irq_desc *desc, struct irqaction *action);
752 int wake;
753
754 if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
755 &action->thread_flags))
756 handler_fn = irq_forced_thread_fn;
757 else
758 handler_fn = irq_thread_fn;
586 759
587 sched_setscheduler(current, SCHED_FIFO, &param); 760 sched_setscheduler(current, SCHED_FIFO, &param);
588 current->irqaction = action; 761 current->irqaction = action;
@@ -594,23 +767,20 @@ static int irq_thread(void *data)
594 atomic_inc(&desc->threads_active); 767 atomic_inc(&desc->threads_active);
595 768
596 raw_spin_lock_irq(&desc->lock); 769 raw_spin_lock_irq(&desc->lock);
597 if (unlikely(desc->status & IRQ_DISABLED)) { 770 if (unlikely(desc->istate & IRQS_DISABLED)) {
598 /* 771 /*
599 * CHECKME: We might need a dedicated 772 * CHECKME: We might need a dedicated
600 * IRQ_THREAD_PENDING flag here, which 773 * IRQ_THREAD_PENDING flag here, which
601 * retriggers the thread in check_irq_resend() 774 * retriggers the thread in check_irq_resend()
602 * but AFAICT IRQ_PENDING should be fine as it 775 * but AFAICT IRQS_PENDING should be fine as it
603 * retriggers the interrupt itself --- tglx 776 * retriggers the interrupt itself --- tglx
604 */ 777 */
605 desc->status |= IRQ_PENDING; 778 irq_compat_set_pending(desc);
779 desc->istate |= IRQS_PENDING;
606 raw_spin_unlock_irq(&desc->lock); 780 raw_spin_unlock_irq(&desc->lock);
607 } else { 781 } else {
608 raw_spin_unlock_irq(&desc->lock); 782 raw_spin_unlock_irq(&desc->lock);
609 783 handler_fn(desc, action);
610 action->thread_fn(action->irq, action->dev_id);
611
612 if (oneshot)
613 irq_finalize_oneshot(action->irq, desc);
614 } 784 }
615 785
616 wake = atomic_dec_and_test(&desc->threads_active); 786 wake = atomic_dec_and_test(&desc->threads_active);
@@ -619,6 +789,9 @@ static int irq_thread(void *data)
619 wake_up(&desc->wait_for_threads); 789 wake_up(&desc->wait_for_threads);
620 } 790 }
621 791
792 /* Prevent a stale desc->threads_oneshot */
793 irq_finalize_oneshot(desc, action, true);
794
622 /* 795 /*
623 * Clear irqaction. Otherwise exit_irq_thread() would make 796 * Clear irqaction. Otherwise exit_irq_thread() would make
624 * fuzz about an active irq thread going into nirvana. 797 * fuzz about an active irq thread going into nirvana.
@@ -633,6 +806,7 @@ static int irq_thread(void *data)
633void exit_irq_thread(void) 806void exit_irq_thread(void)
634{ 807{
635 struct task_struct *tsk = current; 808 struct task_struct *tsk = current;
809 struct irq_desc *desc;
636 810
637 if (!tsk->irqaction) 811 if (!tsk->irqaction)
638 return; 812 return;
@@ -641,6 +815,14 @@ void exit_irq_thread(void)
641 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 815 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
642 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); 816 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
643 817
818 desc = irq_to_desc(tsk->irqaction->irq);
819
820 /*
821 * Prevent a stale desc->threads_oneshot. Must be called
822 * before setting the IRQTF_DIED flag.
823 */
824 irq_finalize_oneshot(desc, tsk->irqaction, true);
825
644 /* 826 /*
645 * Set the THREAD DIED flag to prevent further wakeups of the 827 * Set the THREAD DIED flag to prevent further wakeups of the
646 * soon to be gone threaded handler. 828 * soon to be gone threaded handler.
@@ -648,6 +830,22 @@ void exit_irq_thread(void)
648 set_bit(IRQTF_DIED, &tsk->irqaction->flags); 830 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
649} 831}
650 832
833static void irq_setup_forced_threading(struct irqaction *new)
834{
835 if (!force_irqthreads)
836 return;
837 if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
838 return;
839
840 new->flags |= IRQF_ONESHOT;
841
842 if (!new->thread_fn) {
843 set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
844 new->thread_fn = new->handler;
845 new->handler = irq_default_primary_handler;
846 }
847}
848
651/* 849/*
652 * Internal function to register an irqaction - typically used to 850 * Internal function to register an irqaction - typically used to
653 * allocate special interrupts that are part of the architecture. 851 * allocate special interrupts that are part of the architecture.
@@ -657,9 +855,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
657{ 855{
658 struct irqaction *old, **old_ptr; 856 struct irqaction *old, **old_ptr;
659 const char *old_name = NULL; 857 const char *old_name = NULL;
660 unsigned long flags; 858 unsigned long flags, thread_mask = 0;
661 int nested, shared = 0; 859 int ret, nested, shared = 0;
662 int ret; 860 cpumask_var_t mask;
663 861
664 if (!desc) 862 if (!desc)
665 return -EINVAL; 863 return -EINVAL;
@@ -683,15 +881,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
683 rand_initialize_irq(irq); 881 rand_initialize_irq(irq);
684 } 882 }
685 883
686 /* Oneshot interrupts are not allowed with shared */
687 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
688 return -EINVAL;
689
690 /* 884 /*
691 * Check whether the interrupt nests into another interrupt 885 * Check whether the interrupt nests into another interrupt
692 * thread. 886 * thread.
693 */ 887 */
694 nested = desc->status & IRQ_NESTED_THREAD; 888 nested = irq_settings_is_nested_thread(desc);
695 if (nested) { 889 if (nested) {
696 if (!new->thread_fn) 890 if (!new->thread_fn)
697 return -EINVAL; 891 return -EINVAL;
@@ -701,6 +895,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
701 * dummy function which warns when called. 895 * dummy function which warns when called.
702 */ 896 */
703 new->handler = irq_nested_primary_handler; 897 new->handler = irq_nested_primary_handler;
898 } else {
899 irq_setup_forced_threading(new);
704 } 900 }
705 901
706 /* 902 /*
@@ -724,6 +920,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
724 new->thread = t; 920 new->thread = t;
725 } 921 }
726 922
923 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
924 ret = -ENOMEM;
925 goto out_thread;
926 }
927
727 /* 928 /*
728 * The following block of code has to be executed atomically 929 * The following block of code has to be executed atomically
729 */ 930 */
@@ -735,29 +936,40 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
735 * Can't share interrupts unless both agree to and are 936 * Can't share interrupts unless both agree to and are
736 * the same type (level, edge, polarity). So both flag 937 * the same type (level, edge, polarity). So both flag
737 * fields must have IRQF_SHARED set and the bits which 938 * fields must have IRQF_SHARED set and the bits which
738 * set the trigger type must match. 939 * set the trigger type must match. Also all must
940 * agree on ONESHOT.
739 */ 941 */
740 if (!((old->flags & new->flags) & IRQF_SHARED) || 942 if (!((old->flags & new->flags) & IRQF_SHARED) ||
741 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { 943 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
944 ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
742 old_name = old->name; 945 old_name = old->name;
743 goto mismatch; 946 goto mismatch;
744 } 947 }
745 948
746#if defined(CONFIG_IRQ_PER_CPU)
747 /* All handlers must agree on per-cpuness */ 949 /* All handlers must agree on per-cpuness */
748 if ((old->flags & IRQF_PERCPU) != 950 if ((old->flags & IRQF_PERCPU) !=
749 (new->flags & IRQF_PERCPU)) 951 (new->flags & IRQF_PERCPU))
750 goto mismatch; 952 goto mismatch;
751#endif
752 953
753 /* add new interrupt at end of irq queue */ 954 /* add new interrupt at end of irq queue */
754 do { 955 do {
956 thread_mask |= old->thread_mask;
755 old_ptr = &old->next; 957 old_ptr = &old->next;
756 old = *old_ptr; 958 old = *old_ptr;
757 } while (old); 959 } while (old);
758 shared = 1; 960 shared = 1;
759 } 961 }
760 962
963 /*
964 * Setup the thread mask for this irqaction. Unlikely to have
965 * 32 resp 64 irqs sharing one line, but who knows.
966 */
967 if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) {
968 ret = -EBUSY;
969 goto out_mask;
970 }
971 new->thread_mask = 1 << ffz(thread_mask);
972
761 if (!shared) { 973 if (!shared) {
762 irq_chip_set_defaults(desc->irq_data.chip); 974 irq_chip_set_defaults(desc->irq_data.chip);
763 975
@@ -769,42 +981,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
769 new->flags & IRQF_TRIGGER_MASK); 981 new->flags & IRQF_TRIGGER_MASK);
770 982
771 if (ret) 983 if (ret)
772 goto out_thread; 984 goto out_mask;
773 } else 985 }
774 compat_irq_chip_set_default_handler(desc);
775#if defined(CONFIG_IRQ_PER_CPU)
776 if (new->flags & IRQF_PERCPU)
777 desc->status |= IRQ_PER_CPU;
778#endif
779 986
780 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | 987 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
781 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 988 IRQS_INPROGRESS | IRQS_ONESHOT | \
989 IRQS_WAITING);
990
991 if (new->flags & IRQF_PERCPU) {
992 irqd_set(&desc->irq_data, IRQD_PER_CPU);
993 irq_settings_set_per_cpu(desc);
994 }
782 995
783 if (new->flags & IRQF_ONESHOT) 996 if (new->flags & IRQF_ONESHOT)
784 desc->status |= IRQ_ONESHOT; 997 desc->istate |= IRQS_ONESHOT;
785 998
786 if (!(desc->status & IRQ_NOAUTOEN)) { 999 if (irq_settings_can_autoenable(desc))
787 desc->depth = 0; 1000 irq_startup(desc);
788 desc->status &= ~IRQ_DISABLED; 1001 else
789 desc->irq_data.chip->irq_startup(&desc->irq_data);
790 } else
791 /* Undo nested disables: */ 1002 /* Undo nested disables: */
792 desc->depth = 1; 1003 desc->depth = 1;
793 1004
794 /* Exclude IRQ from balancing if requested */ 1005 /* Exclude IRQ from balancing if requested */
795 if (new->flags & IRQF_NOBALANCING) 1006 if (new->flags & IRQF_NOBALANCING) {
796 desc->status |= IRQ_NO_BALANCING; 1007 irq_settings_set_no_balancing(desc);
1008 irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
1009 }
797 1010
798 /* Set default affinity mask once everything is setup */ 1011 /* Set default affinity mask once everything is setup */
799 setup_affinity(irq, desc); 1012 setup_affinity(irq, desc, mask);
800 1013
801 } else if ((new->flags & IRQF_TRIGGER_MASK) 1014 } else if (new->flags & IRQF_TRIGGER_MASK) {
802 && (new->flags & IRQF_TRIGGER_MASK) 1015 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
803 != (desc->status & IRQ_TYPE_SENSE_MASK)) { 1016 unsigned int omsk = irq_settings_get_trigger_mask(desc);
804 /* hope the handler works with the actual trigger mode... */ 1017
805 pr_warning("IRQ %d uses trigger mode %d; requested %d\n", 1018 if (nmsk != omsk)
806 irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), 1019 /* hope the handler works with current trigger mode */
807 (int)(new->flags & IRQF_TRIGGER_MASK)); 1020 pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
1021 irq, nmsk, omsk);
808 } 1022 }
809 1023
810 new->irq = irq; 1024 new->irq = irq;
@@ -818,8 +1032,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
818 * Check whether we disabled the irq via the spurious handler 1032 * Check whether we disabled the irq via the spurious handler
819 * before. Reenable it and give it another chance. 1033 * before. Reenable it and give it another chance.
820 */ 1034 */
821 if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { 1035 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
822 desc->status &= ~IRQ_SPURIOUS_DISABLED; 1036 desc->istate &= ~IRQS_SPURIOUS_DISABLED;
823 __enable_irq(desc, irq, false); 1037 __enable_irq(desc, irq, false);
824 } 1038 }
825 1039
@@ -849,6 +1063,9 @@ mismatch:
849#endif 1063#endif
850 ret = -EBUSY; 1064 ret = -EBUSY;
851 1065
1066out_mask:
1067 free_cpumask_var(mask);
1068
852out_thread: 1069out_thread:
853 raw_spin_unlock_irqrestore(&desc->lock, flags); 1070 raw_spin_unlock_irqrestore(&desc->lock, flags);
854 if (new->thread) { 1071 if (new->thread) {
@@ -871,9 +1088,14 @@ out_thread:
871 */ 1088 */
872int setup_irq(unsigned int irq, struct irqaction *act) 1089int setup_irq(unsigned int irq, struct irqaction *act)
873{ 1090{
1091 int retval;
874 struct irq_desc *desc = irq_to_desc(irq); 1092 struct irq_desc *desc = irq_to_desc(irq);
875 1093
876 return __setup_irq(irq, desc, act); 1094 chip_bus_lock(desc);
1095 retval = __setup_irq(irq, desc, act);
1096 chip_bus_sync_unlock(desc);
1097
1098 return retval;
877} 1099}
878EXPORT_SYMBOL_GPL(setup_irq); 1100EXPORT_SYMBOL_GPL(setup_irq);
879 1101
@@ -924,13 +1146,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
924#endif 1146#endif
925 1147
926 /* If this was the last handler, shut down the IRQ line: */ 1148 /* If this was the last handler, shut down the IRQ line: */
927 if (!desc->action) { 1149 if (!desc->action)
928 desc->status |= IRQ_DISABLED; 1150 irq_shutdown(desc);
929 if (desc->irq_data.chip->irq_shutdown)
930 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
931 else
932 desc->irq_data.chip->irq_disable(&desc->irq_data);
933 }
934 1151
935#ifdef CONFIG_SMP 1152#ifdef CONFIG_SMP
936 /* make sure affinity_hint is cleaned up */ 1153 /* make sure affinity_hint is cleaned up */
@@ -1004,6 +1221,11 @@ void free_irq(unsigned int irq, void *dev_id)
1004 if (!desc) 1221 if (!desc)
1005 return; 1222 return;
1006 1223
1224#ifdef CONFIG_SMP
1225 if (WARN_ON(desc->affinity_notify))
1226 desc->affinity_notify = NULL;
1227#endif
1228
1007 chip_bus_lock(desc); 1229 chip_bus_lock(desc);
1008 kfree(__free_irq(irq, dev_id)); 1230 kfree(__free_irq(irq, dev_id));
1009 chip_bus_sync_unlock(desc); 1231 chip_bus_sync_unlock(desc);
@@ -1074,7 +1296,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1074 if (!desc) 1296 if (!desc)
1075 return -EINVAL; 1297 return -EINVAL;
1076 1298
1077 if (desc->status & IRQ_NOREQUEST) 1299 if (!irq_settings_can_request(desc))
1078 return -EINVAL; 1300 return -EINVAL;
1079 1301
1080 if (!handler) { 1302 if (!handler) {
@@ -1149,7 +1371,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1149 if (!desc) 1371 if (!desc)
1150 return -EINVAL; 1372 return -EINVAL;
1151 1373
1152 if (desc->status & IRQ_NESTED_THREAD) { 1374 if (irq_settings_is_nested_thread(desc)) {
1153 ret = request_threaded_irq(irq, NULL, handler, 1375 ret = request_threaded_irq(irq, NULL, handler,
1154 flags, name, dev_id); 1376 flags, name, dev_id);
1155 return !ret ? IRQC_IS_NESTED : ret; 1377 return !ret ? IRQC_IS_NESTED : ret;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 441fd629ff04..ec4806d4778b 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,23 +4,23 @@
4 4
5#include "internals.h" 5#include "internals.h"
6 6
7void move_masked_irq(int irq) 7void irq_move_masked_irq(struct irq_data *idata)
8{ 8{
9 struct irq_desc *desc = irq_to_desc(irq); 9 struct irq_desc *desc = irq_data_to_desc(idata);
10 struct irq_chip *chip = desc->irq_data.chip; 10 struct irq_chip *chip = idata->chip;
11 11
12 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 12 if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
13 return; 13 return;
14 14
15 /* 15 /*
16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. 16 * Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
17 */ 17 */
18 if (CHECK_IRQ_PER_CPU(desc->status)) { 18 if (!irqd_can_balance(&desc->irq_data)) {
19 WARN_ON(1); 19 WARN_ON(1);
20 return; 20 return;
21 } 21 }
22 22
23 desc->status &= ~IRQ_MOVE_PENDING; 23 irqd_clr_move_pending(&desc->irq_data);
24 24
25 if (unlikely(cpumask_empty(desc->pending_mask))) 25 if (unlikely(cpumask_empty(desc->pending_mask)))
26 return; 26 return;
@@ -53,15 +53,20 @@ void move_masked_irq(int irq)
53 cpumask_clear(desc->pending_mask); 53 cpumask_clear(desc->pending_mask);
54} 54}
55 55
56void move_native_irq(int irq) 56void move_masked_irq(int irq)
57{
58 irq_move_masked_irq(irq_get_irq_data(irq));
59}
60
61void irq_move_irq(struct irq_data *idata)
57{ 62{
58 struct irq_desc *desc = irq_to_desc(irq); 63 struct irq_desc *desc = irq_data_to_desc(idata);
59 bool masked; 64 bool masked;
60 65
61 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 66 if (likely(!irqd_is_setaffinity_pending(idata)))
62 return; 67 return;
63 68
64 if (unlikely(desc->status & IRQ_DISABLED)) 69 if (unlikely(desc->istate & IRQS_DISABLED))
65 return; 70 return;
66 71
67 /* 72 /*
@@ -69,10 +74,15 @@ void move_native_irq(int irq)
69 * threaded interrupt with ONESHOT set, we can end up with an 74 * threaded interrupt with ONESHOT set, we can end up with an
70 * interrupt storm. 75 * interrupt storm.
71 */ 76 */
72 masked = desc->status & IRQ_MASKED; 77 masked = desc->istate & IRQS_MASKED;
73 if (!masked) 78 if (!masked)
74 desc->irq_data.chip->irq_mask(&desc->irq_data); 79 idata->chip->irq_mask(idata);
75 move_masked_irq(irq); 80 irq_move_masked_irq(idata);
76 if (!masked) 81 if (!masked)
77 desc->irq_data.chip->irq_unmask(&desc->irq_data); 82 idata->chip->irq_unmask(idata);
83}
84
85void move_native_irq(int irq)
86{
87 irq_move_irq(irq_get_irq_data(irq));
78} 88}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 0d4005d85b03..f76fc00c9877 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -18,7 +18,7 @@
18 * During system-wide suspend or hibernation device drivers need to be prevented 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * from receiving interrupts and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It marks all interrupt lines in use, except for the timer ones, as disabled 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * and sets the IRQ_SUSPENDED flag for each of them. 21 * and sets the IRQS_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
@@ -34,7 +34,7 @@ void suspend_device_irqs(void)
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
37 if (desc->status & IRQ_SUSPENDED) 37 if (desc->istate & IRQS_SUSPENDED)
38 synchronize_irq(irq); 38 synchronize_irq(irq);
39} 39}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 40EXPORT_SYMBOL_GPL(suspend_device_irqs);
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs);
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() 43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 * 44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that 45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQ_SUSPENDED flag set. 46 * have the IRQS_SUSPENDED flag set.
47 */ 47 */
48void resume_device_irqs(void) 48void resume_device_irqs(void)
49{ 49{
@@ -53,9 +53,6 @@ void resume_device_irqs(void)
53 for_each_irq_desc(irq, desc) { 53 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 54 unsigned long flags;
55 55
56 if (!(desc->status & IRQ_SUSPENDED))
57 continue;
58
59 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
61 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -71,9 +68,24 @@ int check_wakeup_irqs(void)
71 struct irq_desc *desc; 68 struct irq_desc *desc;
72 int irq; 69 int irq;
73 70
74 for_each_irq_desc(irq, desc) 71 for_each_irq_desc(irq, desc) {
75 if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) 72 if (irqd_is_wakeup_set(&desc->irq_data)) {
76 return -EBUSY; 73 if (desc->istate & IRQS_PENDING)
74 return -EBUSY;
75 continue;
76 }
77 /*
78 * Check the non wakeup interrupts whether they need
79 * to be masked before finally going into suspend
80 * state. That's for hardware which has no wakeup
81 * source configuration facility. The chip
82 * implementation indicates that with
83 * IRQCHIP_MASK_ON_SUSPEND.
84 */
85 if (desc->istate & IRQS_SUSPENDED &&
86 irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
87 mask_irq(desc);
88 }
77 89
78 return 0; 90 return 0;
79} 91}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6c8a2a9f8a7b..4cc2e5ed0bec 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -11,6 +11,7 @@
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
14 15
15#include "internals.h" 16#include "internals.h"
16 17
@@ -24,7 +25,7 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
24 const struct cpumask *mask = desc->irq_data.affinity; 25 const struct cpumask *mask = desc->irq_data.affinity;
25 26
26#ifdef CONFIG_GENERIC_PENDING_IRQ 27#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 28 if (irqd_is_setaffinity_pending(&desc->irq_data))
28 mask = desc->pending_mask; 29 mask = desc->pending_mask;
29#endif 30#endif
30 seq_cpumask(m, mask); 31 seq_cpumask(m, mask);
@@ -65,8 +66,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
65 cpumask_var_t new_value; 66 cpumask_var_t new_value;
66 int err; 67 int err;
67 68
68 if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || 69 if (!irq_can_set_affinity(irq) || no_irq_affinity)
69 irq_balancing_disabled(irq))
70 return -EIO; 70 return -EIO;
71 71
72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 72 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
@@ -89,7 +89,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
89 if (!cpumask_intersects(new_value, cpu_online_mask)) { 89 if (!cpumask_intersects(new_value, cpu_online_mask)) {
90 /* Special case for empty set - allow the architecture 90 /* Special case for empty set - allow the architecture
91 code to set default SMP affinity. */ 91 code to set default SMP affinity. */
92 err = irq_select_affinity_usr(irq) ? -EINVAL : count; 92 err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
93 } else { 93 } else {
94 irq_set_affinity(irq, new_value); 94 irq_set_affinity(irq, new_value);
95 err = count; 95 err = count;
@@ -357,3 +357,65 @@ void init_irq_proc(void)
357 } 357 }
358} 358}
359 359
360#ifdef CONFIG_GENERIC_IRQ_SHOW
361
362int __weak arch_show_interrupts(struct seq_file *p, int prec)
363{
364 return 0;
365}
366
367int show_interrupts(struct seq_file *p, void *v)
368{
369 static int prec;
370
371 unsigned long flags, any_count = 0;
372 int i = *(loff_t *) v, j;
373 struct irqaction *action;
374 struct irq_desc *desc;
375
376 if (i > nr_irqs)
377 return 0;
378
379 if (i == nr_irqs)
380 return arch_show_interrupts(p, prec);
381
382 /* print header and calculate the width of the first column */
383 if (i == 0) {
384 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
385 j *= 10;
386
387 seq_printf(p, "%*s", prec + 8, "");
388 for_each_online_cpu(j)
389 seq_printf(p, "CPU%-8d", j);
390 seq_putc(p, '\n');
391 }
392
393 desc = irq_to_desc(i);
394 if (!desc)
395 return 0;
396
397 raw_spin_lock_irqsave(&desc->lock, flags);
398 for_each_online_cpu(j)
399 any_count |= kstat_irqs_cpu(i, j);
400 action = desc->action;
401 if (!action && !any_count)
402 goto out;
403
404 seq_printf(p, "%*d: ", prec, i);
405 for_each_online_cpu(j)
406 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
407 seq_printf(p, " %8s", desc->irq_data.chip->name);
408 seq_printf(p, "-%-8s", desc->name);
409
410 if (action) {
411 seq_printf(p, " %s", action->name);
412 while ((action = action->next) != NULL)
413 seq_printf(p, ", %s", action->name);
414 }
415
416 seq_putc(p, '\n');
417out:
418 raw_spin_unlock_irqrestore(&desc->lock, flags);
419 return 0;
420}
421#endif
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index dc49358b73fa..ad683a99b1ec 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -55,20 +55,19 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
55 */ 55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq) 56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{ 57{
58 unsigned int status = desc->status;
59
60 /*
61 * Make sure the interrupt is enabled, before resending it:
62 */
63 desc->irq_data.chip->irq_enable(&desc->irq_data);
64
65 /* 58 /*
66 * We do not resend level type interrupts. Level type 59 * We do not resend level type interrupts. Level type
67 * interrupts are resent by hardware when they are still 60 * interrupts are resent by hardware when they are still
68 * active. 61 * active.
69 */ 62 */
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 63 if (irq_settings_is_level(desc))
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 64 return;
65 if (desc->istate & IRQS_REPLAY)
66 return;
67 if (desc->istate & IRQS_PENDING) {
68 irq_compat_clr_pending(desc);
69 desc->istate &= ~IRQS_PENDING;
70 desc->istate |= IRQS_REPLAY;
72 71
73 if (!desc->irq_data.chip->irq_retrigger || 72 if (!desc->irq_data.chip->irq_retrigger ||
74 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 73 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
new file mode 100644
index 000000000000..0227ad358272
--- /dev/null
+++ b/kernel/irq/settings.h
@@ -0,0 +1,138 @@
1/*
2 * Internal header to deal with irq_desc->status which will be renamed
3 * to irq_desc->settings.
4 */
5enum {
6 _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS,
7 _IRQ_PER_CPU = IRQ_PER_CPU,
8 _IRQ_LEVEL = IRQ_LEVEL,
9 _IRQ_NOPROBE = IRQ_NOPROBE,
10 _IRQ_NOREQUEST = IRQ_NOREQUEST,
11 _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
12 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
13 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
14 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
15 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
16};
17
18#define IRQ_INPROGRESS GOT_YOU_MORON
19#define IRQ_REPLAY GOT_YOU_MORON
20#define IRQ_WAITING GOT_YOU_MORON
21#define IRQ_DISABLED GOT_YOU_MORON
22#define IRQ_PENDING GOT_YOU_MORON
23#define IRQ_MASKED GOT_YOU_MORON
24#define IRQ_WAKEUP GOT_YOU_MORON
25#define IRQ_MOVE_PENDING GOT_YOU_MORON
26#define IRQ_PER_CPU GOT_YOU_MORON
27#define IRQ_NO_BALANCING GOT_YOU_MORON
28#define IRQ_AFFINITY_SET GOT_YOU_MORON
29#define IRQ_LEVEL GOT_YOU_MORON
30#define IRQ_NOPROBE GOT_YOU_MORON
31#define IRQ_NOREQUEST GOT_YOU_MORON
32#define IRQ_NOAUTOEN GOT_YOU_MORON
33#define IRQ_NESTED_THREAD GOT_YOU_MORON
34#undef IRQF_MODIFY_MASK
35#define IRQF_MODIFY_MASK GOT_YOU_MORON
36
37static inline void
38irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
39{
40 desc->status &= ~(clr & _IRQF_MODIFY_MASK);
41 desc->status |= (set & _IRQF_MODIFY_MASK);
42}
43
44static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
45{
46 return desc->status & _IRQ_PER_CPU;
47}
48
49static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
50{
51 desc->status |= _IRQ_PER_CPU;
52}
53
54static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
55{
56 desc->status |= _IRQ_NO_BALANCING;
57}
58
59static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
60{
61 return desc->status & _IRQ_NO_BALANCING;
62}
63
64static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
65{
66 return desc->status & IRQ_TYPE_SENSE_MASK;
67}
68
69static inline void
70irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
71{
72 desc->status &= ~IRQ_TYPE_SENSE_MASK;
73 desc->status |= mask & IRQ_TYPE_SENSE_MASK;
74}
75
76static inline bool irq_settings_is_level(struct irq_desc *desc)
77{
78 return desc->status & _IRQ_LEVEL;
79}
80
81static inline void irq_settings_clr_level(struct irq_desc *desc)
82{
83 desc->status &= ~_IRQ_LEVEL;
84}
85
86static inline void irq_settings_set_level(struct irq_desc *desc)
87{
88 desc->status |= _IRQ_LEVEL;
89}
90
91static inline bool irq_settings_can_request(struct irq_desc *desc)
92{
93 return !(desc->status & _IRQ_NOREQUEST);
94}
95
96static inline void irq_settings_clr_norequest(struct irq_desc *desc)
97{
98 desc->status &= ~_IRQ_NOREQUEST;
99}
100
101static inline void irq_settings_set_norequest(struct irq_desc *desc)
102{
103 desc->status |= _IRQ_NOREQUEST;
104}
105
106static inline bool irq_settings_can_probe(struct irq_desc *desc)
107{
108 return !(desc->status & _IRQ_NOPROBE);
109}
110
111static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
112{
113 desc->status &= ~_IRQ_NOPROBE;
114}
115
116static inline void irq_settings_set_noprobe(struct irq_desc *desc)
117{
118 desc->status |= _IRQ_NOPROBE;
119}
120
121static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
122{
123 return desc->status & _IRQ_MOVE_PCNTXT;
124}
125
126static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
127{
128 return !(desc->status & _IRQ_NOAUTOEN);
129}
130
131static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
132{
133 return desc->status & _IRQ_NESTED_THREAD;
134}
135
136/* Nothing should touch desc->status from now on */
137#undef status
138#define status USE_THE_PROPER_WRAPPERS_YOU_MORON
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 3089d3b9d5f3..dd586ebf9c8c 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -21,70 +21,94 @@ static int irqfixup __read_mostly;
21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) 21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
22static void poll_spurious_irqs(unsigned long dummy); 22static void poll_spurious_irqs(unsigned long dummy);
23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); 23static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
24static int irq_poll_cpu;
25static atomic_t irq_poll_active;
26
27/*
28 * We wait here for a poller to finish.
29 *
30 * If the poll runs on this CPU, then we yell loudly and return
31 * false. That will leave the interrupt line disabled in the worst
32 * case, but it should never happen.
33 *
34 * We wait until the poller is done and then recheck disabled and
35 * action (about to be disabled). Only if it's still active, we return
36 * true and let the handler run.
37 */
38bool irq_wait_for_poll(struct irq_desc *desc)
39{
40 if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
41 "irq poll in progress on cpu %d for irq %d\n",
42 smp_processor_id(), desc->irq_data.irq))
43 return false;
44
45#ifdef CONFIG_SMP
46 do {
47 raw_spin_unlock(&desc->lock);
48 while (desc->istate & IRQS_INPROGRESS)
49 cpu_relax();
50 raw_spin_lock(&desc->lock);
51 } while (desc->istate & IRQS_INPROGRESS);
52 /* Might have been disabled in meantime */
53 return !(desc->istate & IRQS_DISABLED) && desc->action;
54#else
55 return false;
56#endif
57}
58
24 59
25/* 60/*
26 * Recovery handler for misrouted interrupts. 61 * Recovery handler for misrouted interrupts.
27 */ 62 */
28static int try_one_irq(int irq, struct irq_desc *desc) 63static int try_one_irq(int irq, struct irq_desc *desc, bool force)
29{ 64{
65 irqreturn_t ret = IRQ_NONE;
30 struct irqaction *action; 66 struct irqaction *action;
31 int ok = 0, work = 0;
32 67
33 raw_spin_lock(&desc->lock); 68 raw_spin_lock(&desc->lock);
34 /* Already running on another processor */
35 if (desc->status & IRQ_INPROGRESS) {
36 /*
37 * Already running: If it is shared get the other
38 * CPU to go looking for our mystery interrupt too
39 */
40 if (desc->action && (desc->action->flags & IRQF_SHARED))
41 desc->status |= IRQ_PENDING;
42 raw_spin_unlock(&desc->lock);
43 return ok;
44 }
45 /* Honour the normal IRQ locking */
46 desc->status |= IRQ_INPROGRESS;
47 action = desc->action;
48 raw_spin_unlock(&desc->lock);
49 69
50 while (action) { 70 /* PER_CPU and nested thread interrupts are never polled */
51 /* Only shared IRQ handlers are safe to call */ 71 if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc))
52 if (action->flags & IRQF_SHARED) { 72 goto out;
53 if (action->handler(irq, action->dev_id) ==
54 IRQ_HANDLED)
55 ok = 1;
56 }
57 action = action->next;
58 }
59 local_irq_disable();
60 /* Now clean up the flags */
61 raw_spin_lock(&desc->lock);
62 action = desc->action;
63 73
64 /* 74 /*
65 * While we were looking for a fixup someone queued a real 75 * Do not poll disabled interrupts unless the spurious
66 * IRQ clashing with our walk: 76 * disabled poller asks explicitely.
67 */ 77 */
68 while ((desc->status & IRQ_PENDING) && action) { 78 if ((desc->istate & IRQS_DISABLED) && !force)
79 goto out;
80
81 /*
82 * All handlers must agree on IRQF_SHARED, so we test just the
83 * first. Check for action->next as well.
84 */
85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next)
88 goto out;
89
90 /* Already running on another processor */
91 if (desc->istate & IRQS_INPROGRESS) {
69 /* 92 /*
70 * Perform real IRQ processing for the IRQ we deferred 93 * Already running: If it is shared get the other
94 * CPU to go looking for our mystery interrupt too
71 */ 95 */
72 work = 1; 96 irq_compat_set_pending(desc);
73 raw_spin_unlock(&desc->lock); 97 desc->istate |= IRQS_PENDING;
74 handle_IRQ_event(irq, action); 98 goto out;
75 raw_spin_lock(&desc->lock);
76 desc->status &= ~IRQ_PENDING;
77 } 99 }
78 desc->status &= ~IRQ_INPROGRESS;
79 /*
80 * If we did actual work for the real IRQ line we must let the
81 * IRQ controller clean up too
82 */
83 if (work)
84 irq_end(irq, desc);
85 raw_spin_unlock(&desc->lock);
86 100
87 return ok; 101 /* Mark it poll in progress */
102 desc->istate |= IRQS_POLL_INPROGRESS;
103 do {
104 if (handle_irq_event(desc) == IRQ_HANDLED)
105 ret = IRQ_HANDLED;
106 action = desc->action;
107 } while ((desc->istate & IRQS_PENDING) && action);
108 desc->istate &= ~IRQS_POLL_INPROGRESS;
109out:
110 raw_spin_unlock(&desc->lock);
111 return ret == IRQ_HANDLED;
88} 112}
89 113
90static int misrouted_irq(int irq) 114static int misrouted_irq(int irq)
@@ -92,6 +116,11 @@ static int misrouted_irq(int irq)
92 struct irq_desc *desc; 116 struct irq_desc *desc;
93 int i, ok = 0; 117 int i, ok = 0;
94 118
119 if (atomic_inc_return(&irq_poll_active) == 1)
120 goto out;
121
122 irq_poll_cpu = smp_processor_id();
123
95 for_each_irq_desc(i, desc) { 124 for_each_irq_desc(i, desc) {
96 if (!i) 125 if (!i)
97 continue; 126 continue;
@@ -99,9 +128,11 @@ static int misrouted_irq(int irq)
99 if (i == irq) /* Already tried */ 128 if (i == irq) /* Already tried */
100 continue; 129 continue;
101 130
102 if (try_one_irq(i, desc)) 131 if (try_one_irq(i, desc, false))
103 ok = 1; 132 ok = 1;
104 } 133 }
134out:
135 atomic_dec(&irq_poll_active);
105 /* So the caller can adjust the irq error counts */ 136 /* So the caller can adjust the irq error counts */
106 return ok; 137 return ok;
107} 138}
@@ -111,23 +142,28 @@ static void poll_spurious_irqs(unsigned long dummy)
111 struct irq_desc *desc; 142 struct irq_desc *desc;
112 int i; 143 int i;
113 144
145 if (atomic_inc_return(&irq_poll_active) != 1)
146 goto out;
147 irq_poll_cpu = smp_processor_id();
148
114 for_each_irq_desc(i, desc) { 149 for_each_irq_desc(i, desc) {
115 unsigned int status; 150 unsigned int state;
116 151
117 if (!i) 152 if (!i)
118 continue; 153 continue;
119 154
120 /* Racy but it doesn't matter */ 155 /* Racy but it doesn't matter */
121 status = desc->status; 156 state = desc->istate;
122 barrier(); 157 barrier();
123 if (!(status & IRQ_SPURIOUS_DISABLED)) 158 if (!(state & IRQS_SPURIOUS_DISABLED))
124 continue; 159 continue;
125 160
126 local_irq_disable(); 161 local_irq_disable();
127 try_one_irq(i, desc); 162 try_one_irq(i, desc, true);
128 local_irq_enable(); 163 local_irq_enable();
129 } 164 }
130 165out:
166 atomic_dec(&irq_poll_active);
131 mod_timer(&poll_spurious_irq_timer, 167 mod_timer(&poll_spurious_irq_timer,
132 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 168 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
133} 169}
@@ -139,15 +175,13 @@ static void poll_spurious_irqs(unsigned long dummy)
139 * 175 *
140 * (The other 100-of-100,000 interrupts may have been a correctly 176 * (The other 100-of-100,000 interrupts may have been a correctly
141 * functioning device sharing an IRQ with the failing one) 177 * functioning device sharing an IRQ with the failing one)
142 *
143 * Called under desc->lock
144 */ 178 */
145
146static void 179static void
147__report_bad_irq(unsigned int irq, struct irq_desc *desc, 180__report_bad_irq(unsigned int irq, struct irq_desc *desc,
148 irqreturn_t action_ret) 181 irqreturn_t action_ret)
149{ 182{
150 struct irqaction *action; 183 struct irqaction *action;
184 unsigned long flags;
151 185
152 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { 186 if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
153 printk(KERN_ERR "irq event %d: bogus return value %x\n", 187 printk(KERN_ERR "irq event %d: bogus return value %x\n",
@@ -159,6 +193,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
159 dump_stack(); 193 dump_stack();
160 printk(KERN_ERR "handlers:\n"); 194 printk(KERN_ERR "handlers:\n");
161 195
196 /*
197 * We need to take desc->lock here. note_interrupt() is called
198 * w/o desc->lock held, but IRQ_PROGRESS set. We might race
199 * with something else removing an action. It's ok to take
200 * desc->lock here. See synchronize_irq().
201 */
202 raw_spin_lock_irqsave(&desc->lock, flags);
162 action = desc->action; 203 action = desc->action;
163 while (action) { 204 while (action) {
164 printk(KERN_ERR "[<%p>]", action->handler); 205 printk(KERN_ERR "[<%p>]", action->handler);
@@ -167,6 +208,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
167 printk("\n"); 208 printk("\n");
168 action = action->next; 209 action = action->next;
169 } 210 }
211 raw_spin_unlock_irqrestore(&desc->lock, flags);
170} 212}
171 213
172static void 214static void
@@ -218,6 +260,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
218void note_interrupt(unsigned int irq, struct irq_desc *desc, 260void note_interrupt(unsigned int irq, struct irq_desc *desc,
219 irqreturn_t action_ret) 261 irqreturn_t action_ret)
220{ 262{
263 if (desc->istate & IRQS_POLL_INPROGRESS)
264 return;
265
221 if (unlikely(action_ret != IRQ_HANDLED)) { 266 if (unlikely(action_ret != IRQ_HANDLED)) {
222 /* 267 /*
223 * If we are seeing only the odd spurious IRQ caused by 268 * If we are seeing only the odd spurious IRQ caused by
@@ -254,9 +299,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
254 * Now kill the IRQ 299 * Now kill the IRQ
255 */ 300 */
256 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 301 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
257 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 302 desc->istate |= IRQS_SPURIOUS_DISABLED;
258 desc->depth++; 303 desc->depth++;
259 desc->irq_data.chip->irq_disable(&desc->irq_data); 304 irq_disable(desc);
260 305
261 mod_timer(&poll_spurious_irq_timer, 306 mod_timer(&poll_spurious_irq_timer,
262 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 307 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 656222fcf767..ed253aa24ba4 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,13 +38,96 @@
38 38
39#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
40 40
41struct remote_function_call {
42 struct task_struct *p;
43 int (*func)(void *info);
44 void *info;
45 int ret;
46};
47
48static void remote_function(void *data)
49{
50 struct remote_function_call *tfc = data;
51 struct task_struct *p = tfc->p;
52
53 if (p) {
54 tfc->ret = -EAGAIN;
55 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
56 return;
57 }
58
59 tfc->ret = tfc->func(tfc->info);
60}
61
62/**
63 * task_function_call - call a function on the cpu on which a task runs
64 * @p: the task to evaluate
65 * @func: the function to be called
66 * @info: the function call argument
67 *
68 * Calls the function @func when the task is currently running. This might
69 * be on the current CPU, which just calls the function directly
70 *
71 * returns: @func return value, or
72 * -ESRCH - when the process isn't running
73 * -EAGAIN - when the process moved away
74 */
75static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{
78 struct remote_function_call data = {
79 .p = p,
80 .func = func,
81 .info = info,
82 .ret = -ESRCH, /* No such (running) process */
83 };
84
85 if (task_curr(p))
86 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
87
88 return data.ret;
89}
90
91/**
92 * cpu_function_call - call a function on the cpu
93 * @func: the function to be called
94 * @info: the function call argument
95 *
96 * Calls the function @func on the remote cpu.
97 *
98 * returns: @func return value or -ENXIO when the cpu is offline
99 */
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{
102 struct remote_function_call data = {
103 .p = NULL,
104 .func = func,
105 .info = info,
106 .ret = -ENXIO, /* No such CPU */
107 };
108
109 smp_call_function_single(cpu, remote_function, &data, 1);
110
111 return data.ret;
112}
113
114#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
115 PERF_FLAG_FD_OUTPUT |\
116 PERF_FLAG_PID_CGROUP)
117
41enum event_type_t { 118enum event_type_t {
42 EVENT_FLEXIBLE = 0x1, 119 EVENT_FLEXIBLE = 0x1,
43 EVENT_PINNED = 0x2, 120 EVENT_PINNED = 0x2,
44 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 121 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
45}; 122};
46 123
47atomic_t perf_task_events __read_mostly; 124/*
125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */
128atomic_t perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130
48static atomic_t nr_mmap_events __read_mostly; 131static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 132static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 133static atomic_t nr_task_events __read_mostly;
@@ -67,7 +150,24 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
67/* 150/*
68 * max perf event sample rate 151 * max perf event sample rate
69 */ 152 */
70int sysctl_perf_event_sample_rate __read_mostly = 100000; 153#define DEFAULT_MAX_SAMPLE_RATE 100000
154int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
155static int max_samples_per_tick __read_mostly =
156 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
157
158int perf_proc_update_handler(struct ctl_table *table, int write,
159 void __user *buffer, size_t *lenp,
160 loff_t *ppos)
161{
162 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
163
164 if (ret || !write)
165 return ret;
166
167 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
168
169 return 0;
170}
71 171
72static atomic64_t perf_event_id; 172static atomic64_t perf_event_id;
73 173
@@ -75,7 +175,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
75 enum event_type_t event_type); 175 enum event_type_t event_type);
76 176
77static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 177static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
78 enum event_type_t event_type); 178 enum event_type_t event_type,
179 struct task_struct *task);
180
181static void update_context_time(struct perf_event_context *ctx);
182static u64 perf_event_time(struct perf_event *event);
79 183
80void __weak perf_event_print_debug(void) { } 184void __weak perf_event_print_debug(void) { }
81 185
@@ -89,6 +193,360 @@ static inline u64 perf_clock(void)
89 return local_clock(); 193 return local_clock();
90} 194}
91 195
196static inline struct perf_cpu_context *
197__get_cpu_context(struct perf_event_context *ctx)
198{
199 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
200}
201
202#ifdef CONFIG_CGROUP_PERF
203
204/*
205 * Must ensure cgroup is pinned (css_get) before calling
206 * this function. In other words, we cannot call this function
207 * if there is no cgroup event for the current CPU context.
208 */
209static inline struct perf_cgroup *
210perf_cgroup_from_task(struct task_struct *task)
211{
212 return container_of(task_subsys_state(task, perf_subsys_id),
213 struct perf_cgroup, css);
214}
215
216static inline bool
217perf_cgroup_match(struct perf_event *event)
218{
219 struct perf_event_context *ctx = event->ctx;
220 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
221
222 return !event->cgrp || event->cgrp == cpuctx->cgrp;
223}
224
225static inline void perf_get_cgroup(struct perf_event *event)
226{
227 css_get(&event->cgrp->css);
228}
229
230static inline void perf_put_cgroup(struct perf_event *event)
231{
232 css_put(&event->cgrp->css);
233}
234
235static inline void perf_detach_cgroup(struct perf_event *event)
236{
237 perf_put_cgroup(event);
238 event->cgrp = NULL;
239}
240
241static inline int is_cgroup_event(struct perf_event *event)
242{
243 return event->cgrp != NULL;
244}
245
246static inline u64 perf_cgroup_event_time(struct perf_event *event)
247{
248 struct perf_cgroup_info *t;
249
250 t = per_cpu_ptr(event->cgrp->info, event->cpu);
251 return t->time;
252}
253
254static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
255{
256 struct perf_cgroup_info *info;
257 u64 now;
258
259 now = perf_clock();
260
261 info = this_cpu_ptr(cgrp->info);
262
263 info->time += now - info->timestamp;
264 info->timestamp = now;
265}
266
267static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
268{
269 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
270 if (cgrp_out)
271 __update_cgrp_time(cgrp_out);
272}
273
274static inline void update_cgrp_time_from_event(struct perf_event *event)
275{
276 struct perf_cgroup *cgrp;
277
278 /*
279 * ensure we access cgroup data only when needed and
280 * when we know the cgroup is pinned (css_get)
281 */
282 if (!is_cgroup_event(event))
283 return;
284
285 cgrp = perf_cgroup_from_task(current);
286 /*
287 * Do not update time when cgroup is not active
288 */
289 if (cgrp == event->cgrp)
290 __update_cgrp_time(event->cgrp);
291}
292
293static inline void
294perf_cgroup_set_timestamp(struct task_struct *task,
295 struct perf_event_context *ctx)
296{
297 struct perf_cgroup *cgrp;
298 struct perf_cgroup_info *info;
299
300 /*
301 * ctx->lock held by caller
302 * ensure we do not access cgroup data
303 * unless we have the cgroup pinned (css_get)
304 */
305 if (!task || !ctx->nr_cgroups)
306 return;
307
308 cgrp = perf_cgroup_from_task(task);
309 info = this_cpu_ptr(cgrp->info);
310 info->timestamp = ctx->timestamp;
311}
312
313#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
314#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
315
316/*
317 * reschedule events based on the cgroup constraint of task.
318 *
319 * mode SWOUT : schedule out everything
320 * mode SWIN : schedule in based on cgroup for next
321 */
322void perf_cgroup_switch(struct task_struct *task, int mode)
323{
324 struct perf_cpu_context *cpuctx;
325 struct pmu *pmu;
326 unsigned long flags;
327
328 /*
329 * disable interrupts to avoid geting nr_cgroup
330 * changes via __perf_event_disable(). Also
331 * avoids preemption.
332 */
333 local_irq_save(flags);
334
335 /*
336 * we reschedule only in the presence of cgroup
337 * constrained events.
338 */
339 rcu_read_lock();
340
341 list_for_each_entry_rcu(pmu, &pmus, entry) {
342
343 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
344
345 perf_pmu_disable(cpuctx->ctx.pmu);
346
347 /*
348 * perf_cgroup_events says at least one
349 * context on this CPU has cgroup events.
350 *
351 * ctx->nr_cgroups reports the number of cgroup
352 * events for a context.
353 */
354 if (cpuctx->ctx.nr_cgroups > 0) {
355
356 if (mode & PERF_CGROUP_SWOUT) {
357 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
358 /*
359 * must not be done before ctxswout due
360 * to event_filter_match() in event_sched_out()
361 */
362 cpuctx->cgrp = NULL;
363 }
364
365 if (mode & PERF_CGROUP_SWIN) {
366 /* set cgrp before ctxsw in to
367 * allow event_filter_match() to not
368 * have to pass task around
369 */
370 cpuctx->cgrp = perf_cgroup_from_task(task);
371 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
372 }
373 }
374
375 perf_pmu_enable(cpuctx->ctx.pmu);
376 }
377
378 rcu_read_unlock();
379
380 local_irq_restore(flags);
381}
382
383static inline void perf_cgroup_sched_out(struct task_struct *task)
384{
385 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
386}
387
388static inline void perf_cgroup_sched_in(struct task_struct *task)
389{
390 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
391}
392
393static inline int perf_cgroup_connect(int fd, struct perf_event *event,
394 struct perf_event_attr *attr,
395 struct perf_event *group_leader)
396{
397 struct perf_cgroup *cgrp;
398 struct cgroup_subsys_state *css;
399 struct file *file;
400 int ret = 0, fput_needed;
401
402 file = fget_light(fd, &fput_needed);
403 if (!file)
404 return -EBADF;
405
406 css = cgroup_css_from_dir(file, perf_subsys_id);
407 if (IS_ERR(css)) {
408 ret = PTR_ERR(css);
409 goto out;
410 }
411
412 cgrp = container_of(css, struct perf_cgroup, css);
413 event->cgrp = cgrp;
414
415 /* must be done before we fput() the file */
416 perf_get_cgroup(event);
417
418 /*
419 * all events in a group must monitor
420 * the same cgroup because a task belongs
421 * to only one perf cgroup at a time
422 */
423 if (group_leader && group_leader->cgrp != cgrp) {
424 perf_detach_cgroup(event);
425 ret = -EINVAL;
426 }
427out:
428 fput_light(file, fput_needed);
429 return ret;
430}
431
432static inline void
433perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
434{
435 struct perf_cgroup_info *t;
436 t = per_cpu_ptr(event->cgrp->info, event->cpu);
437 event->shadow_ctx_time = now - t->timestamp;
438}
439
440static inline void
441perf_cgroup_defer_enabled(struct perf_event *event)
442{
443 /*
444 * when the current task's perf cgroup does not match
445 * the event's, we need to remember to call the
446 * perf_mark_enable() function the first time a task with
447 * a matching perf cgroup is scheduled in.
448 */
449 if (is_cgroup_event(event) && !perf_cgroup_match(event))
450 event->cgrp_defer_enabled = 1;
451}
452
453static inline void
454perf_cgroup_mark_enabled(struct perf_event *event,
455 struct perf_event_context *ctx)
456{
457 struct perf_event *sub;
458 u64 tstamp = perf_event_time(event);
459
460 if (!event->cgrp_defer_enabled)
461 return;
462
463 event->cgrp_defer_enabled = 0;
464
465 event->tstamp_enabled = tstamp - event->total_time_enabled;
466 list_for_each_entry(sub, &event->sibling_list, group_entry) {
467 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
468 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
469 sub->cgrp_defer_enabled = 0;
470 }
471 }
472}
473#else /* !CONFIG_CGROUP_PERF */
474
475static inline bool
476perf_cgroup_match(struct perf_event *event)
477{
478 return true;
479}
480
481static inline void perf_detach_cgroup(struct perf_event *event)
482{}
483
484static inline int is_cgroup_event(struct perf_event *event)
485{
486 return 0;
487}
488
489static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
490{
491 return 0;
492}
493
494static inline void update_cgrp_time_from_event(struct perf_event *event)
495{
496}
497
498static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
499{
500}
501
502static inline void perf_cgroup_sched_out(struct task_struct *task)
503{
504}
505
506static inline void perf_cgroup_sched_in(struct task_struct *task)
507{
508}
509
510static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
511 struct perf_event_attr *attr,
512 struct perf_event *group_leader)
513{
514 return -EINVAL;
515}
516
517static inline void
518perf_cgroup_set_timestamp(struct task_struct *task,
519 struct perf_event_context *ctx)
520{
521}
522
523void
524perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
525{
526}
527
528static inline void
529perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
530{
531}
532
533static inline u64 perf_cgroup_event_time(struct perf_event *event)
534{
535 return 0;
536}
537
538static inline void
539perf_cgroup_defer_enabled(struct perf_event *event)
540{
541}
542
543static inline void
544perf_cgroup_mark_enabled(struct perf_event *event,
545 struct perf_event_context *ctx)
546{
547}
548#endif
549
92void perf_pmu_disable(struct pmu *pmu) 550void perf_pmu_disable(struct pmu *pmu)
93{ 551{
94 int *count = this_cpu_ptr(pmu->pmu_disable_count); 552 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -254,7 +712,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
254 raw_spin_lock_irqsave(&ctx->lock, flags); 712 raw_spin_lock_irqsave(&ctx->lock, flags);
255 --ctx->pin_count; 713 --ctx->pin_count;
256 raw_spin_unlock_irqrestore(&ctx->lock, flags); 714 raw_spin_unlock_irqrestore(&ctx->lock, flags);
257 put_ctx(ctx);
258} 715}
259 716
260/* 717/*
@@ -271,6 +728,10 @@ static void update_context_time(struct perf_event_context *ctx)
271static u64 perf_event_time(struct perf_event *event) 728static u64 perf_event_time(struct perf_event *event)
272{ 729{
273 struct perf_event_context *ctx = event->ctx; 730 struct perf_event_context *ctx = event->ctx;
731
732 if (is_cgroup_event(event))
733 return perf_cgroup_event_time(event);
734
274 return ctx ? ctx->time : 0; 735 return ctx ? ctx->time : 0;
275} 736}
276 737
@@ -285,9 +746,20 @@ static void update_event_times(struct perf_event *event)
285 if (event->state < PERF_EVENT_STATE_INACTIVE || 746 if (event->state < PERF_EVENT_STATE_INACTIVE ||
286 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 747 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
287 return; 748 return;
288 749 /*
289 if (ctx->is_active) 750 * in cgroup mode, time_enabled represents
751 * the time the event was enabled AND active
752 * tasks were in the monitored cgroup. This is
753 * independent of the activity of the context as
754 * there may be a mix of cgroup and non-cgroup events.
755 *
756 * That is why we treat cgroup events differently
757 * here.
758 */
759 if (is_cgroup_event(event))
290 run_end = perf_event_time(event); 760 run_end = perf_event_time(event);
761 else if (ctx->is_active)
762 run_end = ctx->time;
291 else 763 else
292 run_end = event->tstamp_stopped; 764 run_end = event->tstamp_stopped;
293 765
@@ -299,6 +771,7 @@ static void update_event_times(struct perf_event *event)
299 run_end = perf_event_time(event); 771 run_end = perf_event_time(event);
300 772
301 event->total_time_running = run_end - event->tstamp_running; 773 event->total_time_running = run_end - event->tstamp_running;
774
302} 775}
303 776
304/* 777/*
@@ -347,6 +820,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
347 list_add_tail(&event->group_entry, list); 820 list_add_tail(&event->group_entry, list);
348 } 821 }
349 822
823 if (is_cgroup_event(event))
824 ctx->nr_cgroups++;
825
350 list_add_rcu(&event->event_entry, &ctx->event_list); 826 list_add_rcu(&event->event_entry, &ctx->event_list);
351 if (!ctx->nr_events) 827 if (!ctx->nr_events)
352 perf_pmu_rotate_start(ctx->pmu); 828 perf_pmu_rotate_start(ctx->pmu);
@@ -473,6 +949,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
473 949
474 event->attach_state &= ~PERF_ATTACH_CONTEXT; 950 event->attach_state &= ~PERF_ATTACH_CONTEXT;
475 951
952 if (is_cgroup_event(event))
953 ctx->nr_cgroups--;
954
476 ctx->nr_events--; 955 ctx->nr_events--;
477 if (event->attr.inherit_stat) 956 if (event->attr.inherit_stat)
478 ctx->nr_stat--; 957 ctx->nr_stat--;
@@ -544,7 +1023,8 @@ out:
544static inline int 1023static inline int
545event_filter_match(struct perf_event *event) 1024event_filter_match(struct perf_event *event)
546{ 1025{
547 return event->cpu == -1 || event->cpu == smp_processor_id(); 1026 return (event->cpu == -1 || event->cpu == smp_processor_id())
1027 && perf_cgroup_match(event);
548} 1028}
549 1029
550static void 1030static void
@@ -562,7 +1042,7 @@ event_sched_out(struct perf_event *event,
562 */ 1042 */
563 if (event->state == PERF_EVENT_STATE_INACTIVE 1043 if (event->state == PERF_EVENT_STATE_INACTIVE
564 && !event_filter_match(event)) { 1044 && !event_filter_match(event)) {
565 delta = ctx->time - event->tstamp_stopped; 1045 delta = tstamp - event->tstamp_stopped;
566 event->tstamp_running += delta; 1046 event->tstamp_running += delta;
567 event->tstamp_stopped = tstamp; 1047 event->tstamp_stopped = tstamp;
568 } 1048 }
@@ -606,47 +1086,30 @@ group_sched_out(struct perf_event *group_event,
606 cpuctx->exclusive = 0; 1086 cpuctx->exclusive = 0;
607} 1087}
608 1088
609static inline struct perf_cpu_context *
610__get_cpu_context(struct perf_event_context *ctx)
611{
612 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
613}
614
615/* 1089/*
616 * Cross CPU call to remove a performance event 1090 * Cross CPU call to remove a performance event
617 * 1091 *
618 * We disable the event on the hardware level first. After that we 1092 * We disable the event on the hardware level first. After that we
619 * remove it from the context list. 1093 * remove it from the context list.
620 */ 1094 */
621static void __perf_event_remove_from_context(void *info) 1095static int __perf_remove_from_context(void *info)
622{ 1096{
623 struct perf_event *event = info; 1097 struct perf_event *event = info;
624 struct perf_event_context *ctx = event->ctx; 1098 struct perf_event_context *ctx = event->ctx;
625 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1099 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
626 1100
627 /*
628 * If this is a task context, we need to check whether it is
629 * the current task context of this cpu. If not it has been
630 * scheduled out before the smp call arrived.
631 */
632 if (ctx->task && cpuctx->task_ctx != ctx)
633 return;
634
635 raw_spin_lock(&ctx->lock); 1101 raw_spin_lock(&ctx->lock);
636
637 event_sched_out(event, cpuctx, ctx); 1102 event_sched_out(event, cpuctx, ctx);
638
639 list_del_event(event, ctx); 1103 list_del_event(event, ctx);
640
641 raw_spin_unlock(&ctx->lock); 1104 raw_spin_unlock(&ctx->lock);
1105
1106 return 0;
642} 1107}
643 1108
644 1109
645/* 1110/*
646 * Remove the event from a task's (or a CPU's) list of events. 1111 * Remove the event from a task's (or a CPU's) list of events.
647 * 1112 *
648 * Must be called with ctx->mutex held.
649 *
650 * CPU events are removed with a smp call. For task events we only 1113 * CPU events are removed with a smp call. For task events we only
651 * call when the task is on a CPU. 1114 * call when the task is on a CPU.
652 * 1115 *
@@ -657,49 +1120,48 @@ static void __perf_event_remove_from_context(void *info)
657 * When called from perf_event_exit_task, it's OK because the 1120 * When called from perf_event_exit_task, it's OK because the
658 * context has been detached from its task. 1121 * context has been detached from its task.
659 */ 1122 */
660static void perf_event_remove_from_context(struct perf_event *event) 1123static void perf_remove_from_context(struct perf_event *event)
661{ 1124{
662 struct perf_event_context *ctx = event->ctx; 1125 struct perf_event_context *ctx = event->ctx;
663 struct task_struct *task = ctx->task; 1126 struct task_struct *task = ctx->task;
664 1127
1128 lockdep_assert_held(&ctx->mutex);
1129
665 if (!task) { 1130 if (!task) {
666 /* 1131 /*
667 * Per cpu events are removed via an smp call and 1132 * Per cpu events are removed via an smp call and
668 * the removal is always successful. 1133 * the removal is always successful.
669 */ 1134 */
670 smp_call_function_single(event->cpu, 1135 cpu_function_call(event->cpu, __perf_remove_from_context, event);
671 __perf_event_remove_from_context,
672 event, 1);
673 return; 1136 return;
674 } 1137 }
675 1138
676retry: 1139retry:
677 task_oncpu_function_call(task, __perf_event_remove_from_context, 1140 if (!task_function_call(task, __perf_remove_from_context, event))
678 event); 1141 return;
679 1142
680 raw_spin_lock_irq(&ctx->lock); 1143 raw_spin_lock_irq(&ctx->lock);
681 /* 1144 /*
682 * If the context is active we need to retry the smp call. 1145 * If we failed to find a running task, but find the context active now
1146 * that we've acquired the ctx->lock, retry.
683 */ 1147 */
684 if (ctx->nr_active && !list_empty(&event->group_entry)) { 1148 if (ctx->is_active) {
685 raw_spin_unlock_irq(&ctx->lock); 1149 raw_spin_unlock_irq(&ctx->lock);
686 goto retry; 1150 goto retry;
687 } 1151 }
688 1152
689 /* 1153 /*
690 * The lock prevents that this context is scheduled in so we 1154 * Since the task isn't running, its safe to remove the event, us
691 * can remove the event safely, if the call above did not 1155 * holding the ctx->lock ensures the task won't get scheduled in.
692 * succeed.
693 */ 1156 */
694 if (!list_empty(&event->group_entry)) 1157 list_del_event(event, ctx);
695 list_del_event(event, ctx);
696 raw_spin_unlock_irq(&ctx->lock); 1158 raw_spin_unlock_irq(&ctx->lock);
697} 1159}
698 1160
699/* 1161/*
700 * Cross CPU call to disable a performance event 1162 * Cross CPU call to disable a performance event
701 */ 1163 */
702static void __perf_event_disable(void *info) 1164static int __perf_event_disable(void *info)
703{ 1165{
704 struct perf_event *event = info; 1166 struct perf_event *event = info;
705 struct perf_event_context *ctx = event->ctx; 1167 struct perf_event_context *ctx = event->ctx;
@@ -708,9 +1170,12 @@ static void __perf_event_disable(void *info)
708 /* 1170 /*
709 * If this is a per-task event, need to check whether this 1171 * If this is a per-task event, need to check whether this
710 * event's task is the current task on this cpu. 1172 * event's task is the current task on this cpu.
1173 *
1174 * Can trigger due to concurrent perf_event_context_sched_out()
1175 * flipping contexts around.
711 */ 1176 */
712 if (ctx->task && cpuctx->task_ctx != ctx) 1177 if (ctx->task && cpuctx->task_ctx != ctx)
713 return; 1178 return -EINVAL;
714 1179
715 raw_spin_lock(&ctx->lock); 1180 raw_spin_lock(&ctx->lock);
716 1181
@@ -720,6 +1185,7 @@ static void __perf_event_disable(void *info)
720 */ 1185 */
721 if (event->state >= PERF_EVENT_STATE_INACTIVE) { 1186 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
722 update_context_time(ctx); 1187 update_context_time(ctx);
1188 update_cgrp_time_from_event(event);
723 update_group_times(event); 1189 update_group_times(event);
724 if (event == event->group_leader) 1190 if (event == event->group_leader)
725 group_sched_out(event, cpuctx, ctx); 1191 group_sched_out(event, cpuctx, ctx);
@@ -729,6 +1195,8 @@ static void __perf_event_disable(void *info)
729 } 1195 }
730 1196
731 raw_spin_unlock(&ctx->lock); 1197 raw_spin_unlock(&ctx->lock);
1198
1199 return 0;
732} 1200}
733 1201
734/* 1202/*
@@ -753,13 +1221,13 @@ void perf_event_disable(struct perf_event *event)
753 /* 1221 /*
754 * Disable the event on the cpu that it's on 1222 * Disable the event on the cpu that it's on
755 */ 1223 */
756 smp_call_function_single(event->cpu, __perf_event_disable, 1224 cpu_function_call(event->cpu, __perf_event_disable, event);
757 event, 1);
758 return; 1225 return;
759 } 1226 }
760 1227
761retry: 1228retry:
762 task_oncpu_function_call(task, __perf_event_disable, event); 1229 if (!task_function_call(task, __perf_event_disable, event))
1230 return;
763 1231
764 raw_spin_lock_irq(&ctx->lock); 1232 raw_spin_lock_irq(&ctx->lock);
765 /* 1233 /*
@@ -767,6 +1235,11 @@ retry:
767 */ 1235 */
768 if (event->state == PERF_EVENT_STATE_ACTIVE) { 1236 if (event->state == PERF_EVENT_STATE_ACTIVE) {
769 raw_spin_unlock_irq(&ctx->lock); 1237 raw_spin_unlock_irq(&ctx->lock);
1238 /*
1239 * Reload the task pointer, it might have been changed by
1240 * a concurrent perf_event_context_sched_out().
1241 */
1242 task = ctx->task;
770 goto retry; 1243 goto retry;
771 } 1244 }
772 1245
@@ -778,10 +1251,44 @@ retry:
778 update_group_times(event); 1251 update_group_times(event);
779 event->state = PERF_EVENT_STATE_OFF; 1252 event->state = PERF_EVENT_STATE_OFF;
780 } 1253 }
781
782 raw_spin_unlock_irq(&ctx->lock); 1254 raw_spin_unlock_irq(&ctx->lock);
783} 1255}
784 1256
1257static void perf_set_shadow_time(struct perf_event *event,
1258 struct perf_event_context *ctx,
1259 u64 tstamp)
1260{
1261 /*
1262 * use the correct time source for the time snapshot
1263 *
1264 * We could get by without this by leveraging the
1265 * fact that to get to this function, the caller
1266 * has most likely already called update_context_time()
1267 * and update_cgrp_time_xx() and thus both timestamp
1268 * are identical (or very close). Given that tstamp is,
1269 * already adjusted for cgroup, we could say that:
1270 * tstamp - ctx->timestamp
1271 * is equivalent to
1272 * tstamp - cgrp->timestamp.
1273 *
1274 * Then, in perf_output_read(), the calculation would
1275 * work with no changes because:
1276 * - event is guaranteed scheduled in
1277 * - no scheduled out in between
1278 * - thus the timestamp would be the same
1279 *
1280 * But this is a bit hairy.
1281 *
1282 * So instead, we have an explicit cgroup call to remain
1283 * within the time time source all along. We believe it
1284 * is cleaner and simpler to understand.
1285 */
1286 if (is_cgroup_event(event))
1287 perf_cgroup_set_shadow_time(event, tstamp);
1288 else
1289 event->shadow_ctx_time = tstamp - ctx->timestamp;
1290}
1291
785#define MAX_INTERRUPTS (~0ULL) 1292#define MAX_INTERRUPTS (~0ULL)
786 1293
787static void perf_log_throttle(struct perf_event *event, int enable); 1294static void perf_log_throttle(struct perf_event *event, int enable);
@@ -822,7 +1329,7 @@ event_sched_in(struct perf_event *event,
822 1329
823 event->tstamp_running += tstamp - event->tstamp_stopped; 1330 event->tstamp_running += tstamp - event->tstamp_stopped;
824 1331
825 event->shadow_ctx_time = tstamp - ctx->timestamp; 1332 perf_set_shadow_time(event, ctx, tstamp);
826 1333
827 if (!is_software_event(event)) 1334 if (!is_software_event(event))
828 cpuctx->active_oncpu++; 1335 cpuctx->active_oncpu++;
@@ -943,12 +1450,15 @@ static void add_event_to_ctx(struct perf_event *event,
943 event->tstamp_stopped = tstamp; 1450 event->tstamp_stopped = tstamp;
944} 1451}
945 1452
1453static void perf_event_context_sched_in(struct perf_event_context *ctx,
1454 struct task_struct *tsk);
1455
946/* 1456/*
947 * Cross CPU call to install and enable a performance event 1457 * Cross CPU call to install and enable a performance event
948 * 1458 *
949 * Must be called with ctx->mutex held 1459 * Must be called with ctx->mutex held
950 */ 1460 */
951static void __perf_install_in_context(void *info) 1461static int __perf_install_in_context(void *info)
952{ 1462{
953 struct perf_event *event = info; 1463 struct perf_event *event = info;
954 struct perf_event_context *ctx = event->ctx; 1464 struct perf_event_context *ctx = event->ctx;
@@ -957,21 +1467,22 @@ static void __perf_install_in_context(void *info)
957 int err; 1467 int err;
958 1468
959 /* 1469 /*
960 * If this is a task context, we need to check whether it is 1470 * In case we're installing a new context to an already running task,
961 * the current task context of this cpu. If not it has been 1471 * could also happen before perf_event_task_sched_in() on architectures
962 * scheduled out before the smp call arrived. 1472 * which do context switches with IRQs enabled.
963 * Or possibly this is the right context but it isn't
964 * on this cpu because it had no events.
965 */ 1473 */
966 if (ctx->task && cpuctx->task_ctx != ctx) { 1474 if (ctx->task && !cpuctx->task_ctx)
967 if (cpuctx->task_ctx || ctx->task != current) 1475 perf_event_context_sched_in(ctx, ctx->task);
968 return;
969 cpuctx->task_ctx = ctx;
970 }
971 1476
972 raw_spin_lock(&ctx->lock); 1477 raw_spin_lock(&ctx->lock);
973 ctx->is_active = 1; 1478 ctx->is_active = 1;
974 update_context_time(ctx); 1479 update_context_time(ctx);
1480 /*
1481 * update cgrp time only if current cgrp
1482 * matches event->cgrp. Must be done before
1483 * calling add_event_to_ctx()
1484 */
1485 update_cgrp_time_from_event(event);
975 1486
976 add_event_to_ctx(event, ctx); 1487 add_event_to_ctx(event, ctx);
977 1488
@@ -1012,6 +1523,8 @@ static void __perf_install_in_context(void *info)
1012 1523
1013unlock: 1524unlock:
1014 raw_spin_unlock(&ctx->lock); 1525 raw_spin_unlock(&ctx->lock);
1526
1527 return 0;
1015} 1528}
1016 1529
1017/* 1530/*
@@ -1023,8 +1536,6 @@ unlock:
1023 * If the event is attached to a task which is on a CPU we use a smp 1536 * If the event is attached to a task which is on a CPU we use a smp
1024 * call to enable it in the task context. The task might have been 1537 * call to enable it in the task context. The task might have been
1025 * scheduled away, but we check this in the smp call again. 1538 * scheduled away, but we check this in the smp call again.
1026 *
1027 * Must be called with ctx->mutex held.
1028 */ 1539 */
1029static void 1540static void
1030perf_install_in_context(struct perf_event_context *ctx, 1541perf_install_in_context(struct perf_event_context *ctx,
@@ -1033,6 +1544,8 @@ perf_install_in_context(struct perf_event_context *ctx,
1033{ 1544{
1034 struct task_struct *task = ctx->task; 1545 struct task_struct *task = ctx->task;
1035 1546
1547 lockdep_assert_held(&ctx->mutex);
1548
1036 event->ctx = ctx; 1549 event->ctx = ctx;
1037 1550
1038 if (!task) { 1551 if (!task) {
@@ -1040,31 +1553,29 @@ perf_install_in_context(struct perf_event_context *ctx,
1040 * Per cpu events are installed via an smp call and 1553 * Per cpu events are installed via an smp call and
1041 * the install is always successful. 1554 * the install is always successful.
1042 */ 1555 */
1043 smp_call_function_single(cpu, __perf_install_in_context, 1556 cpu_function_call(cpu, __perf_install_in_context, event);
1044 event, 1);
1045 return; 1557 return;
1046 } 1558 }
1047 1559
1048retry: 1560retry:
1049 task_oncpu_function_call(task, __perf_install_in_context, 1561 if (!task_function_call(task, __perf_install_in_context, event))
1050 event); 1562 return;
1051 1563
1052 raw_spin_lock_irq(&ctx->lock); 1564 raw_spin_lock_irq(&ctx->lock);
1053 /* 1565 /*
1054 * we need to retry the smp call. 1566 * If we failed to find a running task, but find the context active now
1567 * that we've acquired the ctx->lock, retry.
1055 */ 1568 */
1056 if (ctx->is_active && list_empty(&event->group_entry)) { 1569 if (ctx->is_active) {
1057 raw_spin_unlock_irq(&ctx->lock); 1570 raw_spin_unlock_irq(&ctx->lock);
1058 goto retry; 1571 goto retry;
1059 } 1572 }
1060 1573
1061 /* 1574 /*
1062 * The lock prevents that this context is scheduled in so we 1575 * Since the task isn't running, its safe to add the event, us holding
1063 * can add the event safely, if it the call above did not 1576 * the ctx->lock ensures the task won't get scheduled in.
1064 * succeed.
1065 */ 1577 */
1066 if (list_empty(&event->group_entry)) 1578 add_event_to_ctx(event, ctx);
1067 add_event_to_ctx(event, ctx);
1068 raw_spin_unlock_irq(&ctx->lock); 1579 raw_spin_unlock_irq(&ctx->lock);
1069} 1580}
1070 1581
@@ -1093,7 +1604,7 @@ static void __perf_event_mark_enabled(struct perf_event *event,
1093/* 1604/*
1094 * Cross CPU call to enable a performance event 1605 * Cross CPU call to enable a performance event
1095 */ 1606 */
1096static void __perf_event_enable(void *info) 1607static int __perf_event_enable(void *info)
1097{ 1608{
1098 struct perf_event *event = info; 1609 struct perf_event *event = info;
1099 struct perf_event_context *ctx = event->ctx; 1610 struct perf_event_context *ctx = event->ctx;
@@ -1101,26 +1612,27 @@ static void __perf_event_enable(void *info)
1101 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1612 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1102 int err; 1613 int err;
1103 1614
1104 /* 1615 if (WARN_ON_ONCE(!ctx->is_active))
1105 * If this is a per-task event, need to check whether this 1616 return -EINVAL;
1106 * event's task is the current task on this cpu.
1107 */
1108 if (ctx->task && cpuctx->task_ctx != ctx) {
1109 if (cpuctx->task_ctx || ctx->task != current)
1110 return;
1111 cpuctx->task_ctx = ctx;
1112 }
1113 1617
1114 raw_spin_lock(&ctx->lock); 1618 raw_spin_lock(&ctx->lock);
1115 ctx->is_active = 1;
1116 update_context_time(ctx); 1619 update_context_time(ctx);
1117 1620
1118 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1621 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1119 goto unlock; 1622 goto unlock;
1623
1624 /*
1625 * set current task's cgroup time reference point
1626 */
1627 perf_cgroup_set_timestamp(current, ctx);
1628
1120 __perf_event_mark_enabled(event, ctx); 1629 __perf_event_mark_enabled(event, ctx);
1121 1630
1122 if (!event_filter_match(event)) 1631 if (!event_filter_match(event)) {
1632 if (is_cgroup_event(event))
1633 perf_cgroup_defer_enabled(event);
1123 goto unlock; 1634 goto unlock;
1635 }
1124 1636
1125 /* 1637 /*
1126 * If the event is in a group and isn't the group leader, 1638 * If the event is in a group and isn't the group leader,
@@ -1153,6 +1665,8 @@ static void __perf_event_enable(void *info)
1153 1665
1154unlock: 1666unlock:
1155 raw_spin_unlock(&ctx->lock); 1667 raw_spin_unlock(&ctx->lock);
1668
1669 return 0;
1156} 1670}
1157 1671
1158/* 1672/*
@@ -1173,8 +1687,7 @@ void perf_event_enable(struct perf_event *event)
1173 /* 1687 /*
1174 * Enable the event on the cpu that it's on 1688 * Enable the event on the cpu that it's on
1175 */ 1689 */
1176 smp_call_function_single(event->cpu, __perf_event_enable, 1690 cpu_function_call(event->cpu, __perf_event_enable, event);
1177 event, 1);
1178 return; 1691 return;
1179 } 1692 }
1180 1693
@@ -1193,8 +1706,15 @@ void perf_event_enable(struct perf_event *event)
1193 event->state = PERF_EVENT_STATE_OFF; 1706 event->state = PERF_EVENT_STATE_OFF;
1194 1707
1195retry: 1708retry:
1709 if (!ctx->is_active) {
1710 __perf_event_mark_enabled(event, ctx);
1711 goto out;
1712 }
1713
1196 raw_spin_unlock_irq(&ctx->lock); 1714 raw_spin_unlock_irq(&ctx->lock);
1197 task_oncpu_function_call(task, __perf_event_enable, event); 1715
1716 if (!task_function_call(task, __perf_event_enable, event))
1717 return;
1198 1718
1199 raw_spin_lock_irq(&ctx->lock); 1719 raw_spin_lock_irq(&ctx->lock);
1200 1720
@@ -1202,15 +1722,14 @@ retry:
1202 * If the context is active and the event is still off, 1722 * If the context is active and the event is still off,
1203 * we need to retry the cross-call. 1723 * we need to retry the cross-call.
1204 */ 1724 */
1205 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) 1725 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1726 /*
1727 * task could have been flipped by a concurrent
1728 * perf_event_context_sched_out()
1729 */
1730 task = ctx->task;
1206 goto retry; 1731 goto retry;
1207 1732 }
1208 /*
1209 * Since we have the lock this context can't be scheduled
1210 * in, so we can change the state safely.
1211 */
1212 if (event->state == PERF_EVENT_STATE_OFF)
1213 __perf_event_mark_enabled(event, ctx);
1214 1733
1215out: 1734out:
1216 raw_spin_unlock_irq(&ctx->lock); 1735 raw_spin_unlock_irq(&ctx->lock);
@@ -1242,6 +1761,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1242 if (likely(!ctx->nr_events)) 1761 if (likely(!ctx->nr_events))
1243 goto out; 1762 goto out;
1244 update_context_time(ctx); 1763 update_context_time(ctx);
1764 update_cgrp_time_from_cpuctx(cpuctx);
1245 1765
1246 if (!ctx->nr_active) 1766 if (!ctx->nr_active)
1247 goto out; 1767 goto out;
@@ -1354,8 +1874,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1354 } 1874 }
1355} 1875}
1356 1876
1357void perf_event_context_sched_out(struct task_struct *task, int ctxn, 1877static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1358 struct task_struct *next) 1878 struct task_struct *next)
1359{ 1879{
1360 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 1880 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1361 struct perf_event_context *next_ctx; 1881 struct perf_event_context *next_ctx;
@@ -1431,6 +1951,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
1431 1951
1432 for_each_task_context_nr(ctxn) 1952 for_each_task_context_nr(ctxn)
1433 perf_event_context_sched_out(task, ctxn, next); 1953 perf_event_context_sched_out(task, ctxn, next);
1954
1955 /*
1956 * if cgroup events exist on this CPU, then we need
1957 * to check if we have to switch out PMU state.
1958 * cgroup event are system-wide mode only
1959 */
1960 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1961 perf_cgroup_sched_out(task);
1434} 1962}
1435 1963
1436static void task_ctx_sched_out(struct perf_event_context *ctx, 1964static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1469,6 +1997,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1469 if (!event_filter_match(event)) 1997 if (!event_filter_match(event))
1470 continue; 1998 continue;
1471 1999
2000 /* may need to reset tstamp_enabled */
2001 if (is_cgroup_event(event))
2002 perf_cgroup_mark_enabled(event, ctx);
2003
1472 if (group_can_go_on(event, cpuctx, 1)) 2004 if (group_can_go_on(event, cpuctx, 1))
1473 group_sched_in(event, cpuctx, ctx); 2005 group_sched_in(event, cpuctx, ctx);
1474 2006
@@ -1501,6 +2033,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1501 if (!event_filter_match(event)) 2033 if (!event_filter_match(event))
1502 continue; 2034 continue;
1503 2035
2036 /* may need to reset tstamp_enabled */
2037 if (is_cgroup_event(event))
2038 perf_cgroup_mark_enabled(event, ctx);
2039
1504 if (group_can_go_on(event, cpuctx, can_add_hw)) { 2040 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1505 if (group_sched_in(event, cpuctx, ctx)) 2041 if (group_sched_in(event, cpuctx, ctx))
1506 can_add_hw = 0; 2042 can_add_hw = 0;
@@ -1511,15 +2047,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1511static void 2047static void
1512ctx_sched_in(struct perf_event_context *ctx, 2048ctx_sched_in(struct perf_event_context *ctx,
1513 struct perf_cpu_context *cpuctx, 2049 struct perf_cpu_context *cpuctx,
1514 enum event_type_t event_type) 2050 enum event_type_t event_type,
2051 struct task_struct *task)
1515{ 2052{
2053 u64 now;
2054
1516 raw_spin_lock(&ctx->lock); 2055 raw_spin_lock(&ctx->lock);
1517 ctx->is_active = 1; 2056 ctx->is_active = 1;
1518 if (likely(!ctx->nr_events)) 2057 if (likely(!ctx->nr_events))
1519 goto out; 2058 goto out;
1520 2059
1521 ctx->timestamp = perf_clock(); 2060 now = perf_clock();
1522 2061 ctx->timestamp = now;
2062 perf_cgroup_set_timestamp(task, ctx);
1523 /* 2063 /*
1524 * First go through the list and put on any pinned groups 2064 * First go through the list and put on any pinned groups
1525 * in order to give them the best chance of going on. 2065 * in order to give them the best chance of going on.
@@ -1536,11 +2076,12 @@ out:
1536} 2076}
1537 2077
1538static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2078static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1539 enum event_type_t event_type) 2079 enum event_type_t event_type,
2080 struct task_struct *task)
1540{ 2081{
1541 struct perf_event_context *ctx = &cpuctx->ctx; 2082 struct perf_event_context *ctx = &cpuctx->ctx;
1542 2083
1543 ctx_sched_in(ctx, cpuctx, event_type); 2084 ctx_sched_in(ctx, cpuctx, event_type, task);
1544} 2085}
1545 2086
1546static void task_ctx_sched_in(struct perf_event_context *ctx, 2087static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1548,15 +2089,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
1548{ 2089{
1549 struct perf_cpu_context *cpuctx; 2090 struct perf_cpu_context *cpuctx;
1550 2091
1551 cpuctx = __get_cpu_context(ctx); 2092 cpuctx = __get_cpu_context(ctx);
1552 if (cpuctx->task_ctx == ctx) 2093 if (cpuctx->task_ctx == ctx)
1553 return; 2094 return;
1554 2095
1555 ctx_sched_in(ctx, cpuctx, event_type); 2096 ctx_sched_in(ctx, cpuctx, event_type, NULL);
1556 cpuctx->task_ctx = ctx; 2097 cpuctx->task_ctx = ctx;
1557} 2098}
1558 2099
1559void perf_event_context_sched_in(struct perf_event_context *ctx) 2100static void perf_event_context_sched_in(struct perf_event_context *ctx,
2101 struct task_struct *task)
1560{ 2102{
1561 struct perf_cpu_context *cpuctx; 2103 struct perf_cpu_context *cpuctx;
1562 2104
@@ -1572,9 +2114,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
1572 */ 2114 */
1573 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2115 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1574 2116
1575 ctx_sched_in(ctx, cpuctx, EVENT_PINNED); 2117 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1576 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2118 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1577 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 2119 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1578 2120
1579 cpuctx->task_ctx = ctx; 2121 cpuctx->task_ctx = ctx;
1580 2122
@@ -1607,8 +2149,15 @@ void __perf_event_task_sched_in(struct task_struct *task)
1607 if (likely(!ctx)) 2149 if (likely(!ctx))
1608 continue; 2150 continue;
1609 2151
1610 perf_event_context_sched_in(ctx); 2152 perf_event_context_sched_in(ctx, task);
1611 } 2153 }
2154 /*
2155 * if cgroup events exist on this CPU, then we need
2156 * to check if we have to switch in PMU state.
2157 * cgroup event are system-wide mode only
2158 */
2159 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2160 perf_cgroup_sched_in(task);
1612} 2161}
1613 2162
1614static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2163static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -1638,7 +2187,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1638 * Reduce accuracy by one bit such that @a and @b converge 2187 * Reduce accuracy by one bit such that @a and @b converge
1639 * to a similar magnitude. 2188 * to a similar magnitude.
1640 */ 2189 */
1641#define REDUCE_FLS(a, b) \ 2190#define REDUCE_FLS(a, b) \
1642do { \ 2191do { \
1643 if (a##_fls > b##_fls) { \ 2192 if (a##_fls > b##_fls) { \
1644 a >>= 1; \ 2193 a >>= 1; \
@@ -1808,7 +2357,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1808 if (ctx) 2357 if (ctx)
1809 rotate_ctx(ctx); 2358 rotate_ctx(ctx);
1810 2359
1811 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2360 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
1812 if (ctx) 2361 if (ctx)
1813 task_ctx_sched_in(ctx, EVENT_FLEXIBLE); 2362 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1814 2363
@@ -1887,7 +2436,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1887 2436
1888 raw_spin_unlock(&ctx->lock); 2437 raw_spin_unlock(&ctx->lock);
1889 2438
1890 perf_event_context_sched_in(ctx); 2439 perf_event_context_sched_in(ctx, ctx->task);
1891out: 2440out:
1892 local_irq_restore(flags); 2441 local_irq_restore(flags);
1893} 2442}
@@ -1912,8 +2461,10 @@ static void __perf_event_read(void *info)
1912 return; 2461 return;
1913 2462
1914 raw_spin_lock(&ctx->lock); 2463 raw_spin_lock(&ctx->lock);
1915 if (ctx->is_active) 2464 if (ctx->is_active) {
1916 update_context_time(ctx); 2465 update_context_time(ctx);
2466 update_cgrp_time_from_event(event);
2467 }
1917 update_event_times(event); 2468 update_event_times(event);
1918 if (event->state == PERF_EVENT_STATE_ACTIVE) 2469 if (event->state == PERF_EVENT_STATE_ACTIVE)
1919 event->pmu->read(event); 2470 event->pmu->read(event);
@@ -1944,8 +2495,10 @@ static u64 perf_event_read(struct perf_event *event)
1944 * (e.g., thread is blocked), in that case 2495 * (e.g., thread is blocked), in that case
1945 * we cannot update context time 2496 * we cannot update context time
1946 */ 2497 */
1947 if (ctx->is_active) 2498 if (ctx->is_active) {
1948 update_context_time(ctx); 2499 update_context_time(ctx);
2500 update_cgrp_time_from_event(event);
2501 }
1949 update_event_times(event); 2502 update_event_times(event);
1950 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2503 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1951 } 2504 }
@@ -2224,6 +2777,9 @@ errout:
2224 2777
2225} 2778}
2226 2779
2780/*
2781 * Returns a matching context with refcount and pincount.
2782 */
2227static struct perf_event_context * 2783static struct perf_event_context *
2228find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 2784find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2229{ 2785{
@@ -2248,6 +2804,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2248 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 2804 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2249 ctx = &cpuctx->ctx; 2805 ctx = &cpuctx->ctx;
2250 get_ctx(ctx); 2806 get_ctx(ctx);
2807 ++ctx->pin_count;
2251 2808
2252 return ctx; 2809 return ctx;
2253 } 2810 }
@@ -2261,6 +2818,7 @@ retry:
2261 ctx = perf_lock_task_context(task, ctxn, &flags); 2818 ctx = perf_lock_task_context(task, ctxn, &flags);
2262 if (ctx) { 2819 if (ctx) {
2263 unclone_ctx(ctx); 2820 unclone_ctx(ctx);
2821 ++ctx->pin_count;
2264 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2822 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2265 } 2823 }
2266 2824
@@ -2282,8 +2840,10 @@ retry:
2282 err = -ESRCH; 2840 err = -ESRCH;
2283 else if (task->perf_event_ctxp[ctxn]) 2841 else if (task->perf_event_ctxp[ctxn])
2284 err = -EAGAIN; 2842 err = -EAGAIN;
2285 else 2843 else {
2844 ++ctx->pin_count;
2286 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 2845 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2846 }
2287 mutex_unlock(&task->perf_event_mutex); 2847 mutex_unlock(&task->perf_event_mutex);
2288 2848
2289 if (unlikely(err)) { 2849 if (unlikely(err)) {
@@ -2323,7 +2883,7 @@ static void free_event(struct perf_event *event)
2323 2883
2324 if (!event->parent) { 2884 if (!event->parent) {
2325 if (event->attach_state & PERF_ATTACH_TASK) 2885 if (event->attach_state & PERF_ATTACH_TASK)
2326 jump_label_dec(&perf_task_events); 2886 jump_label_dec(&perf_sched_events);
2327 if (event->attr.mmap || event->attr.mmap_data) 2887 if (event->attr.mmap || event->attr.mmap_data)
2328 atomic_dec(&nr_mmap_events); 2888 atomic_dec(&nr_mmap_events);
2329 if (event->attr.comm) 2889 if (event->attr.comm)
@@ -2332,6 +2892,10 @@ static void free_event(struct perf_event *event)
2332 atomic_dec(&nr_task_events); 2892 atomic_dec(&nr_task_events);
2333 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 2893 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2334 put_callchain_buffers(); 2894 put_callchain_buffers();
2895 if (is_cgroup_event(event)) {
2896 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2897 jump_label_dec(&perf_sched_events);
2898 }
2335 } 2899 }
2336 2900
2337 if (event->buffer) { 2901 if (event->buffer) {
@@ -2339,6 +2903,9 @@ static void free_event(struct perf_event *event)
2339 event->buffer = NULL; 2903 event->buffer = NULL;
2340 } 2904 }
2341 2905
2906 if (is_cgroup_event(event))
2907 perf_detach_cgroup(event);
2908
2342 if (event->destroy) 2909 if (event->destroy)
2343 event->destroy(event); 2910 event->destroy(event);
2344 2911
@@ -4406,26 +4973,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4406 if (unlikely(!is_sampling_event(event))) 4973 if (unlikely(!is_sampling_event(event)))
4407 return 0; 4974 return 0;
4408 4975
4409 if (!throttle) { 4976 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
4410 hwc->interrupts++; 4977 if (throttle) {
4411 } else { 4978 hwc->interrupts = MAX_INTERRUPTS;
4412 if (hwc->interrupts != MAX_INTERRUPTS) { 4979 perf_log_throttle(event, 0);
4413 hwc->interrupts++;
4414 if (HZ * hwc->interrupts >
4415 (u64)sysctl_perf_event_sample_rate) {
4416 hwc->interrupts = MAX_INTERRUPTS;
4417 perf_log_throttle(event, 0);
4418 ret = 1;
4419 }
4420 } else {
4421 /*
4422 * Keep re-disabling events even though on the previous
4423 * pass we disabled it - just in case we raced with a
4424 * sched-in and the event got enabled again:
4425 */
4426 ret = 1; 4980 ret = 1;
4427 } 4981 }
4428 } 4982 } else
4983 hwc->interrupts++;
4429 4984
4430 if (event->attr.freq) { 4985 if (event->attr.freq) {
4431 u64 now = perf_clock(); 4986 u64 now = perf_clock();
@@ -5062,6 +5617,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5062 u64 period; 5617 u64 period;
5063 5618
5064 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 5619 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
5620
5621 if (event->state != PERF_EVENT_STATE_ACTIVE)
5622 return HRTIMER_NORESTART;
5623
5065 event->pmu->read(event); 5624 event->pmu->read(event);
5066 5625
5067 perf_sample_data_init(&data, 0); 5626 perf_sample_data_init(&data, 0);
@@ -5088,9 +5647,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
5088 if (!is_sampling_event(event)) 5647 if (!is_sampling_event(event))
5089 return; 5648 return;
5090 5649
5091 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5092 hwc->hrtimer.function = perf_swevent_hrtimer;
5093
5094 period = local64_read(&hwc->period_left); 5650 period = local64_read(&hwc->period_left);
5095 if (period) { 5651 if (period) {
5096 if (period < 0) 5652 if (period < 0)
@@ -5117,6 +5673,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5117 } 5673 }
5118} 5674}
5119 5675
5676static void perf_swevent_init_hrtimer(struct perf_event *event)
5677{
5678 struct hw_perf_event *hwc = &event->hw;
5679
5680 if (!is_sampling_event(event))
5681 return;
5682
5683 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5684 hwc->hrtimer.function = perf_swevent_hrtimer;
5685
5686 /*
5687 * Since hrtimers have a fixed rate, we can do a static freq->period
5688 * mapping and avoid the whole period adjust feedback stuff.
5689 */
5690 if (event->attr.freq) {
5691 long freq = event->attr.sample_freq;
5692
5693 event->attr.sample_period = NSEC_PER_SEC / freq;
5694 hwc->sample_period = event->attr.sample_period;
5695 local64_set(&hwc->period_left, hwc->sample_period);
5696 event->attr.freq = 0;
5697 }
5698}
5699
5120/* 5700/*
5121 * Software event: cpu wall time clock 5701 * Software event: cpu wall time clock
5122 */ 5702 */
@@ -5169,6 +5749,8 @@ static int cpu_clock_event_init(struct perf_event *event)
5169 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 5749 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5170 return -ENOENT; 5750 return -ENOENT;
5171 5751
5752 perf_swevent_init_hrtimer(event);
5753
5172 return 0; 5754 return 0;
5173} 5755}
5174 5756
@@ -5224,16 +5806,9 @@ static void task_clock_event_del(struct perf_event *event, int flags)
5224 5806
5225static void task_clock_event_read(struct perf_event *event) 5807static void task_clock_event_read(struct perf_event *event)
5226{ 5808{
5227 u64 time; 5809 u64 now = perf_clock();
5228 5810 u64 delta = now - event->ctx->timestamp;
5229 if (!in_nmi()) { 5811 u64 time = event->ctx->time + delta;
5230 update_context_time(event->ctx);
5231 time = event->ctx->time;
5232 } else {
5233 u64 now = perf_clock();
5234 u64 delta = now - event->ctx->timestamp;
5235 time = event->ctx->time + delta;
5236 }
5237 5812
5238 task_clock_event_update(event, time); 5813 task_clock_event_update(event, time);
5239} 5814}
@@ -5246,6 +5821,8 @@ static int task_clock_event_init(struct perf_event *event)
5246 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 5821 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5247 return -ENOENT; 5822 return -ENOENT;
5248 5823
5824 perf_swevent_init_hrtimer(event);
5825
5249 return 0; 5826 return 0;
5250} 5827}
5251 5828
@@ -5517,17 +6094,22 @@ struct pmu *perf_init_event(struct perf_event *event)
5517{ 6094{
5518 struct pmu *pmu = NULL; 6095 struct pmu *pmu = NULL;
5519 int idx; 6096 int idx;
6097 int ret;
5520 6098
5521 idx = srcu_read_lock(&pmus_srcu); 6099 idx = srcu_read_lock(&pmus_srcu);
5522 6100
5523 rcu_read_lock(); 6101 rcu_read_lock();
5524 pmu = idr_find(&pmu_idr, event->attr.type); 6102 pmu = idr_find(&pmu_idr, event->attr.type);
5525 rcu_read_unlock(); 6103 rcu_read_unlock();
5526 if (pmu) 6104 if (pmu) {
6105 ret = pmu->event_init(event);
6106 if (ret)
6107 pmu = ERR_PTR(ret);
5527 goto unlock; 6108 goto unlock;
6109 }
5528 6110
5529 list_for_each_entry_rcu(pmu, &pmus, entry) { 6111 list_for_each_entry_rcu(pmu, &pmus, entry) {
5530 int ret = pmu->event_init(event); 6112 ret = pmu->event_init(event);
5531 if (!ret) 6113 if (!ret)
5532 goto unlock; 6114 goto unlock;
5533 6115
@@ -5653,7 +6235,7 @@ done:
5653 6235
5654 if (!event->parent) { 6236 if (!event->parent) {
5655 if (event->attach_state & PERF_ATTACH_TASK) 6237 if (event->attach_state & PERF_ATTACH_TASK)
5656 jump_label_inc(&perf_task_events); 6238 jump_label_inc(&perf_sched_events);
5657 if (event->attr.mmap || event->attr.mmap_data) 6239 if (event->attr.mmap || event->attr.mmap_data)
5658 atomic_inc(&nr_mmap_events); 6240 atomic_inc(&nr_mmap_events);
5659 if (event->attr.comm) 6241 if (event->attr.comm)
@@ -5828,7 +6410,7 @@ SYSCALL_DEFINE5(perf_event_open,
5828 int err; 6410 int err;
5829 6411
5830 /* for future expandability... */ 6412 /* for future expandability... */
5831 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) 6413 if (flags & ~PERF_FLAG_ALL)
5832 return -EINVAL; 6414 return -EINVAL;
5833 6415
5834 err = perf_copy_attr(attr_uptr, &attr); 6416 err = perf_copy_attr(attr_uptr, &attr);
@@ -5845,6 +6427,15 @@ SYSCALL_DEFINE5(perf_event_open,
5845 return -EINVAL; 6427 return -EINVAL;
5846 } 6428 }
5847 6429
6430 /*
6431 * In cgroup mode, the pid argument is used to pass the fd
6432 * opened to the cgroup directory in cgroupfs. The cpu argument
6433 * designates the cpu on which to monitor threads from that
6434 * cgroup.
6435 */
6436 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6437 return -EINVAL;
6438
5848 event_fd = get_unused_fd_flags(O_RDWR); 6439 event_fd = get_unused_fd_flags(O_RDWR);
5849 if (event_fd < 0) 6440 if (event_fd < 0)
5850 return event_fd; 6441 return event_fd;
@@ -5862,7 +6453,7 @@ SYSCALL_DEFINE5(perf_event_open,
5862 group_leader = NULL; 6453 group_leader = NULL;
5863 } 6454 }
5864 6455
5865 if (pid != -1) { 6456 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
5866 task = find_lively_task_by_vpid(pid); 6457 task = find_lively_task_by_vpid(pid);
5867 if (IS_ERR(task)) { 6458 if (IS_ERR(task)) {
5868 err = PTR_ERR(task); 6459 err = PTR_ERR(task);
@@ -5876,6 +6467,19 @@ SYSCALL_DEFINE5(perf_event_open,
5876 goto err_task; 6467 goto err_task;
5877 } 6468 }
5878 6469
6470 if (flags & PERF_FLAG_PID_CGROUP) {
6471 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6472 if (err)
6473 goto err_alloc;
6474 /*
6475 * one more event:
6476 * - that has cgroup constraint on event->cpu
6477 * - that may need work on context switch
6478 */
6479 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6480 jump_label_inc(&perf_sched_events);
6481 }
6482
5879 /* 6483 /*
5880 * Special case software events and allow them to be part of 6484 * Special case software events and allow them to be part of
5881 * any hardware group. 6485 * any hardware group.
@@ -5961,10 +6565,10 @@ SYSCALL_DEFINE5(perf_event_open,
5961 struct perf_event_context *gctx = group_leader->ctx; 6565 struct perf_event_context *gctx = group_leader->ctx;
5962 6566
5963 mutex_lock(&gctx->mutex); 6567 mutex_lock(&gctx->mutex);
5964 perf_event_remove_from_context(group_leader); 6568 perf_remove_from_context(group_leader);
5965 list_for_each_entry(sibling, &group_leader->sibling_list, 6569 list_for_each_entry(sibling, &group_leader->sibling_list,
5966 group_entry) { 6570 group_entry) {
5967 perf_event_remove_from_context(sibling); 6571 perf_remove_from_context(sibling);
5968 put_ctx(gctx); 6572 put_ctx(gctx);
5969 } 6573 }
5970 mutex_unlock(&gctx->mutex); 6574 mutex_unlock(&gctx->mutex);
@@ -5987,6 +6591,7 @@ SYSCALL_DEFINE5(perf_event_open,
5987 6591
5988 perf_install_in_context(ctx, event, cpu); 6592 perf_install_in_context(ctx, event, cpu);
5989 ++ctx->generation; 6593 ++ctx->generation;
6594 perf_unpin_context(ctx);
5990 mutex_unlock(&ctx->mutex); 6595 mutex_unlock(&ctx->mutex);
5991 6596
5992 event->owner = current; 6597 event->owner = current;
@@ -6012,6 +6617,7 @@ SYSCALL_DEFINE5(perf_event_open,
6012 return event_fd; 6617 return event_fd;
6013 6618
6014err_context: 6619err_context:
6620 perf_unpin_context(ctx);
6015 put_ctx(ctx); 6621 put_ctx(ctx);
6016err_alloc: 6622err_alloc:
6017 free_event(event); 6623 free_event(event);
@@ -6062,6 +6668,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6062 mutex_lock(&ctx->mutex); 6668 mutex_lock(&ctx->mutex);
6063 perf_install_in_context(ctx, event, cpu); 6669 perf_install_in_context(ctx, event, cpu);
6064 ++ctx->generation; 6670 ++ctx->generation;
6671 perf_unpin_context(ctx);
6065 mutex_unlock(&ctx->mutex); 6672 mutex_unlock(&ctx->mutex);
6066 6673
6067 return event; 6674 return event;
@@ -6115,7 +6722,7 @@ __perf_event_exit_task(struct perf_event *child_event,
6115{ 6722{
6116 struct perf_event *parent_event; 6723 struct perf_event *parent_event;
6117 6724
6118 perf_event_remove_from_context(child_event); 6725 perf_remove_from_context(child_event);
6119 6726
6120 parent_event = child_event->parent; 6727 parent_event = child_event->parent;
6121 /* 6728 /*
@@ -6422,7 +7029,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
6422 return 0; 7029 return 0;
6423 } 7030 }
6424 7031
6425 child_ctx = child->perf_event_ctxp[ctxn]; 7032 child_ctx = child->perf_event_ctxp[ctxn];
6426 if (!child_ctx) { 7033 if (!child_ctx) {
6427 /* 7034 /*
6428 * This is executed from the parent task context, so 7035 * This is executed from the parent task context, so
@@ -6537,6 +7144,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6537 mutex_unlock(&parent_ctx->mutex); 7144 mutex_unlock(&parent_ctx->mutex);
6538 7145
6539 perf_unpin_context(parent_ctx); 7146 perf_unpin_context(parent_ctx);
7147 put_ctx(parent_ctx);
6540 7148
6541 return ret; 7149 return ret;
6542} 7150}
@@ -6606,9 +7214,9 @@ static void __perf_event_exit_context(void *__info)
6606 perf_pmu_rotate_stop(ctx->pmu); 7214 perf_pmu_rotate_stop(ctx->pmu);
6607 7215
6608 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 7216 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
6609 __perf_event_remove_from_context(event); 7217 __perf_remove_from_context(event);
6610 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 7218 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
6611 __perf_event_remove_from_context(event); 7219 __perf_remove_from_context(event);
6612} 7220}
6613 7221
6614static void perf_event_exit_cpu_context(int cpu) 7222static void perf_event_exit_cpu_context(int cpu)
@@ -6732,3 +7340,83 @@ unlock:
6732 return ret; 7340 return ret;
6733} 7341}
6734device_initcall(perf_event_sysfs_init); 7342device_initcall(perf_event_sysfs_init);
7343
7344#ifdef CONFIG_CGROUP_PERF
7345static struct cgroup_subsys_state *perf_cgroup_create(
7346 struct cgroup_subsys *ss, struct cgroup *cont)
7347{
7348 struct perf_cgroup *jc;
7349
7350 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
7351 if (!jc)
7352 return ERR_PTR(-ENOMEM);
7353
7354 jc->info = alloc_percpu(struct perf_cgroup_info);
7355 if (!jc->info) {
7356 kfree(jc);
7357 return ERR_PTR(-ENOMEM);
7358 }
7359
7360 return &jc->css;
7361}
7362
7363static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7364 struct cgroup *cont)
7365{
7366 struct perf_cgroup *jc;
7367 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7368 struct perf_cgroup, css);
7369 free_percpu(jc->info);
7370 kfree(jc);
7371}
7372
7373static int __perf_cgroup_move(void *info)
7374{
7375 struct task_struct *task = info;
7376 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7377 return 0;
7378}
7379
7380static void perf_cgroup_move(struct task_struct *task)
7381{
7382 task_function_call(task, __perf_cgroup_move, task);
7383}
7384
7385static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7386 struct cgroup *old_cgrp, struct task_struct *task,
7387 bool threadgroup)
7388{
7389 perf_cgroup_move(task);
7390 if (threadgroup) {
7391 struct task_struct *c;
7392 rcu_read_lock();
7393 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7394 perf_cgroup_move(c);
7395 }
7396 rcu_read_unlock();
7397 }
7398}
7399
7400static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7401 struct cgroup *old_cgrp, struct task_struct *task)
7402{
7403 /*
7404 * cgroup_exit() is called in the copy_process() failure path.
7405 * Ignore this case since the task hasn't ran yet, this avoids
7406 * trying to poke a half freed task state from generic code.
7407 */
7408 if (!(task->flags & PF_EXITING))
7409 return;
7410
7411 perf_cgroup_move(task);
7412}
7413
7414struct cgroup_subsys perf_subsys = {
7415 .name = "perf_event",
7416 .subsys_id = perf_subsys_id,
7417 .create = perf_cgroup_create,
7418 .destroy = perf_cgroup_destroy,
7419 .exit = perf_cgroup_exit,
7420 .attach = perf_cgroup_attach,
7421};
7422#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 05bb7173850e..67fea9d25d55 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p)
176 return p->utime; 176 return p->utime;
177} 177}
178 178
179int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 179static int
180posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
180{ 181{
181 int error = check_clock(which_clock); 182 int error = check_clock(which_clock);
182 if (!error) { 183 if (!error) {
@@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
194 return error; 195 return error;
195} 196}
196 197
197int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) 198static int
199posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
198{ 200{
199 /* 201 /*
200 * You can never reset a CPU clock, but we check for other errors 202 * You can never reset a CPU clock, but we check for other errors
@@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
317} 319}
318 320
319 321
320int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 322static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
321{ 323{
322 const pid_t pid = CPUCLOCK_PID(which_clock); 324 const pid_t pid = CPUCLOCK_PID(which_clock);
323 int error = -EINVAL; 325 int error = -EINVAL;
@@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
379 * This is called from sys_timer_create() and do_cpu_nanosleep() with the 381 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
380 * new timer already all-zeros initialized. 382 * new timer already all-zeros initialized.
381 */ 383 */
382int posix_cpu_timer_create(struct k_itimer *new_timer) 384static int posix_cpu_timer_create(struct k_itimer *new_timer)
383{ 385{
384 int ret = 0; 386 int ret = 0;
385 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); 387 const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
@@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
425 * If we return TIMER_RETRY, it's necessary to release the timer's lock 427 * If we return TIMER_RETRY, it's necessary to release the timer's lock
426 * and try again. (This happens when the timer is in the middle of firing.) 428 * and try again. (This happens when the timer is in the middle of firing.)
427 */ 429 */
428int posix_cpu_timer_del(struct k_itimer *timer) 430static int posix_cpu_timer_del(struct k_itimer *timer)
429{ 431{
430 struct task_struct *p = timer->it.cpu.task; 432 struct task_struct *p = timer->it.cpu.task;
431 int ret = 0; 433 int ret = 0;
@@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
665 * If we return TIMER_RETRY, it's necessary to release the timer's lock 667 * If we return TIMER_RETRY, it's necessary to release the timer's lock
666 * and try again. (This happens when the timer is in the middle of firing.) 668 * and try again. (This happens when the timer is in the middle of firing.)
667 */ 669 */
668int posix_cpu_timer_set(struct k_itimer *timer, int flags, 670static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
669 struct itimerspec *new, struct itimerspec *old) 671 struct itimerspec *new, struct itimerspec *old)
670{ 672{
671 struct task_struct *p = timer->it.cpu.task; 673 struct task_struct *p = timer->it.cpu.task;
672 union cpu_time_count old_expires, new_expires, old_incr, val; 674 union cpu_time_count old_expires, new_expires, old_incr, val;
@@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
820 return ret; 822 return ret;
821} 823}
822 824
823void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 825static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
824{ 826{
825 union cpu_time_count now; 827 union cpu_time_count now;
826 struct task_struct *p = timer->it.cpu.task; 828 struct task_struct *p = timer->it.cpu.task;
@@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1481 return error; 1483 return error;
1482} 1484}
1483 1485
1484int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1486static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
1485 struct timespec *rqtp, struct timespec __user *rmtp) 1487
1488static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1489 struct timespec *rqtp, struct timespec __user *rmtp)
1486{ 1490{
1487 struct restart_block *restart_block = 1491 struct restart_block *restart_block =
1488 &current_thread_info()->restart_block; 1492 &current_thread_info()->restart_block;
1489 struct itimerspec it; 1493 struct itimerspec it;
1490 int error; 1494 int error;
1491 1495
@@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1501 1505
1502 if (error == -ERESTART_RESTARTBLOCK) { 1506 if (error == -ERESTART_RESTARTBLOCK) {
1503 1507
1504 if (flags & TIMER_ABSTIME) 1508 if (flags & TIMER_ABSTIME)
1505 return -ERESTARTNOHAND; 1509 return -ERESTARTNOHAND;
1506 /* 1510 /*
1507 * Report back to the user the time still remaining. 1511 * Report back to the user the time still remaining.
1508 */ 1512 */
1509 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1513 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1510 return -EFAULT; 1514 return -EFAULT;
1511 1515
1512 restart_block->fn = posix_cpu_nsleep_restart; 1516 restart_block->fn = posix_cpu_nsleep_restart;
1513 restart_block->arg0 = which_clock; 1517 restart_block->nanosleep.index = which_clock;
1514 restart_block->arg1 = (unsigned long) rmtp; 1518 restart_block->nanosleep.rmtp = rmtp;
1515 restart_block->arg2 = rqtp->tv_sec; 1519 restart_block->nanosleep.expires = timespec_to_ns(rqtp);
1516 restart_block->arg3 = rqtp->tv_nsec;
1517 } 1520 }
1518 return error; 1521 return error;
1519} 1522}
1520 1523
1521long posix_cpu_nsleep_restart(struct restart_block *restart_block) 1524static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1522{ 1525{
1523 clockid_t which_clock = restart_block->arg0; 1526 clockid_t which_clock = restart_block->nanosleep.index;
1524 struct timespec __user *rmtp;
1525 struct timespec t; 1527 struct timespec t;
1526 struct itimerspec it; 1528 struct itimerspec it;
1527 int error; 1529 int error;
1528 1530
1529 rmtp = (struct timespec __user *) restart_block->arg1; 1531 t = ns_to_timespec(restart_block->nanosleep.expires);
1530 t.tv_sec = restart_block->arg2;
1531 t.tv_nsec = restart_block->arg3;
1532 1532
1533 restart_block->fn = do_no_restart_syscall;
1534 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); 1533 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1535 1534
1536 if (error == -ERESTART_RESTARTBLOCK) { 1535 if (error == -ERESTART_RESTARTBLOCK) {
1536 struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
1537 /* 1537 /*
1538 * Report back to the user the time still remaining. 1538 * Report back to the user the time still remaining.
1539 */ 1539 */
1540 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) 1540 if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1541 return -EFAULT; 1541 return -EFAULT;
1542 1542
1543 restart_block->fn = posix_cpu_nsleep_restart; 1543 restart_block->nanosleep.expires = timespec_to_ns(&t);
1544 restart_block->arg0 = which_clock;
1545 restart_block->arg1 = (unsigned long) rmtp;
1546 restart_block->arg2 = t.tv_sec;
1547 restart_block->arg3 = t.tv_nsec;
1548 } 1544 }
1549 return error; 1545 return error;
1550 1546
1551} 1547}
1552 1548
1553
1554#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) 1549#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
1555#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) 1550#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
1556 1551
@@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
1594 timer->it_clock = THREAD_CLOCK; 1589 timer->it_clock = THREAD_CLOCK;
1595 return posix_cpu_timer_create(timer); 1590 return posix_cpu_timer_create(timer);
1596} 1591}
1597static int thread_cpu_nsleep(const clockid_t which_clock, int flags, 1592
1598 struct timespec *rqtp, struct timespec __user *rmtp) 1593struct k_clock clock_posix_cpu = {
1599{ 1594 .clock_getres = posix_cpu_clock_getres,
1600 return -EINVAL; 1595 .clock_set = posix_cpu_clock_set,
1601} 1596 .clock_get = posix_cpu_clock_get,
1602static long thread_cpu_nsleep_restart(struct restart_block *restart_block) 1597 .timer_create = posix_cpu_timer_create,
1603{ 1598 .nsleep = posix_cpu_nsleep,
1604 return -EINVAL; 1599 .nsleep_restart = posix_cpu_nsleep_restart,
1605} 1600 .timer_set = posix_cpu_timer_set,
1601 .timer_del = posix_cpu_timer_del,
1602 .timer_get = posix_cpu_timer_get,
1603};
1606 1604
1607static __init int init_posix_cpu_timers(void) 1605static __init int init_posix_cpu_timers(void)
1608{ 1606{
1609 struct k_clock process = { 1607 struct k_clock process = {
1610 .clock_getres = process_cpu_clock_getres, 1608 .clock_getres = process_cpu_clock_getres,
1611 .clock_get = process_cpu_clock_get, 1609 .clock_get = process_cpu_clock_get,
1612 .clock_set = do_posix_clock_nosettime, 1610 .timer_create = process_cpu_timer_create,
1613 .timer_create = process_cpu_timer_create, 1611 .nsleep = process_cpu_nsleep,
1614 .nsleep = process_cpu_nsleep, 1612 .nsleep_restart = process_cpu_nsleep_restart,
1615 .nsleep_restart = process_cpu_nsleep_restart,
1616 }; 1613 };
1617 struct k_clock thread = { 1614 struct k_clock thread = {
1618 .clock_getres = thread_cpu_clock_getres, 1615 .clock_getres = thread_cpu_clock_getres,
1619 .clock_get = thread_cpu_clock_get, 1616 .clock_get = thread_cpu_clock_get,
1620 .clock_set = do_posix_clock_nosettime, 1617 .timer_create = thread_cpu_timer_create,
1621 .timer_create = thread_cpu_timer_create,
1622 .nsleep = thread_cpu_nsleep,
1623 .nsleep_restart = thread_cpu_nsleep_restart,
1624 }; 1618 };
1625 struct timespec ts; 1619 struct timespec ts;
1626 1620
1627 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1621 posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1628 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); 1622 posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1629 1623
1630 cputime_to_timespec(cputime_one_jiffy, &ts); 1624 cputime_to_timespec(cputime_one_jiffy, &ts);
1631 onecputick = ts.tv_nsec; 1625 onecputick = ts.tv_nsec;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 93bd2eb2bc53..4c0124919f9a 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -41,6 +41,7 @@
41#include <linux/init.h> 41#include <linux/init.h>
42#include <linux/compiler.h> 42#include <linux/compiler.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/posix-clock.h>
44#include <linux/posix-timers.h> 45#include <linux/posix-timers.h>
45#include <linux/syscalls.h> 46#include <linux/syscalls.h>
46#include <linux/wait.h> 47#include <linux/wait.h>
@@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock);
81#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" 82#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
82#endif 83#endif
83 84
85/*
86 * parisc wants ENOTSUP instead of EOPNOTSUPP
87 */
88#ifndef ENOTSUP
89# define ENANOSLEEP_NOTSUP EOPNOTSUPP
90#else
91# define ENANOSLEEP_NOTSUP ENOTSUP
92#endif
84 93
85/* 94/*
86 * The timer ID is turned into a timer address by idr_find(). 95 * The timer ID is turned into a timer address by idr_find().
@@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock);
94/* 103/*
95 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us 104 * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
96 * to implement others. This structure defines the various 105 * to implement others. This structure defines the various
97 * clocks and allows the possibility of adding others. We 106 * clocks.
98 * provide an interface to add clocks to the table and expect
99 * the "arch" code to add at least one clock that is high
100 * resolution. Here we define the standard CLOCK_REALTIME as a
101 * 1/HZ resolution clock.
102 * 107 *
103 * RESOLUTION: Clock resolution is used to round up timer and interval 108 * RESOLUTION: Clock resolution is used to round up timer and interval
104 * times, NOT to report clock times, which are reported with as 109 * times, NOT to report clock times, which are reported with as
@@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock);
108 * necessary code is written. The standard says we should say 113 * necessary code is written. The standard says we should say
109 * something about this issue in the documentation... 114 * something about this issue in the documentation...
110 * 115 *
111 * FUNCTIONS: The CLOCKs structure defines possible functions to handle 116 * FUNCTIONS: The CLOCKs structure defines possible functions to
112 * various clock functions. For clocks that use the standard 117 * handle various clock functions.
113 * system timer code these entries should be NULL. This will
114 * allow dispatch without the overhead of indirect function
115 * calls. CLOCKS that depend on other sources (e.g. WWV or GPS)
116 * must supply functions here, even if the function just returns
117 * ENOSYS. The standard POSIX timer management code assumes the
118 * following: 1.) The k_itimer struct (sched.h) is used for the
119 * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
120 * fields are not modified by timer code.
121 * 118 *
122 * At this time all functions EXCEPT clock_nanosleep can be 119 * The standard POSIX timer management code assumes the
123 * redirected by the CLOCKS structure. Clock_nanosleep is in 120 * following: 1.) The k_itimer struct (sched.h) is used for
124 * there, but the code ignores it. 121 * the timer. 2.) The list, it_lock, it_clock, it_id and
122 * it_pid fields are not modified by timer code.
125 * 123 *
126 * Permissions: It is assumed that the clock_settime() function defined 124 * Permissions: It is assumed that the clock_settime() function defined
127 * for each clock will take care of permission checks. Some 125 * for each clock will take care of permission checks. Some
@@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
138 */ 136 */
139static int common_nsleep(const clockid_t, int flags, struct timespec *t, 137static int common_nsleep(const clockid_t, int flags, struct timespec *t,
140 struct timespec __user *rmtp); 138 struct timespec __user *rmtp);
139static int common_timer_create(struct k_itimer *new_timer);
141static void common_timer_get(struct k_itimer *, struct itimerspec *); 140static void common_timer_get(struct k_itimer *, struct itimerspec *);
142static int common_timer_set(struct k_itimer *, int, 141static int common_timer_set(struct k_itimer *, int,
143 struct itimerspec *, struct itimerspec *); 142 struct itimerspec *, struct itimerspec *);
@@ -158,76 +157,24 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
158 spin_unlock_irqrestore(&timr->it_lock, flags); 157 spin_unlock_irqrestore(&timr->it_lock, flags);
159} 158}
160 159
161/* 160/* Get clock_realtime */
162 * Call the k_clock hook function if non-null, or the default function. 161static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
163 */
164#define CLOCK_DISPATCH(clock, call, arglist) \
165 ((clock) < 0 ? posix_cpu_##call arglist : \
166 (posix_clocks[clock].call != NULL \
167 ? (*posix_clocks[clock].call) arglist : common_##call arglist))
168
169/*
170 * Default clock hook functions when the struct k_clock passed
171 * to register_posix_clock leaves a function pointer null.
172 *
173 * The function common_CALL is the default implementation for
174 * the function pointer CALL in struct k_clock.
175 */
176
177static inline int common_clock_getres(const clockid_t which_clock,
178 struct timespec *tp)
179{
180 tp->tv_sec = 0;
181 tp->tv_nsec = posix_clocks[which_clock].res;
182 return 0;
183}
184
185/*
186 * Get real time for posix timers
187 */
188static int common_clock_get(clockid_t which_clock, struct timespec *tp)
189{ 162{
190 ktime_get_real_ts(tp); 163 ktime_get_real_ts(tp);
191 return 0; 164 return 0;
192} 165}
193 166
194static inline int common_clock_set(const clockid_t which_clock, 167/* Set clock_realtime */
195 struct timespec *tp) 168static int posix_clock_realtime_set(const clockid_t which_clock,
169 const struct timespec *tp)
196{ 170{
197 return do_sys_settimeofday(tp, NULL); 171 return do_sys_settimeofday(tp, NULL);
198} 172}
199 173
200static int common_timer_create(struct k_itimer *new_timer) 174static int posix_clock_realtime_adj(const clockid_t which_clock,
201{ 175 struct timex *t)
202 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
203 return 0;
204}
205
206static int no_timer_create(struct k_itimer *new_timer)
207{
208 return -EOPNOTSUPP;
209}
210
211static int no_nsleep(const clockid_t which_clock, int flags,
212 struct timespec *tsave, struct timespec __user *rmtp)
213{
214 return -EOPNOTSUPP;
215}
216
217/*
218 * Return nonzero if we know a priori this clockid_t value is bogus.
219 */
220static inline int invalid_clockid(const clockid_t which_clock)
221{ 176{
222 if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ 177 return do_adjtimex(t);
223 return 0;
224 if ((unsigned) which_clock >= MAX_CLOCKS)
225 return 1;
226 if (posix_clocks[which_clock].clock_getres != NULL)
227 return 0;
228 if (posix_clocks[which_clock].res != 0)
229 return 0;
230 return 1;
231} 178}
232 179
233/* 180/*
@@ -240,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
240} 187}
241 188
242/* 189/*
243 * Get monotonic time for posix timers 190 * Get monotonic-raw time for posix timers
244 */ 191 */
245static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) 192static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
246{ 193{
@@ -267,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp
267 *tp = ktime_to_timespec(KTIME_LOW_RES); 214 *tp = ktime_to_timespec(KTIME_LOW_RES);
268 return 0; 215 return 0;
269} 216}
217
218static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
219{
220 get_monotonic_boottime(tp);
221 return 0;
222}
223
224
270/* 225/*
271 * Initialize everything, well, just everything in Posix clocks/timers ;) 226 * Initialize everything, well, just everything in Posix clocks/timers ;)
272 */ 227 */
273static __init int init_posix_timers(void) 228static __init int init_posix_timers(void)
274{ 229{
275 struct k_clock clock_realtime = { 230 struct k_clock clock_realtime = {
276 .clock_getres = hrtimer_get_res, 231 .clock_getres = hrtimer_get_res,
232 .clock_get = posix_clock_realtime_get,
233 .clock_set = posix_clock_realtime_set,
234 .clock_adj = posix_clock_realtime_adj,
235 .nsleep = common_nsleep,
236 .nsleep_restart = hrtimer_nanosleep_restart,
237 .timer_create = common_timer_create,
238 .timer_set = common_timer_set,
239 .timer_get = common_timer_get,
240 .timer_del = common_timer_del,
277 }; 241 };
278 struct k_clock clock_monotonic = { 242 struct k_clock clock_monotonic = {
279 .clock_getres = hrtimer_get_res, 243 .clock_getres = hrtimer_get_res,
280 .clock_get = posix_ktime_get_ts, 244 .clock_get = posix_ktime_get_ts,
281 .clock_set = do_posix_clock_nosettime, 245 .nsleep = common_nsleep,
246 .nsleep_restart = hrtimer_nanosleep_restart,
247 .timer_create = common_timer_create,
248 .timer_set = common_timer_set,
249 .timer_get = common_timer_get,
250 .timer_del = common_timer_del,
282 }; 251 };
283 struct k_clock clock_monotonic_raw = { 252 struct k_clock clock_monotonic_raw = {
284 .clock_getres = hrtimer_get_res, 253 .clock_getres = hrtimer_get_res,
285 .clock_get = posix_get_monotonic_raw, 254 .clock_get = posix_get_monotonic_raw,
286 .clock_set = do_posix_clock_nosettime,
287 .timer_create = no_timer_create,
288 .nsleep = no_nsleep,
289 }; 255 };
290 struct k_clock clock_realtime_coarse = { 256 struct k_clock clock_realtime_coarse = {
291 .clock_getres = posix_get_coarse_res, 257 .clock_getres = posix_get_coarse_res,
292 .clock_get = posix_get_realtime_coarse, 258 .clock_get = posix_get_realtime_coarse,
293 .clock_set = do_posix_clock_nosettime,
294 .timer_create = no_timer_create,
295 .nsleep = no_nsleep,
296 }; 259 };
297 struct k_clock clock_monotonic_coarse = { 260 struct k_clock clock_monotonic_coarse = {
298 .clock_getres = posix_get_coarse_res, 261 .clock_getres = posix_get_coarse_res,
299 .clock_get = posix_get_monotonic_coarse, 262 .clock_get = posix_get_monotonic_coarse,
300 .clock_set = do_posix_clock_nosettime, 263 };
301 .timer_create = no_timer_create, 264 struct k_clock clock_boottime = {
302 .nsleep = no_nsleep, 265 .clock_getres = hrtimer_get_res,
266 .clock_get = posix_get_boottime,
267 .nsleep = common_nsleep,
268 .nsleep_restart = hrtimer_nanosleep_restart,
269 .timer_create = common_timer_create,
270 .timer_set = common_timer_set,
271 .timer_get = common_timer_get,
272 .timer_del = common_timer_del,
303 }; 273 };
304 274
305 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 275 posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
306 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 276 posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
307 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); 277 posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
308 register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); 278 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
309 register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); 279 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
280 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
310 281
311 posix_timers_cache = kmem_cache_create("posix_timers_cache", 282 posix_timers_cache = kmem_cache_create("posix_timers_cache",
312 sizeof (struct k_itimer), 0, SLAB_PANIC, 283 sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -482,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event)
482 return task_pid(rtn); 453 return task_pid(rtn);
483} 454}
484 455
485void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) 456void posix_timers_register_clock(const clockid_t clock_id,
457 struct k_clock *new_clock)
486{ 458{
487 if ((unsigned) clock_id >= MAX_CLOCKS) { 459 if ((unsigned) clock_id >= MAX_CLOCKS) {
488 printk("POSIX clock register failed for clock_id %d\n", 460 printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
461 clock_id);
462 return;
463 }
464
465 if (!new_clock->clock_get) {
466 printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
467 clock_id);
468 return;
469 }
470 if (!new_clock->clock_getres) {
471 printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
489 clock_id); 472 clock_id);
490 return; 473 return;
491 } 474 }
492 475
493 posix_clocks[clock_id] = *new_clock; 476 posix_clocks[clock_id] = *new_clock;
494} 477}
495EXPORT_SYMBOL_GPL(register_posix_clock); 478EXPORT_SYMBOL_GPL(posix_timers_register_clock);
496 479
497static struct k_itimer * alloc_posix_timer(void) 480static struct k_itimer * alloc_posix_timer(void)
498{ 481{
@@ -523,19 +506,39 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
523 kmem_cache_free(posix_timers_cache, tmr); 506 kmem_cache_free(posix_timers_cache, tmr);
524} 507}
525 508
509static struct k_clock *clockid_to_kclock(const clockid_t id)
510{
511 if (id < 0)
512 return (id & CLOCKFD_MASK) == CLOCKFD ?
513 &clock_posix_dynamic : &clock_posix_cpu;
514
515 if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
516 return NULL;
517 return &posix_clocks[id];
518}
519
520static int common_timer_create(struct k_itimer *new_timer)
521{
522 hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
523 return 0;
524}
525
526/* Create a POSIX.1b interval timer. */ 526/* Create a POSIX.1b interval timer. */
527 527
528SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, 528SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
529 struct sigevent __user *, timer_event_spec, 529 struct sigevent __user *, timer_event_spec,
530 timer_t __user *, created_timer_id) 530 timer_t __user *, created_timer_id)
531{ 531{
532 struct k_clock *kc = clockid_to_kclock(which_clock);
532 struct k_itimer *new_timer; 533 struct k_itimer *new_timer;
533 int error, new_timer_id; 534 int error, new_timer_id;
534 sigevent_t event; 535 sigevent_t event;
535 int it_id_set = IT_ID_NOT_SET; 536 int it_id_set = IT_ID_NOT_SET;
536 537
537 if (invalid_clockid(which_clock)) 538 if (!kc)
538 return -EINVAL; 539 return -EINVAL;
540 if (!kc->timer_create)
541 return -EOPNOTSUPP;
539 542
540 new_timer = alloc_posix_timer(); 543 new_timer = alloc_posix_timer();
541 if (unlikely(!new_timer)) 544 if (unlikely(!new_timer))
@@ -597,7 +600,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
597 goto out; 600 goto out;
598 } 601 }
599 602
600 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); 603 error = kc->timer_create(new_timer);
601 if (error) 604 if (error)
602 goto out; 605 goto out;
603 606
@@ -607,7 +610,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
607 spin_unlock_irq(&current->sighand->siglock); 610 spin_unlock_irq(&current->sighand->siglock);
608 611
609 return 0; 612 return 0;
610 /* 613 /*
611 * In the case of the timer belonging to another task, after 614 * In the case of the timer belonging to another task, after
612 * the task is unlocked, the timer is owned by the other task 615 * the task is unlocked, the timer is owned by the other task
613 * and may cease to exist at any time. Don't use or modify 616 * and may cease to exist at any time. Don't use or modify
@@ -709,22 +712,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
709SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, 712SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
710 struct itimerspec __user *, setting) 713 struct itimerspec __user *, setting)
711{ 714{
712 struct k_itimer *timr;
713 struct itimerspec cur_setting; 715 struct itimerspec cur_setting;
716 struct k_itimer *timr;
717 struct k_clock *kc;
714 unsigned long flags; 718 unsigned long flags;
719 int ret = 0;
715 720
716 timr = lock_timer(timer_id, &flags); 721 timr = lock_timer(timer_id, &flags);
717 if (!timr) 722 if (!timr)
718 return -EINVAL; 723 return -EINVAL;
719 724
720 CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); 725 kc = clockid_to_kclock(timr->it_clock);
726 if (WARN_ON_ONCE(!kc || !kc->timer_get))
727 ret = -EINVAL;
728 else
729 kc->timer_get(timr, &cur_setting);
721 730
722 unlock_timer(timr, flags); 731 unlock_timer(timr, flags);
723 732
724 if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) 733 if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
725 return -EFAULT; 734 return -EFAULT;
726 735
727 return 0; 736 return ret;
728} 737}
729 738
730/* 739/*
@@ -813,6 +822,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
813 int error = 0; 822 int error = 0;
814 unsigned long flag; 823 unsigned long flag;
815 struct itimerspec *rtn = old_setting ? &old_spec : NULL; 824 struct itimerspec *rtn = old_setting ? &old_spec : NULL;
825 struct k_clock *kc;
816 826
817 if (!new_setting) 827 if (!new_setting)
818 return -EINVAL; 828 return -EINVAL;
@@ -828,8 +838,11 @@ retry:
828 if (!timr) 838 if (!timr)
829 return -EINVAL; 839 return -EINVAL;
830 840
831 error = CLOCK_DISPATCH(timr->it_clock, timer_set, 841 kc = clockid_to_kclock(timr->it_clock);
832 (timr, flags, &new_spec, rtn)); 842 if (WARN_ON_ONCE(!kc || !kc->timer_set))
843 error = -EINVAL;
844 else
845 error = kc->timer_set(timr, flags, &new_spec, rtn);
833 846
834 unlock_timer(timr, flag); 847 unlock_timer(timr, flag);
835 if (error == TIMER_RETRY) { 848 if (error == TIMER_RETRY) {
@@ -844,7 +857,7 @@ retry:
844 return error; 857 return error;
845} 858}
846 859
847static inline int common_timer_del(struct k_itimer *timer) 860static int common_timer_del(struct k_itimer *timer)
848{ 861{
849 timer->it.real.interval.tv64 = 0; 862 timer->it.real.interval.tv64 = 0;
850 863
@@ -855,7 +868,11 @@ static inline int common_timer_del(struct k_itimer *timer)
855 868
856static inline int timer_delete_hook(struct k_itimer *timer) 869static inline int timer_delete_hook(struct k_itimer *timer)
857{ 870{
858 return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); 871 struct k_clock *kc = clockid_to_kclock(timer->it_clock);
872
873 if (WARN_ON_ONCE(!kc || !kc->timer_del))
874 return -EINVAL;
875 return kc->timer_del(timer);
859} 876}
860 877
861/* Delete a POSIX.1b interval timer. */ 878/* Delete a POSIX.1b interval timer. */
@@ -927,69 +944,76 @@ void exit_itimers(struct signal_struct *sig)
927 } 944 }
928} 945}
929 946
930/* Not available / possible... functions */
931int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
932{
933 return -EINVAL;
934}
935EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
936
937int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
938 struct timespec *t, struct timespec __user *r)
939{
940#ifndef ENOTSUP
941 return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
942#else /* parisc does define it separately. */
943 return -ENOTSUP;
944#endif
945}
946EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
947
948SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, 947SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
949 const struct timespec __user *, tp) 948 const struct timespec __user *, tp)
950{ 949{
950 struct k_clock *kc = clockid_to_kclock(which_clock);
951 struct timespec new_tp; 951 struct timespec new_tp;
952 952
953 if (invalid_clockid(which_clock)) 953 if (!kc || !kc->clock_set)
954 return -EINVAL; 954 return -EINVAL;
955
955 if (copy_from_user(&new_tp, tp, sizeof (*tp))) 956 if (copy_from_user(&new_tp, tp, sizeof (*tp)))
956 return -EFAULT; 957 return -EFAULT;
957 958
958 return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); 959 return kc->clock_set(which_clock, &new_tp);
959} 960}
960 961
961SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, 962SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
962 struct timespec __user *,tp) 963 struct timespec __user *,tp)
963{ 964{
965 struct k_clock *kc = clockid_to_kclock(which_clock);
964 struct timespec kernel_tp; 966 struct timespec kernel_tp;
965 int error; 967 int error;
966 968
967 if (invalid_clockid(which_clock)) 969 if (!kc)
968 return -EINVAL; 970 return -EINVAL;
969 error = CLOCK_DISPATCH(which_clock, clock_get, 971
970 (which_clock, &kernel_tp)); 972 error = kc->clock_get(which_clock, &kernel_tp);
973
971 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) 974 if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
972 error = -EFAULT; 975 error = -EFAULT;
973 976
974 return error; 977 return error;
978}
979
980SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
981 struct timex __user *, utx)
982{
983 struct k_clock *kc = clockid_to_kclock(which_clock);
984 struct timex ktx;
985 int err;
986
987 if (!kc)
988 return -EINVAL;
989 if (!kc->clock_adj)
990 return -EOPNOTSUPP;
991
992 if (copy_from_user(&ktx, utx, sizeof(ktx)))
993 return -EFAULT;
994
995 err = kc->clock_adj(which_clock, &ktx);
996
997 if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
998 return -EFAULT;
975 999
1000 return err;
976} 1001}
977 1002
978SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, 1003SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
979 struct timespec __user *, tp) 1004 struct timespec __user *, tp)
980{ 1005{
1006 struct k_clock *kc = clockid_to_kclock(which_clock);
981 struct timespec rtn_tp; 1007 struct timespec rtn_tp;
982 int error; 1008 int error;
983 1009
984 if (invalid_clockid(which_clock)) 1010 if (!kc)
985 return -EINVAL; 1011 return -EINVAL;
986 1012
987 error = CLOCK_DISPATCH(which_clock, clock_getres, 1013 error = kc->clock_getres(which_clock, &rtn_tp);
988 (which_clock, &rtn_tp));
989 1014
990 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { 1015 if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
991 error = -EFAULT; 1016 error = -EFAULT;
992 }
993 1017
994 return error; 1018 return error;
995} 1019}
@@ -1009,10 +1033,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1009 const struct timespec __user *, rqtp, 1033 const struct timespec __user *, rqtp,
1010 struct timespec __user *, rmtp) 1034 struct timespec __user *, rmtp)
1011{ 1035{
1036 struct k_clock *kc = clockid_to_kclock(which_clock);
1012 struct timespec t; 1037 struct timespec t;
1013 1038
1014 if (invalid_clockid(which_clock)) 1039 if (!kc)
1015 return -EINVAL; 1040 return -EINVAL;
1041 if (!kc->nsleep)
1042 return -ENANOSLEEP_NOTSUP;
1016 1043
1017 if (copy_from_user(&t, rqtp, sizeof (struct timespec))) 1044 if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
1018 return -EFAULT; 1045 return -EFAULT;
@@ -1020,27 +1047,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
1020 if (!timespec_valid(&t)) 1047 if (!timespec_valid(&t))
1021 return -EINVAL; 1048 return -EINVAL;
1022 1049
1023 return CLOCK_DISPATCH(which_clock, nsleep, 1050 return kc->nsleep(which_clock, flags, &t, rmtp);
1024 (which_clock, flags, &t, rmtp));
1025}
1026
1027/*
1028 * nanosleep_restart for monotonic and realtime clocks
1029 */
1030static int common_nsleep_restart(struct restart_block *restart_block)
1031{
1032 return hrtimer_nanosleep_restart(restart_block);
1033} 1051}
1034 1052
1035/* 1053/*
1036 * This will restart clock_nanosleep. This is required only by 1054 * This will restart clock_nanosleep. This is required only by
1037 * compat_clock_nanosleep_restart for now. 1055 * compat_clock_nanosleep_restart for now.
1038 */ 1056 */
1039long 1057long clock_nanosleep_restart(struct restart_block *restart_block)
1040clock_nanosleep_restart(struct restart_block *restart_block)
1041{ 1058{
1042 clockid_t which_clock = restart_block->arg0; 1059 clockid_t which_clock = restart_block->nanosleep.index;
1060 struct k_clock *kc = clockid_to_kclock(which_clock);
1061
1062 if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
1063 return -EINVAL;
1043 1064
1044 return CLOCK_DISPATCH(which_clock, nsleep_restart, 1065 return kc->nsleep_restart(restart_block);
1045 (restart_block));
1046} 1066}
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a23a57a976d1..f3240e987928 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -214,11 +214,12 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
214 * Ensure that queued callbacks are all executed. 214 * Ensure that queued callbacks are all executed.
215 * If we detect that we are nested in a RCU read-side critical 215 * If we detect that we are nested in a RCU read-side critical
216 * section, we should simply fail, otherwise we would deadlock. 216 * section, we should simply fail, otherwise we would deadlock.
217 * Note that the machinery to reliably determine whether
218 * or not we are in an RCU read-side critical section
219 * exists only in the preemptible RCU implementations
220 * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why
221 * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT.
217 */ 222 */
218#ifndef CONFIG_PREEMPT
219 WARN_ON(1);
220 return 0;
221#else
222 if (rcu_preempt_depth() != 0 || preempt_count() != 0 || 223 if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
223 irqs_disabled()) { 224 irqs_disabled()) {
224 WARN_ON(1); 225 WARN_ON(1);
@@ -229,7 +230,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
229 rcu_barrier_bh(); 230 rcu_barrier_bh();
230 debug_object_free(head, &rcuhead_debug_descr); 231 debug_object_free(head, &rcuhead_debug_descr);
231 return 1; 232 return 1;
232#endif
233 default: 233 default:
234 return 0; 234 return 0;
235 } 235 }
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 015abaea962a..3cb8e362e883 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -852,7 +852,7 @@ void exit_rcu(void)
852 if (t->rcu_read_lock_nesting == 0) 852 if (t->rcu_read_lock_nesting == 0)
853 return; 853 return;
854 t->rcu_read_lock_nesting = 1; 854 t->rcu_read_lock_nesting = 1;
855 rcu_read_unlock(); 855 __rcu_read_unlock();
856} 856}
857 857
858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ 858#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 89613f97ff26..c224da41890c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,7 +47,6 @@
47#include <linux/srcu.h> 47#include <linux/srcu.h>
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include <asm/byteorder.h> 49#include <asm/byteorder.h>
50#include <linux/sched.h>
51 50
52MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index ddabb54bb5c8..3c7cbc2c33be 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
215 put_pid(waiter->deadlock_task_pid); 215 put_pid(waiter->deadlock_task_pid);
216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
218 TRACE_WARN_ON(waiter->task);
219 memset(waiter, 0x22, sizeof(*waiter)); 218 memset(waiter, 0x22, sizeof(*waiter));
220} 219}
221 220
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 66cb89bc5ef1..5c9ccd380966 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -9,7 +9,6 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/smp_lock.h>
13#include <linux/spinlock.h> 12#include <linux/spinlock.h>
14#include <linux/sysdev.h> 13#include <linux/sysdev.h>
15#include <linux/timer.h> 14#include <linux/timer.h>
@@ -27,7 +26,6 @@ struct test_thread_data {
27 int opcode; 26 int opcode;
28 int opdata; 27 int opdata;
29 int mutexes[MAX_RT_TEST_MUTEXES]; 28 int mutexes[MAX_RT_TEST_MUTEXES];
30 int bkl;
31 int event; 29 int event;
32 struct sys_device sysdev; 30 struct sys_device sysdev;
33}; 31};
@@ -46,9 +44,8 @@ enum test_opcodes {
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ 44 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ 45 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ 46 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 RTTEST_LOCKBKL, /* 9 Lock BKL */ 47 /* 9, 10 - reserved for BKL commemoration */
50 RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ 48 RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
51 RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
52 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ 49 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
53 RTTEST_RESET = 99, /* 99 Reset all pending operations */ 50 RTTEST_RESET = 99, /* 99 Reset all pending operations */
54}; 51};
@@ -74,13 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
74 td->mutexes[i] = 0; 71 td->mutexes[i] = 0;
75 } 72 }
76 } 73 }
77
78 if (!lockwakeup && td->bkl == 4) {
79#ifdef CONFIG_LOCK_KERNEL
80 unlock_kernel();
81#endif
82 td->bkl = 0;
83 }
84 return 0; 74 return 0;
85 75
86 case RTTEST_RESETEVENT: 76 case RTTEST_RESETEVENT:
@@ -131,25 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
131 td->mutexes[id] = 0; 121 td->mutexes[id] = 0;
132 return 0; 122 return 0;
133 123
134 case RTTEST_LOCKBKL:
135 if (td->bkl)
136 return 0;
137 td->bkl = 1;
138#ifdef CONFIG_LOCK_KERNEL
139 lock_kernel();
140#endif
141 td->bkl = 4;
142 return 0;
143
144 case RTTEST_UNLOCKBKL:
145 if (td->bkl != 4)
146 break;
147#ifdef CONFIG_LOCK_KERNEL
148 unlock_kernel();
149#endif
150 td->bkl = 0;
151 return 0;
152
153 default: 124 default:
154 break; 125 break;
155 } 126 }
@@ -196,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
196 td->event = atomic_add_return(1, &rttest_event); 167 td->event = atomic_add_return(1, &rttest_event);
197 break; 168 break;
198 169
199 case RTTEST_LOCKBKL:
200 default: 170 default:
201 break; 171 break;
202 } 172 }
@@ -229,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
229 td->event = atomic_add_return(1, &rttest_event); 199 td->event = atomic_add_return(1, &rttest_event);
230 return; 200 return;
231 201
232 case RTTEST_LOCKBKL:
233 return;
234 default: 202 default:
235 return; 203 return;
236 } 204 }
@@ -380,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
380 spin_lock(&rttest_lock); 348 spin_lock(&rttest_lock);
381 349
382 curr += sprintf(curr, 350 curr += sprintf(curr,
383 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", 351 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
384 td->opcode, td->event, tsk->state, 352 td->opcode, td->event, tsk->state,
385 (MAX_RT_PRIO - 1) - tsk->prio, 353 (MAX_RT_PRIO - 1) - tsk->prio,
386 (MAX_RT_PRIO - 1) - tsk->normal_prio, 354 (MAX_RT_PRIO - 1) - tsk->normal_prio,
387 tsk->pi_blocked_on, td->bkl); 355 tsk->pi_blocked_on);
388 356
389 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) 357 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
390 curr += sprintf(curr, "%d", td->mutexes[i]); 358 curr += sprintf(curr, "%d", td->mutexes[i]);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a9604815786a..ab449117aaf2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -20,41 +20,34 @@
20/* 20/*
21 * lock->owner state tracking: 21 * lock->owner state tracking:
22 * 22 *
23 * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 23 * lock->owner holds the task_struct pointer of the owner. Bit 0
24 * are used to keep track of the "owner is pending" and "lock has 24 * is used to keep track of the "lock has waiters" state.
25 * waiters" state.
26 * 25 *
27 * owner bit1 bit0 26 * owner bit0
28 * NULL 0 0 lock is free (fast acquire possible) 27 * NULL 0 lock is free (fast acquire possible)
29 * NULL 0 1 invalid state 28 * NULL 1 lock is free and has waiters and the top waiter
30 * NULL 1 0 Transitional State* 29 * is going to take the lock*
31 * NULL 1 1 invalid state 30 * taskpointer 0 lock is held (fast release possible)
32 * taskpointer 0 0 lock is held (fast release possible) 31 * taskpointer 1 lock is held and has waiters**
33 * taskpointer 0 1 task is pending owner
34 * taskpointer 1 0 lock is held and has waiters
35 * taskpointer 1 1 task is pending owner and lock has more waiters
36 *
37 * Pending ownership is assigned to the top (highest priority)
38 * waiter of the lock, when the lock is released. The thread is woken
39 * up and can now take the lock. Until the lock is taken (bit 0
40 * cleared) a competing higher priority thread can steal the lock
41 * which puts the woken up thread back on the waiters list.
42 * 32 *
43 * The fast atomic compare exchange based acquire and release is only 33 * The fast atomic compare exchange based acquire and release is only
44 * possible when bit 0 and 1 of lock->owner are 0. 34 * possible when bit 0 of lock->owner is 0.
35 *
36 * (*) It also can be a transitional state when grabbing the lock
37 * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
38 * we need to set the bit0 before looking at the lock, and the owner may be
39 * NULL in this small time, hence this can be a transitional state.
45 * 40 *
46 * (*) There's a small time where the owner can be NULL and the 41 * (**) There is a small time when bit 0 is set but there are no
47 * "lock has waiters" bit is set. This can happen when grabbing the lock. 42 * waiters. This can happen when grabbing the lock in the slow path.
48 * To prevent a cmpxchg of the owner releasing the lock, we need to set this 43 * To prevent a cmpxchg of the owner releasing the lock, we need to
49 * bit before looking at the lock, hence the reason this is a transitional 44 * set this bit before looking at the lock.
50 * state.
51 */ 45 */
52 46
53static void 47static void
54rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, 48rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
55 unsigned long mask)
56{ 49{
57 unsigned long val = (unsigned long)owner | mask; 50 unsigned long val = (unsigned long)owner;
58 51
59 if (rt_mutex_has_waiters(lock)) 52 if (rt_mutex_has_waiters(lock))
60 val |= RT_MUTEX_HAS_WAITERS; 53 val |= RT_MUTEX_HAS_WAITERS;
@@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
203 * reached or the state of the chain has changed while we 196 * reached or the state of the chain has changed while we
204 * dropped the locks. 197 * dropped the locks.
205 */ 198 */
206 if (!waiter || !waiter->task) 199 if (!waiter)
207 goto out_unlock_pi; 200 goto out_unlock_pi;
208 201
209 /* 202 /*
210 * Check the orig_waiter state. After we dropped the locks, 203 * Check the orig_waiter state. After we dropped the locks,
211 * the previous owner of the lock might have released the lock 204 * the previous owner of the lock might have released the lock.
212 * and made us the pending owner:
213 */ 205 */
214 if (orig_waiter && !orig_waiter->task) 206 if (orig_waiter && !rt_mutex_owner(orig_lock))
215 goto out_unlock_pi; 207 goto out_unlock_pi;
216 208
217 /* 209 /*
@@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
254 246
255 /* Release the task */ 247 /* Release the task */
256 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 248 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
249 if (!rt_mutex_owner(lock)) {
250 /*
251 * If the requeue above changed the top waiter, then we need
252 * to wake the new top waiter up to try to get the lock.
253 */
254
255 if (top_waiter != rt_mutex_top_waiter(lock))
256 wake_up_process(rt_mutex_top_waiter(lock)->task);
257 raw_spin_unlock(&lock->wait_lock);
258 goto out_put_task;
259 }
257 put_task_struct(task); 260 put_task_struct(task);
258 261
259 /* Grab the next task */ 262 /* Grab the next task */
@@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
296} 299}
297 300
298/* 301/*
299 * Optimization: check if we can steal the lock from the
300 * assigned pending owner [which might not have taken the
301 * lock yet]:
302 */
303static inline int try_to_steal_lock(struct rt_mutex *lock,
304 struct task_struct *task)
305{
306 struct task_struct *pendowner = rt_mutex_owner(lock);
307 struct rt_mutex_waiter *next;
308 unsigned long flags;
309
310 if (!rt_mutex_owner_pending(lock))
311 return 0;
312
313 if (pendowner == task)
314 return 1;
315
316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) {
318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0;
320 }
321
322 /*
323 * Check if a waiter is enqueued on the pending owners
324 * pi_waiters list. Remove it and readjust pending owners
325 * priority.
326 */
327 if (likely(!rt_mutex_has_waiters(lock))) {
328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1;
330 }
331
332 /* No chain handling, pending owner is not blocked on anything: */
333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner);
336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337
338 /*
339 * We are going to steal the lock and a waiter was
340 * enqueued on the pending owners pi_waiters queue. So
341 * we have to enqueue this waiter into
342 * task->pi_waiters list. This covers the case,
343 * where task is boosted because it holds another
344 * lock and gets unboosted because the booster is
345 * interrupted, so we would delay a waiter with higher
346 * priority as task->normal_prio.
347 *
348 * Note: in the rare case of a SCHED_OTHER task changing
349 * its priority and thus stealing the lock, next->task
350 * might be task:
351 */
352 if (likely(next->task != task)) {
353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task);
356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 }
358 return 1;
359}
360
361/*
362 * Try to take an rt-mutex 302 * Try to take an rt-mutex
363 * 303 *
364 * This fails
365 * - when the lock has a real owner
366 * - when a different pending owner exists and has higher priority than current
367 *
368 * Must be called with lock->wait_lock held. 304 * Must be called with lock->wait_lock held.
305 *
306 * @lock: the lock to be acquired.
307 * @task: the task which wants to acquire the lock
308 * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
369 */ 309 */
370static int try_to_take_rt_mutex(struct rt_mutex *lock) 310static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
311 struct rt_mutex_waiter *waiter)
371{ 312{
372 /* 313 /*
373 * We have to be careful here if the atomic speedups are 314 * We have to be careful here if the atomic speedups are
@@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
390 */ 331 */
391 mark_rt_mutex_waiters(lock); 332 mark_rt_mutex_waiters(lock);
392 333
393 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) 334 if (rt_mutex_owner(lock))
394 return 0; 335 return 0;
395 336
337 /*
338 * It will get the lock because of one of these conditions:
339 * 1) there is no waiter
340 * 2) higher priority than waiters
341 * 3) it is top waiter
342 */
343 if (rt_mutex_has_waiters(lock)) {
344 if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
345 if (!waiter || waiter != rt_mutex_top_waiter(lock))
346 return 0;
347 }
348 }
349
350 if (waiter || rt_mutex_has_waiters(lock)) {
351 unsigned long flags;
352 struct rt_mutex_waiter *top;
353
354 raw_spin_lock_irqsave(&task->pi_lock, flags);
355
356 /* remove the queued waiter. */
357 if (waiter) {
358 plist_del(&waiter->list_entry, &lock->wait_list);
359 task->pi_blocked_on = NULL;
360 }
361
362 /*
363 * We have to enqueue the top waiter(if it exists) into
364 * task->pi_waiters list.
365 */
366 if (rt_mutex_has_waiters(lock)) {
367 top = rt_mutex_top_waiter(lock);
368 top->pi_list_entry.prio = top->list_entry.prio;
369 plist_add(&top->pi_list_entry, &task->pi_waiters);
370 }
371 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
372 }
373
396 /* We got the lock. */ 374 /* We got the lock. */
397 debug_rt_mutex_lock(lock); 375 debug_rt_mutex_lock(lock);
398 376
399 rt_mutex_set_owner(lock, current, 0); 377 rt_mutex_set_owner(lock, task);
400 378
401 rt_mutex_deadlock_account_lock(lock, current); 379 rt_mutex_deadlock_account_lock(lock, task);
402 380
403 return 1; 381 return 1;
404} 382}
@@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
436 414
437 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 415 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 416
417 if (!owner)
418 return 0;
419
439 if (waiter == rt_mutex_top_waiter(lock)) { 420 if (waiter == rt_mutex_top_waiter(lock)) {
440 raw_spin_lock_irqsave(&owner->pi_lock, flags); 421 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 422 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
@@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
472/* 453/*
473 * Wake up the next waiter on the lock. 454 * Wake up the next waiter on the lock.
474 * 455 *
475 * Remove the top waiter from the current tasks waiter list and from 456 * Remove the top waiter from the current tasks waiter list and wake it up.
476 * the lock waiter list. Set it as pending owner. Then wake it up.
477 * 457 *
478 * Called with lock->wait_lock held. 458 * Called with lock->wait_lock held.
479 */ 459 */
480static void wakeup_next_waiter(struct rt_mutex *lock) 460static void wakeup_next_waiter(struct rt_mutex *lock)
481{ 461{
482 struct rt_mutex_waiter *waiter; 462 struct rt_mutex_waiter *waiter;
483 struct task_struct *pendowner;
484 unsigned long flags; 463 unsigned long flags;
485 464
486 raw_spin_lock_irqsave(&current->pi_lock, flags); 465 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 466
488 waiter = rt_mutex_top_waiter(lock); 467 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list);
490 468
491 /* 469 /*
492 * Remove it from current->pi_waiters. We do not adjust a 470 * Remove it from current->pi_waiters. We do not adjust a
@@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
495 * lock->wait_lock. 473 * lock->wait_lock.
496 */ 474 */
497 plist_del(&waiter->pi_list_entry, &current->pi_waiters); 475 plist_del(&waiter->pi_list_entry, &current->pi_waiters);
498 pendowner = waiter->task;
499 waiter->task = NULL;
500 476
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 477 rt_mutex_set_owner(lock, NULL);
502 478
503 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 479 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 480
505 /* 481 wake_up_process(waiter->task);
506 * Clear the pi_blocked_on variable and enqueue a possible
507 * waiter into the pi_waiters list of the pending owner. This
508 * prevents that in case the pending owner gets unboosted a
509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner.
511 */
512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513
514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter);
516 WARN_ON(pendowner->pi_blocked_on->lock != lock);
517
518 pendowner->pi_blocked_on = NULL;
519
520 if (rt_mutex_has_waiters(lock)) {
521 struct rt_mutex_waiter *next;
522
523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 }
526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527
528 wake_up_process(pendowner);
529} 482}
530 483
531/* 484/*
532 * Remove a waiter from a lock 485 * Remove a waiter from a lock and give up
533 * 486 *
534 * Must be called with lock->wait_lock held 487 * Must be called with lock->wait_lock held and
488 * have just failed to try_to_take_rt_mutex().
535 */ 489 */
536static void remove_waiter(struct rt_mutex *lock, 490static void remove_waiter(struct rt_mutex *lock,
537 struct rt_mutex_waiter *waiter) 491 struct rt_mutex_waiter *waiter)
@@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock,
543 497
544 raw_spin_lock_irqsave(&current->pi_lock, flags); 498 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 499 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 500 current->pi_blocked_on = NULL;
548 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 501 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 502
550 if (first && owner != current) { 503 if (!owner)
504 return;
505
506 if (first) {
551 507
552 raw_spin_lock_irqsave(&owner->pi_lock, flags); 508 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 509
@@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task)
614 * or TASK_UNINTERRUPTIBLE) 570 * or TASK_UNINTERRUPTIBLE)
615 * @timeout: the pre-initialized and started timer, or NULL for none 571 * @timeout: the pre-initialized and started timer, or NULL for none
616 * @waiter: the pre-initialized rt_mutex_waiter 572 * @waiter: the pre-initialized rt_mutex_waiter
617 * @detect_deadlock: passed to task_blocks_on_rt_mutex
618 * 573 *
619 * lock->wait_lock must be held by the caller. 574 * lock->wait_lock must be held by the caller.
620 */ 575 */
621static int __sched 576static int __sched
622__rt_mutex_slowlock(struct rt_mutex *lock, int state, 577__rt_mutex_slowlock(struct rt_mutex *lock, int state,
623 struct hrtimer_sleeper *timeout, 578 struct hrtimer_sleeper *timeout,
624 struct rt_mutex_waiter *waiter, 579 struct rt_mutex_waiter *waiter)
625 int detect_deadlock)
626{ 580{
627 int ret = 0; 581 int ret = 0;
628 582
629 for (;;) { 583 for (;;) {
630 /* Try to acquire the lock: */ 584 /* Try to acquire the lock: */
631 if (try_to_take_rt_mutex(lock)) 585 if (try_to_take_rt_mutex(lock, current, waiter))
632 break; 586 break;
633 587
634 /* 588 /*
@@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
645 break; 599 break;
646 } 600 }
647 601
648 /*
649 * waiter->task is NULL the first time we come here and
650 * when we have been woken up by the previous owner
651 * but the lock got stolen by a higher prio task.
652 */
653 if (!waiter->task) {
654 ret = task_blocks_on_rt_mutex(lock, waiter, current,
655 detect_deadlock);
656 /*
657 * If we got woken up by the owner then start loop
658 * all over without going into schedule to try
659 * to get the lock now:
660 */
661 if (unlikely(!waiter->task)) {
662 /*
663 * Reset the return value. We might
664 * have returned with -EDEADLK and the
665 * owner released the lock while we
666 * were walking the pi chain.
667 */
668 ret = 0;
669 continue;
670 }
671 if (unlikely(ret))
672 break;
673 }
674
675 raw_spin_unlock(&lock->wait_lock); 602 raw_spin_unlock(&lock->wait_lock);
676 603
677 debug_rt_mutex_print_deadlock(waiter); 604 debug_rt_mutex_print_deadlock(waiter);
678 605
679 if (waiter->task) 606 schedule_rt_mutex(lock);
680 schedule_rt_mutex(lock);
681 607
682 raw_spin_lock(&lock->wait_lock); 608 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 609 set_current_state(state);
@@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
698 int ret = 0; 624 int ret = 0;
699 625
700 debug_rt_mutex_init_waiter(&waiter); 626 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL;
702 627
703 raw_spin_lock(&lock->wait_lock); 628 raw_spin_lock(&lock->wait_lock);
704 629
705 /* Try to acquire the lock again: */ 630 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 631 if (try_to_take_rt_mutex(lock, current, NULL)) {
707 raw_spin_unlock(&lock->wait_lock); 632 raw_spin_unlock(&lock->wait_lock);
708 return 0; 633 return 0;
709 } 634 }
@@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
717 timeout->task = NULL; 642 timeout->task = NULL;
718 } 643 }
719 644
720 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, 645 ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
721 detect_deadlock); 646
647 if (likely(!ret))
648 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
722 649
723 set_current_state(TASK_RUNNING); 650 set_current_state(TASK_RUNNING);
724 651
725 if (unlikely(waiter.task)) 652 if (unlikely(ret))
726 remove_waiter(lock, &waiter); 653 remove_waiter(lock, &waiter);
727 654
728 /* 655 /*
@@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
737 if (unlikely(timeout)) 664 if (unlikely(timeout))
738 hrtimer_cancel(&timeout->timer); 665 hrtimer_cancel(&timeout->timer);
739 666
740 /*
741 * Readjust priority, when we did not get the lock. We might
742 * have been the pending owner and boosted. Since we did not
743 * take the lock, the PI boost has to go.
744 */
745 if (unlikely(ret))
746 rt_mutex_adjust_prio(current);
747
748 debug_rt_mutex_free_waiter(&waiter); 667 debug_rt_mutex_free_waiter(&waiter);
749 668
750 return ret; 669 return ret;
@@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
762 681
763 if (likely(rt_mutex_owner(lock) != current)) { 682 if (likely(rt_mutex_owner(lock) != current)) {
764 683
765 ret = try_to_take_rt_mutex(lock); 684 ret = try_to_take_rt_mutex(lock, current, NULL);
766 /* 685 /*
767 * try_to_take_rt_mutex() sets the lock waiters 686 * try_to_take_rt_mutex() sets the lock waiters
768 * bit unconditionally. Clean this up. 687 * bit unconditionally. Clean this up.
@@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
992{ 911{
993 __rt_mutex_init(lock, NULL); 912 __rt_mutex_init(lock, NULL);
994 debug_rt_mutex_proxy_lock(lock, proxy_owner); 913 debug_rt_mutex_proxy_lock(lock, proxy_owner);
995 rt_mutex_set_owner(lock, proxy_owner, 0); 914 rt_mutex_set_owner(lock, proxy_owner);
996 rt_mutex_deadlock_account_lock(lock, proxy_owner); 915 rt_mutex_deadlock_account_lock(lock, proxy_owner);
997} 916}
998 917
@@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
1008 struct task_struct *proxy_owner) 927 struct task_struct *proxy_owner)
1009{ 928{
1010 debug_rt_mutex_proxy_unlock(lock); 929 debug_rt_mutex_proxy_unlock(lock);
1011 rt_mutex_set_owner(lock, NULL, 0); 930 rt_mutex_set_owner(lock, NULL);
1012 rt_mutex_deadlock_account_unlock(proxy_owner); 931 rt_mutex_deadlock_account_unlock(proxy_owner);
1013} 932}
1014 933
@@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1034 953
1035 raw_spin_lock(&lock->wait_lock); 954 raw_spin_lock(&lock->wait_lock);
1036 955
1037 mark_rt_mutex_waiters(lock); 956 if (try_to_take_rt_mutex(lock, task, NULL)) {
1038
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0);
1043 raw_spin_unlock(&lock->wait_lock); 957 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 958 return 1;
1046 } 959 }
1047 960
1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 961 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1049 962
1050 if (ret && !waiter->task) { 963 if (ret && !rt_mutex_owner(lock)) {
1051 /* 964 /*
1052 * Reset the return value. We might have 965 * Reset the return value. We might have
1053 * returned with -EDEADLK and the owner 966 * returned with -EDEADLK and the owner
@@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 969 */
1057 ret = 0; 970 ret = 0;
1058 } 971 }
972
973 if (unlikely(ret))
974 remove_waiter(lock, waiter);
975
1059 raw_spin_unlock(&lock->wait_lock); 976 raw_spin_unlock(&lock->wait_lock);
1060 977
1061 debug_rt_mutex_print_deadlock(waiter); 978 debug_rt_mutex_print_deadlock(waiter);
@@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1110 1027
1111 set_current_state(TASK_INTERRUPTIBLE); 1028 set_current_state(TASK_INTERRUPTIBLE);
1112 1029
1113 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, 1030 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
1114 detect_deadlock);
1115 1031
1116 set_current_state(TASK_RUNNING); 1032 set_current_state(TASK_RUNNING);
1117 1033
1118 if (unlikely(waiter->task)) 1034 if (unlikely(ret))
1119 remove_waiter(lock, waiter); 1035 remove_waiter(lock, waiter);
1120 1036
1121 /* 1037 /*
@@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1126 1042
1127 raw_spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1128 1044
1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been
1131 * the pending owner and boosted. Since we did not take the lock, the
1132 * PI boost has to go.
1133 */
1134 if (unlikely(ret))
1135 rt_mutex_adjust_prio(current);
1136
1137 return ret; 1045 return ret;
1138} 1046}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 97a2f81866af..53a66c85261b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p)
91/* 91/*
92 * lock->owner state tracking: 92 * lock->owner state tracking:
93 */ 93 */
94#define RT_MUTEX_OWNER_PENDING 1UL 94#define RT_MUTEX_HAS_WAITERS 1UL
95#define RT_MUTEX_HAS_WAITERS 2UL 95#define RT_MUTEX_OWNER_MASKALL 1UL
96#define RT_MUTEX_OWNER_MASKALL 3UL
97 96
98static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) 97static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
99{ 98{
@@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
101 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); 100 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
102} 101}
103 102
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{
106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108}
109
110static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
111{
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113}
114
115/* 103/*
116 * PI-futex support (proxy locking functions, etc.): 104 * PI-futex support (proxy locking functions, etc.):
117 */ 105 */
diff --git a/kernel/sched.c b/kernel/sched.c
index 42eab5a8437d..c8e40b7005c0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -324,7 +324,7 @@ struct cfs_rq {
324 * 'curr' points to currently running entity on this cfs_rq. 324 * 'curr' points to currently running entity on this cfs_rq.
325 * It is set to NULL otherwise (i.e when none are currently running). 325 * It is set to NULL otherwise (i.e when none are currently running).
326 */ 326 */
327 struct sched_entity *curr, *next, *last; 327 struct sched_entity *curr, *next, *last, *skip;
328 328
329 unsigned int nr_spread_over; 329 unsigned int nr_spread_over;
330 330
@@ -606,9 +606,6 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct task_group *tg; 606 struct task_group *tg;
607 struct cgroup_subsys_state *css; 607 struct cgroup_subsys_state *css;
608 608
609 if (p->flags & PF_EXITING)
610 return &root_task_group;
611
612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 609 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
613 lockdep_is_held(&task_rq(p)->lock)); 610 lockdep_is_held(&task_rq(p)->lock));
614 tg = container_of(css, struct task_group, css); 611 tg = container_of(css, struct task_group, css);
@@ -1686,6 +1683,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1686 __release(rq2->lock); 1683 __release(rq2->lock);
1687} 1684}
1688 1685
1686#else /* CONFIG_SMP */
1687
1688/*
1689 * double_rq_lock - safely lock two runqueues
1690 *
1691 * Note this does not disable interrupts like task_rq_lock,
1692 * you need to do so manually before calling.
1693 */
1694static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1695 __acquires(rq1->lock)
1696 __acquires(rq2->lock)
1697{
1698 BUG_ON(!irqs_disabled());
1699 BUG_ON(rq1 != rq2);
1700 raw_spin_lock(&rq1->lock);
1701 __acquire(rq2->lock); /* Fake it out ;) */
1702}
1703
1704/*
1705 * double_rq_unlock - safely unlock two runqueues
1706 *
1707 * Note this does not restore interrupts like task_rq_unlock,
1708 * you need to do so manually after calling.
1709 */
1710static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1711 __releases(rq1->lock)
1712 __releases(rq2->lock)
1713{
1714 BUG_ON(rq1 != rq2);
1715 raw_spin_unlock(&rq1->lock);
1716 __release(rq2->lock);
1717}
1718
1689#endif 1719#endif
1690 1720
1691static void calc_load_account_idle(struct rq *this_rq); 1721static void calc_load_account_idle(struct rq *this_rq);
@@ -1880,7 +1910,7 @@ void account_system_vtime(struct task_struct *curr)
1880 */ 1910 */
1881 if (hardirq_count()) 1911 if (hardirq_count())
1882 __this_cpu_add(cpu_hardirq_time, delta); 1912 __this_cpu_add(cpu_hardirq_time, delta);
1883 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) 1913 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1884 __this_cpu_add(cpu_softirq_time, delta); 1914 __this_cpu_add(cpu_softirq_time, delta);
1885 1915
1886 irq_time_write_end(); 1916 irq_time_write_end();
@@ -1920,8 +1950,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
1920 sched_rt_avg_update(rq, irq_delta); 1950 sched_rt_avg_update(rq, irq_delta);
1921} 1951}
1922 1952
1953static int irqtime_account_hi_update(void)
1954{
1955 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1956 unsigned long flags;
1957 u64 latest_ns;
1958 int ret = 0;
1959
1960 local_irq_save(flags);
1961 latest_ns = this_cpu_read(cpu_hardirq_time);
1962 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
1963 ret = 1;
1964 local_irq_restore(flags);
1965 return ret;
1966}
1967
1968static int irqtime_account_si_update(void)
1969{
1970 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1971 unsigned long flags;
1972 u64 latest_ns;
1973 int ret = 0;
1974
1975 local_irq_save(flags);
1976 latest_ns = this_cpu_read(cpu_softirq_time);
1977 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
1978 ret = 1;
1979 local_irq_restore(flags);
1980 return ret;
1981}
1982
1923#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 1983#else /* CONFIG_IRQ_TIME_ACCOUNTING */
1924 1984
1985#define sched_clock_irqtime (0)
1986
1925static void update_rq_clock_task(struct rq *rq, s64 delta) 1987static void update_rq_clock_task(struct rq *rq, s64 delta)
1926{ 1988{
1927 rq->clock_task += delta; 1989 rq->clock_task += delta;
@@ -2025,14 +2087,14 @@ inline int task_curr(const struct task_struct *p)
2025 2087
2026static inline void check_class_changed(struct rq *rq, struct task_struct *p, 2088static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2027 const struct sched_class *prev_class, 2089 const struct sched_class *prev_class,
2028 int oldprio, int running) 2090 int oldprio)
2029{ 2091{
2030 if (prev_class != p->sched_class) { 2092 if (prev_class != p->sched_class) {
2031 if (prev_class->switched_from) 2093 if (prev_class->switched_from)
2032 prev_class->switched_from(rq, p, running); 2094 prev_class->switched_from(rq, p);
2033 p->sched_class->switched_to(rq, p, running); 2095 p->sched_class->switched_to(rq, p);
2034 } else 2096 } else if (oldprio != p->prio)
2035 p->sched_class->prio_changed(rq, p, oldprio, running); 2097 p->sched_class->prio_changed(rq, p, oldprio);
2036} 2098}
2037 2099
2038static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 2100static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2224,7 +2286,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2224 * yield - it could be a while. 2286 * yield - it could be a while.
2225 */ 2287 */
2226 if (unlikely(on_rq)) { 2288 if (unlikely(on_rq)) {
2227 schedule_timeout_uninterruptible(1); 2289 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2290
2291 set_current_state(TASK_UNINTERRUPTIBLE);
2292 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2228 continue; 2293 continue;
2229 } 2294 }
2230 2295
@@ -2265,27 +2330,6 @@ void kick_process(struct task_struct *p)
2265EXPORT_SYMBOL_GPL(kick_process); 2330EXPORT_SYMBOL_GPL(kick_process);
2266#endif /* CONFIG_SMP */ 2331#endif /* CONFIG_SMP */
2267 2332
2268/**
2269 * task_oncpu_function_call - call a function on the cpu on which a task runs
2270 * @p: the task to evaluate
2271 * @func: the function to be called
2272 * @info: the function call argument
2273 *
2274 * Calls the function @func when the task is currently running. This might
2275 * be on the current CPU, which just calls the function directly
2276 */
2277void task_oncpu_function_call(struct task_struct *p,
2278 void (*func) (void *info), void *info)
2279{
2280 int cpu;
2281
2282 preempt_disable();
2283 cpu = task_cpu(p);
2284 if (task_curr(p))
2285 smp_call_function_single(cpu, func, info, 1);
2286 preempt_enable();
2287}
2288
2289#ifdef CONFIG_SMP 2333#ifdef CONFIG_SMP
2290/* 2334/*
2291 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2335 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
@@ -2566,6 +2610,7 @@ static void __sched_fork(struct task_struct *p)
2566 p->se.sum_exec_runtime = 0; 2610 p->se.sum_exec_runtime = 0;
2567 p->se.prev_sum_exec_runtime = 0; 2611 p->se.prev_sum_exec_runtime = 0;
2568 p->se.nr_migrations = 0; 2612 p->se.nr_migrations = 0;
2613 p->se.vruntime = 0;
2569 2614
2570#ifdef CONFIG_SCHEDSTATS 2615#ifdef CONFIG_SCHEDSTATS
2571 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2616 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2776,9 +2821,12 @@ static inline void
2776prepare_task_switch(struct rq *rq, struct task_struct *prev, 2821prepare_task_switch(struct rq *rq, struct task_struct *prev,
2777 struct task_struct *next) 2822 struct task_struct *next)
2778{ 2823{
2824 sched_info_switch(prev, next);
2825 perf_event_task_sched_out(prev, next);
2779 fire_sched_out_preempt_notifiers(prev, next); 2826 fire_sched_out_preempt_notifiers(prev, next);
2780 prepare_lock_switch(rq, next); 2827 prepare_lock_switch(rq, next);
2781 prepare_arch_switch(next); 2828 prepare_arch_switch(next);
2829 trace_sched_switch(prev, next);
2782} 2830}
2783 2831
2784/** 2832/**
@@ -2911,7 +2959,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2911 struct mm_struct *mm, *oldmm; 2959 struct mm_struct *mm, *oldmm;
2912 2960
2913 prepare_task_switch(rq, prev, next); 2961 prepare_task_switch(rq, prev, next);
2914 trace_sched_switch(prev, next); 2962
2915 mm = next->mm; 2963 mm = next->mm;
2916 oldmm = prev->active_mm; 2964 oldmm = prev->active_mm;
2917 /* 2965 /*
@@ -3568,6 +3616,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3568} 3616}
3569 3617
3570/* 3618/*
3619 * Account system cpu time to a process and desired cpustat field
3620 * @p: the process that the cpu time gets accounted to
3621 * @cputime: the cpu time spent in kernel space since the last update
3622 * @cputime_scaled: cputime scaled by cpu frequency
3623 * @target_cputime64: pointer to cpustat field that has to be updated
3624 */
3625static inline
3626void __account_system_time(struct task_struct *p, cputime_t cputime,
3627 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3628{
3629 cputime64_t tmp = cputime_to_cputime64(cputime);
3630
3631 /* Add system time to process. */
3632 p->stime = cputime_add(p->stime, cputime);
3633 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3634 account_group_system_time(p, cputime);
3635
3636 /* Add system time to cpustat. */
3637 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3638 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3639
3640 /* Account for system time used */
3641 acct_update_integrals(p);
3642}
3643
3644/*
3571 * Account system cpu time to a process. 3645 * Account system cpu time to a process.
3572 * @p: the process that the cpu time gets accounted to 3646 * @p: the process that the cpu time gets accounted to
3573 * @hardirq_offset: the offset to subtract from hardirq_count() 3647 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3578,36 +3652,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3578 cputime_t cputime, cputime_t cputime_scaled) 3652 cputime_t cputime, cputime_t cputime_scaled)
3579{ 3653{
3580 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3654 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3581 cputime64_t tmp; 3655 cputime64_t *target_cputime64;
3582 3656
3583 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3657 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3584 account_guest_time(p, cputime, cputime_scaled); 3658 account_guest_time(p, cputime, cputime_scaled);
3585 return; 3659 return;
3586 } 3660 }
3587 3661
3588 /* Add system time to process. */
3589 p->stime = cputime_add(p->stime, cputime);
3590 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3591 account_group_system_time(p, cputime);
3592
3593 /* Add system time to cpustat. */
3594 tmp = cputime_to_cputime64(cputime);
3595 if (hardirq_count() - hardirq_offset) 3662 if (hardirq_count() - hardirq_offset)
3596 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3663 target_cputime64 = &cpustat->irq;
3597 else if (in_serving_softirq()) 3664 else if (in_serving_softirq())
3598 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3665 target_cputime64 = &cpustat->softirq;
3599 else 3666 else
3600 cpustat->system = cputime64_add(cpustat->system, tmp); 3667 target_cputime64 = &cpustat->system;
3601 3668
3602 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 3669 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3603
3604 /* Account for system time used */
3605 acct_update_integrals(p);
3606} 3670}
3607 3671
3608/* 3672/*
3609 * Account for involuntary wait time. 3673 * Account for involuntary wait time.
3610 * @steal: the cpu time spent in involuntary wait 3674 * @cputime: the cpu time spent in involuntary wait
3611 */ 3675 */
3612void account_steal_time(cputime_t cputime) 3676void account_steal_time(cputime_t cputime)
3613{ 3677{
@@ -3635,6 +3699,73 @@ void account_idle_time(cputime_t cputime)
3635 3699
3636#ifndef CONFIG_VIRT_CPU_ACCOUNTING 3700#ifndef CONFIG_VIRT_CPU_ACCOUNTING
3637 3701
3702#ifdef CONFIG_IRQ_TIME_ACCOUNTING
3703/*
3704 * Account a tick to a process and cpustat
3705 * @p: the process that the cpu time gets accounted to
3706 * @user_tick: is the tick from userspace
3707 * @rq: the pointer to rq
3708 *
3709 * Tick demultiplexing follows the order
3710 * - pending hardirq update
3711 * - pending softirq update
3712 * - user_time
3713 * - idle_time
3714 * - system time
3715 * - check for guest_time
3716 * - else account as system_time
3717 *
3718 * Check for hardirq is done both for system and user time as there is
3719 * no timer going off while we are on hardirq and hence we may never get an
3720 * opportunity to update it solely in system time.
3721 * p->stime and friends are only updated on system time and not on irq
3722 * softirq as those do not count in task exec_runtime any more.
3723 */
3724static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3725 struct rq *rq)
3726{
3727 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3728 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3729 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3730
3731 if (irqtime_account_hi_update()) {
3732 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3733 } else if (irqtime_account_si_update()) {
3734 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3735 } else if (this_cpu_ksoftirqd() == p) {
3736 /*
3737 * ksoftirqd time do not get accounted in cpu_softirq_time.
3738 * So, we have to handle it separately here.
3739 * Also, p->stime needs to be updated for ksoftirqd.
3740 */
3741 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3742 &cpustat->softirq);
3743 } else if (user_tick) {
3744 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3745 } else if (p == rq->idle) {
3746 account_idle_time(cputime_one_jiffy);
3747 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3748 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3749 } else {
3750 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3751 &cpustat->system);
3752 }
3753}
3754
3755static void irqtime_account_idle_ticks(int ticks)
3756{
3757 int i;
3758 struct rq *rq = this_rq();
3759
3760 for (i = 0; i < ticks; i++)
3761 irqtime_account_process_tick(current, 0, rq);
3762}
3763#else /* CONFIG_IRQ_TIME_ACCOUNTING */
3764static void irqtime_account_idle_ticks(int ticks) {}
3765static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3766 struct rq *rq) {}
3767#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3768
3638/* 3769/*
3639 * Account a single tick of cpu time. 3770 * Account a single tick of cpu time.
3640 * @p: the process that the cpu time gets accounted to 3771 * @p: the process that the cpu time gets accounted to
@@ -3645,6 +3776,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
3645 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3776 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3646 struct rq *rq = this_rq(); 3777 struct rq *rq = this_rq();
3647 3778
3779 if (sched_clock_irqtime) {
3780 irqtime_account_process_tick(p, user_tick, rq);
3781 return;
3782 }
3783
3648 if (user_tick) 3784 if (user_tick)
3649 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3785 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3650 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 3786 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3670,6 +3806,12 @@ void account_steal_ticks(unsigned long ticks)
3670 */ 3806 */
3671void account_idle_ticks(unsigned long ticks) 3807void account_idle_ticks(unsigned long ticks)
3672{ 3808{
3809
3810 if (sched_clock_irqtime) {
3811 irqtime_account_idle_ticks(ticks);
3812 return;
3813 }
3814
3673 account_idle_time(jiffies_to_cputime(ticks)); 3815 account_idle_time(jiffies_to_cputime(ticks));
3674} 3816}
3675 3817
@@ -3989,9 +4131,6 @@ need_resched_nonpreemptible:
3989 rq->skip_clock_update = 0; 4131 rq->skip_clock_update = 0;
3990 4132
3991 if (likely(prev != next)) { 4133 if (likely(prev != next)) {
3992 sched_info_switch(prev, next);
3993 perf_event_task_sched_out(prev, next);
3994
3995 rq->nr_switches++; 4134 rq->nr_switches++;
3996 rq->curr = next; 4135 rq->curr = next;
3997 ++*switch_count; 4136 ++*switch_count;
@@ -4571,11 +4710,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4571 4710
4572 if (running) 4711 if (running)
4573 p->sched_class->set_curr_task(rq); 4712 p->sched_class->set_curr_task(rq);
4574 if (on_rq) { 4713 if (on_rq)
4575 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4714 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4576 4715
4577 check_class_changed(rq, p, prev_class, oldprio, running); 4716 check_class_changed(rq, p, prev_class, oldprio);
4578 }
4579 task_rq_unlock(rq, &flags); 4717 task_rq_unlock(rq, &flags);
4580} 4718}
4581 4719
@@ -4823,12 +4961,15 @@ recheck:
4823 param->sched_priority > rlim_rtprio) 4961 param->sched_priority > rlim_rtprio)
4824 return -EPERM; 4962 return -EPERM;
4825 } 4963 }
4964
4826 /* 4965 /*
4827 * Like positive nice levels, dont allow tasks to 4966 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4828 * move out of SCHED_IDLE either: 4967 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4829 */ 4968 */
4830 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 4969 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4831 return -EPERM; 4970 if (!can_nice(p, TASK_NICE(p)))
4971 return -EPERM;
4972 }
4832 4973
4833 /* can't change other user's priorities */ 4974 /* can't change other user's priorities */
4834 if (!check_same_owner(p)) 4975 if (!check_same_owner(p))
@@ -4903,11 +5044,10 @@ recheck:
4903 5044
4904 if (running) 5045 if (running)
4905 p->sched_class->set_curr_task(rq); 5046 p->sched_class->set_curr_task(rq);
4906 if (on_rq) { 5047 if (on_rq)
4907 activate_task(rq, p, 0); 5048 activate_task(rq, p, 0);
4908 5049
4909 check_class_changed(rq, p, prev_class, oldprio, running); 5050 check_class_changed(rq, p, prev_class, oldprio);
4910 }
4911 __task_rq_unlock(rq); 5051 __task_rq_unlock(rq);
4912 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5052 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4913 5053
@@ -5324,6 +5464,65 @@ void __sched yield(void)
5324} 5464}
5325EXPORT_SYMBOL(yield); 5465EXPORT_SYMBOL(yield);
5326 5466
5467/**
5468 * yield_to - yield the current processor to another thread in
5469 * your thread group, or accelerate that thread toward the
5470 * processor it's on.
5471 *
5472 * It's the caller's job to ensure that the target task struct
5473 * can't go away on us before we can do any checks.
5474 *
5475 * Returns true if we indeed boosted the target task.
5476 */
5477bool __sched yield_to(struct task_struct *p, bool preempt)
5478{
5479 struct task_struct *curr = current;
5480 struct rq *rq, *p_rq;
5481 unsigned long flags;
5482 bool yielded = 0;
5483
5484 local_irq_save(flags);
5485 rq = this_rq();
5486
5487again:
5488 p_rq = task_rq(p);
5489 double_rq_lock(rq, p_rq);
5490 while (task_rq(p) != p_rq) {
5491 double_rq_unlock(rq, p_rq);
5492 goto again;
5493 }
5494
5495 if (!curr->sched_class->yield_to_task)
5496 goto out;
5497
5498 if (curr->sched_class != p->sched_class)
5499 goto out;
5500
5501 if (task_running(p_rq, p) || p->state)
5502 goto out;
5503
5504 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5505 if (yielded) {
5506 schedstat_inc(rq, yld_count);
5507 /*
5508 * Make p's CPU reschedule; pick_next_entity takes care of
5509 * fairness.
5510 */
5511 if (preempt && rq != p_rq)
5512 resched_task(p_rq->curr);
5513 }
5514
5515out:
5516 double_rq_unlock(rq, p_rq);
5517 local_irq_restore(flags);
5518
5519 if (yielded)
5520 schedule();
5521
5522 return yielded;
5523}
5524EXPORT_SYMBOL_GPL(yield_to);
5525
5327/* 5526/*
5328 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5527 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5329 * that process accounting knows that this is a task in IO wait state. 5528 * that process accounting knows that this is a task in IO wait state.
@@ -5572,7 +5771,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5572 * The idle tasks have their own, simple scheduling class: 5771 * The idle tasks have their own, simple scheduling class:
5573 */ 5772 */
5574 idle->sched_class = &idle_sched_class; 5773 idle->sched_class = &idle_sched_class;
5575 ftrace_graph_init_task(idle); 5774 ftrace_graph_init_idle_task(idle, cpu);
5576} 5775}
5577 5776
5578/* 5777/*
@@ -7797,6 +7996,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7797 INIT_LIST_HEAD(&cfs_rq->tasks); 7996 INIT_LIST_HEAD(&cfs_rq->tasks);
7798#ifdef CONFIG_FAIR_GROUP_SCHED 7997#ifdef CONFIG_FAIR_GROUP_SCHED
7799 cfs_rq->rq = rq; 7998 cfs_rq->rq = rq;
7999 /* allow initial update_cfs_load() to truncate */
8000#ifdef CONFIG_SMP
8001 cfs_rq->load_stamp = 1;
8002#endif
7800#endif 8003#endif
7801 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 8004 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7802} 8005}
@@ -8110,6 +8313,8 @@ EXPORT_SYMBOL(__might_sleep);
8110#ifdef CONFIG_MAGIC_SYSRQ 8313#ifdef CONFIG_MAGIC_SYSRQ
8111static void normalize_task(struct rq *rq, struct task_struct *p) 8314static void normalize_task(struct rq *rq, struct task_struct *p)
8112{ 8315{
8316 const struct sched_class *prev_class = p->sched_class;
8317 int old_prio = p->prio;
8113 int on_rq; 8318 int on_rq;
8114 8319
8115 on_rq = p->se.on_rq; 8320 on_rq = p->se.on_rq;
@@ -8120,6 +8325,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8120 activate_task(rq, p, 0); 8325 activate_task(rq, p, 0);
8121 resched_task(rq->curr); 8326 resched_task(rq->curr);
8122 } 8327 }
8328
8329 check_class_changed(rq, p, prev_class, old_prio);
8123} 8330}
8124 8331
8125void normalize_rt_tasks(void) 8332void normalize_rt_tasks(void)
@@ -8511,7 +8718,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8511 /* Propagate contribution to hierarchy */ 8718 /* Propagate contribution to hierarchy */
8512 raw_spin_lock_irqsave(&rq->lock, flags); 8719 raw_spin_lock_irqsave(&rq->lock, flags);
8513 for_each_sched_entity(se) 8720 for_each_sched_entity(se)
8514 update_cfs_shares(group_cfs_rq(se), 0); 8721 update_cfs_shares(group_cfs_rq(se));
8515 raw_spin_unlock_irqrestore(&rq->lock, flags); 8722 raw_spin_unlock_irqrestore(&rq->lock, flags);
8516 } 8723 }
8517 8724
@@ -8885,7 +9092,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8885} 9092}
8886 9093
8887static void 9094static void
8888cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) 9095cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
9096 struct cgroup *old_cgrp, struct task_struct *task)
8889{ 9097{
8890 /* 9098 /*
8891 * cgroup_exit() is called in the copy_process() failure path. 9099 * cgroup_exit() is called in the copy_process() failure path.
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 9fb656283157..5946ac515602 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr;
12static void __init autogroup_init(struct task_struct *init_task) 12static void __init autogroup_init(struct task_struct *init_task)
13{ 13{
14 autogroup_default.tg = &root_task_group; 14 autogroup_default.tg = &root_task_group;
15 root_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref); 15 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock); 16 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default; 17 init_task->signal->autogroup = &autogroup_default;
@@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
130 129
131static inline bool task_group_is_autogroup(struct task_group *tg) 130static inline bool task_group_is_autogroup(struct task_group *tg)
132{ 131{
133 return tg != &root_task_group && tg->autogroup; 132 return !!tg->autogroup;
134} 133}
135 134
136static inline struct task_group * 135static inline struct task_group *
@@ -161,11 +160,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
161 160
162 p->signal->autogroup = autogroup_kref_get(ag); 161 p->signal->autogroup = autogroup_kref_get(ag);
163 162
163 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
164 goto out;
165
164 t = p; 166 t = p;
165 do { 167 do {
166 sched_move_task(t); 168 sched_move_task(t);
167 } while_each_thread(p, t); 169 } while_each_thread(p, t);
168 170
171out:
169 unlock_task_sighand(p, &flags); 172 unlock_task_sighand(p, &flags);
170 autogroup_kref_put(prev); 173 autogroup_kref_put(prev);
171} 174}
@@ -247,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
247{ 250{
248 struct autogroup *ag = autogroup_task_get(p); 251 struct autogroup *ag = autogroup_task_get(p);
249 252
253 if (!task_group_is_autogroup(ag->tg))
254 goto out;
255
250 down_read(&ag->lock); 256 down_read(&ag->lock);
251 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); 257 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
252 up_read(&ag->lock); 258 up_read(&ag->lock);
253 259
260out:
254 autogroup_kref_put(ag); 261 autogroup_kref_put(ag);
255} 262}
256#endif /* CONFIG_PROC_FS */ 263#endif /* CONFIG_PROC_FS */
@@ -258,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
258#ifdef CONFIG_SCHED_DEBUG 265#ifdef CONFIG_SCHED_DEBUG
259static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
260{ 267{
261 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); 268 if (!task_group_is_autogroup(tg))
262
263 if (!enabled || !tg->autogroup)
264 return 0; 269 return 0;
265 270
266 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 271 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 7b859ffe5dad..05577055cfca 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -1,6 +1,11 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3struct autogroup { 3struct autogroup {
4 /*
5 * reference doesn't mean how many thread attach to this
6 * autogroup now. It just stands for the number of task
7 * could use this autogroup.
8 */
4 struct kref kref; 9 struct kref kref;
5 struct task_group *tg; 10 struct task_group *tg;
6 struct rw_semaphore lock; 11 struct rw_semaphore lock;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index eb6cb8edd075..7bacd83a4158 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
179 179
180 raw_spin_lock_irqsave(&rq->lock, flags); 180 raw_spin_lock_irqsave(&rq->lock, flags);
181 if (cfs_rq->rb_leftmost) 181 if (cfs_rq->rb_leftmost)
182 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 182 MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
183 last = __pick_last_entity(cfs_rq); 183 last = __pick_last_entity(cfs_rq);
184 if (last) 184 if (last)
185 max_vruntime = last->vruntime; 185 max_vruntime = last->vruntime;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c26e2df450e..3f7ec9e27ee1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -69,14 +69,6 @@ static unsigned int sched_nr_latency = 8;
69unsigned int sysctl_sched_child_runs_first __read_mostly; 69unsigned int sysctl_sched_child_runs_first __read_mostly;
70 70
71/* 71/*
72 * sys_sched_yield() compat mode
73 *
74 * This option switches the agressive yield implementation of the
75 * old scheduler back on.
76 */
77unsigned int __read_mostly sysctl_sched_compat_yield;
78
79/*
80 * SCHED_OTHER wake-up granularity. 72 * SCHED_OTHER wake-up granularity.
81 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 73 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
82 * 74 *
@@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
419 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 411 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
420} 412}
421 413
422static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 414static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
423{ 415{
424 struct rb_node *left = cfs_rq->rb_leftmost; 416 struct rb_node *left = cfs_rq->rb_leftmost;
425 417
@@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
429 return rb_entry(left, struct sched_entity, run_node); 421 return rb_entry(left, struct sched_entity, run_node);
430} 422}
431 423
424static struct sched_entity *__pick_next_entity(struct sched_entity *se)
425{
426 struct rb_node *next = rb_next(&se->run_node);
427
428 if (!next)
429 return NULL;
430
431 return rb_entry(next, struct sched_entity, run_node);
432}
433
434#ifdef CONFIG_SCHED_DEBUG
432static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 435static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
433{ 436{
434 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 437 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
443 * Scheduling class statistics methods: 446 * Scheduling class statistics methods:
444 */ 447 */
445 448
446#ifdef CONFIG_SCHED_DEBUG
447int sched_proc_update_handler(struct ctl_table *table, int write, 449int sched_proc_update_handler(struct ctl_table *table, int write,
448 void __user *buffer, size_t *lenp, 450 void __user *buffer, size_t *lenp,
449 loff_t *ppos) 451 loff_t *ppos)
@@ -540,7 +542,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
540} 542}
541 543
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); 544static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); 545static void update_cfs_shares(struct cfs_rq *cfs_rq);
544 546
545/* 547/*
546 * Update the current task's runtime statistics. Skip current tasks that 548 * Update the current task's runtime statistics. Skip current tasks that
@@ -733,6 +735,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
733 now - cfs_rq->load_last > 4 * period) { 735 now - cfs_rq->load_last > 4 * period) {
734 cfs_rq->load_period = 0; 736 cfs_rq->load_period = 0;
735 cfs_rq->load_avg = 0; 737 cfs_rq->load_avg = 0;
738 delta = period - 1;
736 } 739 }
737 740
738 cfs_rq->load_stamp = now; 741 cfs_rq->load_stamp = now;
@@ -763,16 +766,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
763 list_del_leaf_cfs_rq(cfs_rq); 766 list_del_leaf_cfs_rq(cfs_rq);
764} 767}
765 768
766static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, 769static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
767 long weight_delta)
768{ 770{
769 long load_weight, load, shares; 771 long load_weight, load, shares;
770 772
771 load = cfs_rq->load.weight + weight_delta; 773 load = cfs_rq->load.weight;
772 774
773 load_weight = atomic_read(&tg->load_weight); 775 load_weight = atomic_read(&tg->load_weight);
774 load_weight -= cfs_rq->load_contribution;
775 load_weight += load; 776 load_weight += load;
777 load_weight -= cfs_rq->load_contribution;
776 778
777 shares = (tg->shares * load); 779 shares = (tg->shares * load);
778 if (load_weight) 780 if (load_weight)
@@ -790,7 +792,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
790{ 792{
791 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { 793 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
792 update_cfs_load(cfs_rq, 0); 794 update_cfs_load(cfs_rq, 0);
793 update_cfs_shares(cfs_rq, 0); 795 update_cfs_shares(cfs_rq);
794 } 796 }
795} 797}
796# else /* CONFIG_SMP */ 798# else /* CONFIG_SMP */
@@ -798,8 +800,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
798{ 800{
799} 801}
800 802
801static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, 803static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
802 long weight_delta)
803{ 804{
804 return tg->shares; 805 return tg->shares;
805} 806}
@@ -824,7 +825,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
824 account_entity_enqueue(cfs_rq, se); 825 account_entity_enqueue(cfs_rq, se);
825} 826}
826 827
827static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) 828static void update_cfs_shares(struct cfs_rq *cfs_rq)
828{ 829{
829 struct task_group *tg; 830 struct task_group *tg;
830 struct sched_entity *se; 831 struct sched_entity *se;
@@ -838,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
838 if (likely(se->load.weight == tg->shares)) 839 if (likely(se->load.weight == tg->shares))
839 return; 840 return;
840#endif 841#endif
841 shares = calc_cfs_shares(cfs_rq, tg, weight_delta); 842 shares = calc_cfs_shares(cfs_rq, tg);
842 843
843 reweight_entity(cfs_rq_of(se), se, shares); 844 reweight_entity(cfs_rq_of(se), se, shares);
844} 845}
@@ -847,7 +848,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
847{ 848{
848} 849}
849 850
850static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) 851static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
851{ 852{
852} 853}
853 854
@@ -978,8 +979,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
978 */ 979 */
979 update_curr(cfs_rq); 980 update_curr(cfs_rq);
980 update_cfs_load(cfs_rq, 0); 981 update_cfs_load(cfs_rq, 0);
981 update_cfs_shares(cfs_rq, se->load.weight);
982 account_entity_enqueue(cfs_rq, se); 982 account_entity_enqueue(cfs_rq, se);
983 update_cfs_shares(cfs_rq);
983 984
984 if (flags & ENQUEUE_WAKEUP) { 985 if (flags & ENQUEUE_WAKEUP) {
985 place_entity(cfs_rq, se, 0); 986 place_entity(cfs_rq, se, 0);
@@ -996,19 +997,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
996 list_add_leaf_cfs_rq(cfs_rq); 997 list_add_leaf_cfs_rq(cfs_rq);
997} 998}
998 999
999static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1000static void __clear_buddies_last(struct sched_entity *se)
1001{
1002 for_each_sched_entity(se) {
1003 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1004 if (cfs_rq->last == se)
1005 cfs_rq->last = NULL;
1006 else
1007 break;
1008 }
1009}
1010
1011static void __clear_buddies_next(struct sched_entity *se)
1000{ 1012{
1001 if (!se || cfs_rq->last == se) 1013 for_each_sched_entity(se) {
1002 cfs_rq->last = NULL; 1014 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1015 if (cfs_rq->next == se)
1016 cfs_rq->next = NULL;
1017 else
1018 break;
1019 }
1020}
1003 1021
1004 if (!se || cfs_rq->next == se) 1022static void __clear_buddies_skip(struct sched_entity *se)
1005 cfs_rq->next = NULL; 1023{
1024 for_each_sched_entity(se) {
1025 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1026 if (cfs_rq->skip == se)
1027 cfs_rq->skip = NULL;
1028 else
1029 break;
1030 }
1006} 1031}
1007 1032
1008static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 1033static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1009{ 1034{
1010 for_each_sched_entity(se) 1035 if (cfs_rq->last == se)
1011 __clear_buddies(cfs_rq_of(se), se); 1036 __clear_buddies_last(se);
1037
1038 if (cfs_rq->next == se)
1039 __clear_buddies_next(se);
1040
1041 if (cfs_rq->skip == se)
1042 __clear_buddies_skip(se);
1012} 1043}
1013 1044
1014static void 1045static void
@@ -1041,7 +1072,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1041 update_cfs_load(cfs_rq, 0); 1072 update_cfs_load(cfs_rq, 0);
1042 account_entity_dequeue(cfs_rq, se); 1073 account_entity_dequeue(cfs_rq, se);
1043 update_min_vruntime(cfs_rq); 1074 update_min_vruntime(cfs_rq);
1044 update_cfs_shares(cfs_rq, 0); 1075 update_cfs_shares(cfs_rq);
1045 1076
1046 /* 1077 /*
1047 * Normalize the entity after updating the min_vruntime because the 1078 * Normalize the entity after updating the min_vruntime because the
@@ -1084,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1084 return; 1115 return;
1085 1116
1086 if (cfs_rq->nr_running > 1) { 1117 if (cfs_rq->nr_running > 1) {
1087 struct sched_entity *se = __pick_next_entity(cfs_rq); 1118 struct sched_entity *se = __pick_first_entity(cfs_rq);
1088 s64 delta = curr->vruntime - se->vruntime; 1119 s64 delta = curr->vruntime - se->vruntime;
1089 1120
1090 if (delta < 0) 1121 if (delta < 0)
@@ -1128,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1128static int 1159static int
1129wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); 1160wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1130 1161
1162/*
1163 * Pick the next process, keeping these things in mind, in this order:
1164 * 1) keep things fair between processes/task groups
1165 * 2) pick the "next" process, since someone really wants that to run
1166 * 3) pick the "last" process, for cache locality
1167 * 4) do not run the "skip" process, if something else is available
1168 */
1131static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 1169static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1132{ 1170{
1133 struct sched_entity *se = __pick_next_entity(cfs_rq); 1171 struct sched_entity *se = __pick_first_entity(cfs_rq);
1134 struct sched_entity *left = se; 1172 struct sched_entity *left = se;
1135 1173
1136 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) 1174 /*
1137 se = cfs_rq->next; 1175 * Avoid running the skip buddy, if running something else can
1176 * be done without getting too unfair.
1177 */
1178 if (cfs_rq->skip == se) {
1179 struct sched_entity *second = __pick_next_entity(se);
1180 if (second && wakeup_preempt_entity(second, left) < 1)
1181 se = second;
1182 }
1138 1183
1139 /* 1184 /*
1140 * Prefer last buddy, try to return the CPU to a preempted task. 1185 * Prefer last buddy, try to return the CPU to a preempted task.
@@ -1142,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1142 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) 1187 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1143 se = cfs_rq->last; 1188 se = cfs_rq->last;
1144 1189
1190 /*
1191 * Someone really wants this to run. If it's not unfair, run it.
1192 */
1193 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1194 se = cfs_rq->next;
1195
1145 clear_buddies(cfs_rq, se); 1196 clear_buddies(cfs_rq, se);
1146 1197
1147 return se; 1198 return se;
@@ -1282,7 +1333,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1282 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1333 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1283 1334
1284 update_cfs_load(cfs_rq, 0); 1335 update_cfs_load(cfs_rq, 0);
1285 update_cfs_shares(cfs_rq, 0); 1336 update_cfs_shares(cfs_rq);
1286 } 1337 }
1287 1338
1288 hrtick_update(rq); 1339 hrtick_update(rq);
@@ -1312,58 +1363,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1312 struct cfs_rq *cfs_rq = cfs_rq_of(se); 1363 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1313 1364
1314 update_cfs_load(cfs_rq, 0); 1365 update_cfs_load(cfs_rq, 0);
1315 update_cfs_shares(cfs_rq, 0); 1366 update_cfs_shares(cfs_rq);
1316 } 1367 }
1317 1368
1318 hrtick_update(rq); 1369 hrtick_update(rq);
1319} 1370}
1320 1371
1321/*
1322 * sched_yield() support is very simple - we dequeue and enqueue.
1323 *
1324 * If compat_yield is turned on then we requeue to the end of the tree.
1325 */
1326static void yield_task_fair(struct rq *rq)
1327{
1328 struct task_struct *curr = rq->curr;
1329 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1330 struct sched_entity *rightmost, *se = &curr->se;
1331
1332 /*
1333 * Are we the only task in the tree?
1334 */
1335 if (unlikely(cfs_rq->nr_running == 1))
1336 return;
1337
1338 clear_buddies(cfs_rq, se);
1339
1340 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
1341 update_rq_clock(rq);
1342 /*
1343 * Update run-time statistics of the 'current'.
1344 */
1345 update_curr(cfs_rq);
1346
1347 return;
1348 }
1349 /*
1350 * Find the rightmost entry in the rbtree:
1351 */
1352 rightmost = __pick_last_entity(cfs_rq);
1353 /*
1354 * Already in the rightmost position?
1355 */
1356 if (unlikely(!rightmost || entity_before(rightmost, se)))
1357 return;
1358
1359 /*
1360 * Minimally necessary key value to be last in the tree:
1361 * Upon rescheduling, sched_class::put_prev_task() will place
1362 * 'current' within the tree based on its new key value.
1363 */
1364 se->vruntime = rightmost->vruntime + 1;
1365}
1366
1367#ifdef CONFIG_SMP 1372#ifdef CONFIG_SMP
1368 1373
1369static void task_waking_fair(struct rq *rq, struct task_struct *p) 1374static void task_waking_fair(struct rq *rq, struct task_struct *p)
@@ -1834,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se)
1834 } 1839 }
1835} 1840}
1836 1841
1842static void set_skip_buddy(struct sched_entity *se)
1843{
1844 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1845 for_each_sched_entity(se)
1846 cfs_rq_of(se)->skip = se;
1847 }
1848}
1849
1837/* 1850/*
1838 * Preempt the current task with a newly woken task if needed: 1851 * Preempt the current task with a newly woken task if needed:
1839 */ 1852 */
@@ -1857,16 +1870,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1857 if (test_tsk_need_resched(curr)) 1870 if (test_tsk_need_resched(curr))
1858 return; 1871 return;
1859 1872
1873 /* Idle tasks are by definition preempted by non-idle tasks. */
1874 if (unlikely(curr->policy == SCHED_IDLE) &&
1875 likely(p->policy != SCHED_IDLE))
1876 goto preempt;
1877
1860 /* 1878 /*
1861 * Batch and idle tasks do not preempt (their preemption is driven by 1879 * Batch and idle tasks do not preempt non-idle tasks (their preemption
1862 * the tick): 1880 * is driven by the tick):
1863 */ 1881 */
1864 if (unlikely(p->policy != SCHED_NORMAL)) 1882 if (unlikely(p->policy != SCHED_NORMAL))
1865 return; 1883 return;
1866 1884
1867 /* Idle tasks are by definition preempted by everybody. */
1868 if (unlikely(curr->policy == SCHED_IDLE))
1869 goto preempt;
1870 1885
1871 if (!sched_feat(WAKEUP_PREEMPT)) 1886 if (!sched_feat(WAKEUP_PREEMPT))
1872 return; 1887 return;
@@ -1932,6 +1947,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1932 } 1947 }
1933} 1948}
1934 1949
1950/*
1951 * sched_yield() is very simple
1952 *
1953 * The magic of dealing with the ->skip buddy is in pick_next_entity.
1954 */
1955static void yield_task_fair(struct rq *rq)
1956{
1957 struct task_struct *curr = rq->curr;
1958 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1959 struct sched_entity *se = &curr->se;
1960
1961 /*
1962 * Are we the only task in the tree?
1963 */
1964 if (unlikely(rq->nr_running == 1))
1965 return;
1966
1967 clear_buddies(cfs_rq, se);
1968
1969 if (curr->policy != SCHED_BATCH) {
1970 update_rq_clock(rq);
1971 /*
1972 * Update run-time statistics of the 'current'.
1973 */
1974 update_curr(cfs_rq);
1975 }
1976
1977 set_skip_buddy(se);
1978}
1979
1980static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
1981{
1982 struct sched_entity *se = &p->se;
1983
1984 if (!se->on_rq)
1985 return false;
1986
1987 /* Tell the scheduler that we'd really like pse to run next. */
1988 set_next_buddy(se);
1989
1990 yield_task_fair(rq);
1991
1992 return true;
1993}
1994
1935#ifdef CONFIG_SMP 1995#ifdef CONFIG_SMP
1936/************************************************** 1996/**************************************************
1937 * Fair scheduling class load-balancing methods: 1997 * Fair scheduling class load-balancing methods:
@@ -2123,7 +2183,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
2123 * We need to update shares after updating tg->load_weight in 2183 * We need to update shares after updating tg->load_weight in
2124 * order to adjust the weight of groups with long running tasks. 2184 * order to adjust the weight of groups with long running tasks.
2125 */ 2185 */
2126 update_cfs_shares(cfs_rq, 0); 2186 update_cfs_shares(cfs_rq);
2127 2187
2128 raw_spin_unlock_irqrestore(&rq->lock, flags); 2188 raw_spin_unlock_irqrestore(&rq->lock, flags);
2129 2189
@@ -2610,7 +2670,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2610 * @this_cpu: Cpu for which load balance is currently performed. 2670 * @this_cpu: Cpu for which load balance is currently performed.
2611 * @idle: Idle status of this_cpu 2671 * @idle: Idle status of this_cpu
2612 * @load_idx: Load index of sched_domain of this_cpu for load calc. 2672 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2613 * @sd_idle: Idle status of the sched_domain containing group.
2614 * @local_group: Does group contain this_cpu. 2673 * @local_group: Does group contain this_cpu.
2615 * @cpus: Set of cpus considered for load balancing. 2674 * @cpus: Set of cpus considered for load balancing.
2616 * @balance: Should we balance. 2675 * @balance: Should we balance.
@@ -2618,7 +2677,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2618 */ 2677 */
2619static inline void update_sg_lb_stats(struct sched_domain *sd, 2678static inline void update_sg_lb_stats(struct sched_domain *sd,
2620 struct sched_group *group, int this_cpu, 2679 struct sched_group *group, int this_cpu,
2621 enum cpu_idle_type idle, int load_idx, int *sd_idle, 2680 enum cpu_idle_type idle, int load_idx,
2622 int local_group, const struct cpumask *cpus, 2681 int local_group, const struct cpumask *cpus,
2623 int *balance, struct sg_lb_stats *sgs) 2682 int *balance, struct sg_lb_stats *sgs)
2624{ 2683{
@@ -2638,9 +2697,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2638 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2697 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2639 struct rq *rq = cpu_rq(i); 2698 struct rq *rq = cpu_rq(i);
2640 2699
2641 if (*sd_idle && rq->nr_running)
2642 *sd_idle = 0;
2643
2644 /* Bias balancing toward cpus of our domain */ 2700 /* Bias balancing toward cpus of our domain */
2645 if (local_group) { 2701 if (local_group) {
2646 if (idle_cpu(i) && !first_idle_cpu) { 2702 if (idle_cpu(i) && !first_idle_cpu) {
@@ -2685,7 +2741,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2685 2741
2686 /* 2742 /*
2687 * Consider the group unbalanced when the imbalance is larger 2743 * Consider the group unbalanced when the imbalance is larger
2688 * than the average weight of two tasks. 2744 * than the average weight of a task.
2689 * 2745 *
2690 * APZ: with cgroup the avg task weight can vary wildly and 2746 * APZ: with cgroup the avg task weight can vary wildly and
2691 * might not be a suitable number - should we keep a 2747 * might not be a suitable number - should we keep a
@@ -2695,7 +2751,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2695 if (sgs->sum_nr_running) 2751 if (sgs->sum_nr_running)
2696 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2752 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2697 2753
2698 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) 2754 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
2699 sgs->group_imb = 1; 2755 sgs->group_imb = 1;
2700 2756
2701 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2757 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
@@ -2755,15 +2811,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
2755 * @sd: sched_domain whose statistics are to be updated. 2811 * @sd: sched_domain whose statistics are to be updated.
2756 * @this_cpu: Cpu for which load balance is currently performed. 2812 * @this_cpu: Cpu for which load balance is currently performed.
2757 * @idle: Idle status of this_cpu 2813 * @idle: Idle status of this_cpu
2758 * @sd_idle: Idle status of the sched_domain containing sg.
2759 * @cpus: Set of cpus considered for load balancing. 2814 * @cpus: Set of cpus considered for load balancing.
2760 * @balance: Should we balance. 2815 * @balance: Should we balance.
2761 * @sds: variable to hold the statistics for this sched_domain. 2816 * @sds: variable to hold the statistics for this sched_domain.
2762 */ 2817 */
2763static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 2818static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2764 enum cpu_idle_type idle, int *sd_idle, 2819 enum cpu_idle_type idle, const struct cpumask *cpus,
2765 const struct cpumask *cpus, int *balance, 2820 int *balance, struct sd_lb_stats *sds)
2766 struct sd_lb_stats *sds)
2767{ 2821{
2768 struct sched_domain *child = sd->child; 2822 struct sched_domain *child = sd->child;
2769 struct sched_group *sg = sd->groups; 2823 struct sched_group *sg = sd->groups;
@@ -2781,7 +2835,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2781 2835
2782 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 2836 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2783 memset(&sgs, 0, sizeof(sgs)); 2837 memset(&sgs, 0, sizeof(sgs));
2784 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, 2838 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
2785 local_group, cpus, balance, &sgs); 2839 local_group, cpus, balance, &sgs);
2786 2840
2787 if (local_group && !(*balance)) 2841 if (local_group && !(*balance))
@@ -3033,7 +3087,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3033 * @imbalance: Variable which stores amount of weighted load which should 3087 * @imbalance: Variable which stores amount of weighted load which should
3034 * be moved to restore balance/put a group to idle. 3088 * be moved to restore balance/put a group to idle.
3035 * @idle: The idle status of this_cpu. 3089 * @idle: The idle status of this_cpu.
3036 * @sd_idle: The idleness of sd
3037 * @cpus: The set of CPUs under consideration for load-balancing. 3090 * @cpus: The set of CPUs under consideration for load-balancing.
3038 * @balance: Pointer to a variable indicating if this_cpu 3091 * @balance: Pointer to a variable indicating if this_cpu
3039 * is the appropriate cpu to perform load balancing at this_level. 3092 * is the appropriate cpu to perform load balancing at this_level.
@@ -3046,7 +3099,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3046static struct sched_group * 3099static struct sched_group *
3047find_busiest_group(struct sched_domain *sd, int this_cpu, 3100find_busiest_group(struct sched_domain *sd, int this_cpu,
3048 unsigned long *imbalance, enum cpu_idle_type idle, 3101 unsigned long *imbalance, enum cpu_idle_type idle,
3049 int *sd_idle, const struct cpumask *cpus, int *balance) 3102 const struct cpumask *cpus, int *balance)
3050{ 3103{
3051 struct sd_lb_stats sds; 3104 struct sd_lb_stats sds;
3052 3105
@@ -3056,22 +3109,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3056 * Compute the various statistics relavent for load balancing at 3109 * Compute the various statistics relavent for load balancing at
3057 * this level. 3110 * this level.
3058 */ 3111 */
3059 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, 3112 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
3060 balance, &sds); 3113
3061 3114 /*
3062 /* Cases where imbalance does not exist from POV of this_cpu */ 3115 * this_cpu is not the appropriate cpu to perform load balancing at
3063 /* 1) this_cpu is not the appropriate cpu to perform load balancing 3116 * this level.
3064 * at this level.
3065 * 2) There is no busy sibling group to pull from.
3066 * 3) This group is the busiest group.
3067 * 4) This group is more busy than the avg busieness at this
3068 * sched_domain.
3069 * 5) The imbalance is within the specified limit.
3070 *
3071 * Note: when doing newidle balance, if the local group has excess
3072 * capacity (i.e. nr_running < group_capacity) and the busiest group
3073 * does not have any capacity, we force a load balance to pull tasks
3074 * to the local group. In this case, we skip past checks 3, 4 and 5.
3075 */ 3117 */
3076 if (!(*balance)) 3118 if (!(*balance))
3077 goto ret; 3119 goto ret;
@@ -3080,41 +3122,55 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3080 check_asym_packing(sd, &sds, this_cpu, imbalance)) 3122 check_asym_packing(sd, &sds, this_cpu, imbalance))
3081 return sds.busiest; 3123 return sds.busiest;
3082 3124
3125 /* There is no busy sibling group to pull tasks from */
3083 if (!sds.busiest || sds.busiest_nr_running == 0) 3126 if (!sds.busiest || sds.busiest_nr_running == 0)
3084 goto out_balanced; 3127 goto out_balanced;
3085 3128
3086 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 3129 /*
3130 * If the busiest group is imbalanced the below checks don't
3131 * work because they assumes all things are equal, which typically
3132 * isn't true due to cpus_allowed constraints and the like.
3133 */
3134 if (sds.group_imb)
3135 goto force_balance;
3136
3137 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
3087 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 3138 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
3088 !sds.busiest_has_capacity) 3139 !sds.busiest_has_capacity)
3089 goto force_balance; 3140 goto force_balance;
3090 3141
3142 /*
3143 * If the local group is more busy than the selected busiest group
3144 * don't try and pull any tasks.
3145 */
3091 if (sds.this_load >= sds.max_load) 3146 if (sds.this_load >= sds.max_load)
3092 goto out_balanced; 3147 goto out_balanced;
3093 3148
3149 /*
3150 * Don't pull any tasks if this group is already above the domain
3151 * average load.
3152 */
3094 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 3153 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3095
3096 if (sds.this_load >= sds.avg_load) 3154 if (sds.this_load >= sds.avg_load)
3097 goto out_balanced; 3155 goto out_balanced;
3098 3156
3099 /* 3157 if (idle == CPU_IDLE) {
3100 * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
3101 * And to check for busy balance use !idle_cpu instead of
3102 * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
3103 * even when they are idle.
3104 */
3105 if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
3106 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3107 goto out_balanced;
3108 } else {
3109 /* 3158 /*
3110 * This cpu is idle. If the busiest group load doesn't 3159 * This cpu is idle. If the busiest group load doesn't
3111 * have more tasks than the number of available cpu's and 3160 * have more tasks than the number of available cpu's and
3112 * there is no imbalance between this and busiest group 3161 * there is no imbalance between this and busiest group
3113 * wrt to idle cpu's, it is balanced. 3162 * wrt to idle cpu's, it is balanced.
3114 */ 3163 */
3115 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && 3164 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
3116 sds.busiest_nr_running <= sds.busiest_group_weight) 3165 sds.busiest_nr_running <= sds.busiest_group_weight)
3117 goto out_balanced; 3166 goto out_balanced;
3167 } else {
3168 /*
3169 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
3170 * imbalance_pct to be conservative.
3171 */
3172 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3173 goto out_balanced;
3118 } 3174 }
3119 3175
3120force_balance: 3176force_balance:
@@ -3193,7 +3249,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
3193/* Working cpumask for load_balance and load_balance_newidle. */ 3249/* Working cpumask for load_balance and load_balance_newidle. */
3194static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 3250static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3195 3251
3196static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, 3252static int need_active_balance(struct sched_domain *sd, int idle,
3197 int busiest_cpu, int this_cpu) 3253 int busiest_cpu, int this_cpu)
3198{ 3254{
3199 if (idle == CPU_NEWLY_IDLE) { 3255 if (idle == CPU_NEWLY_IDLE) {
@@ -3225,10 +3281,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
3225 * move_tasks() will succeed. ld_moved will be true and this 3281 * move_tasks() will succeed. ld_moved will be true and this
3226 * active balance code will not be triggered. 3282 * active balance code will not be triggered.
3227 */ 3283 */
3228 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3229 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3230 return 0;
3231
3232 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) 3284 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
3233 return 0; 3285 return 0;
3234 } 3286 }
@@ -3246,7 +3298,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3246 struct sched_domain *sd, enum cpu_idle_type idle, 3298 struct sched_domain *sd, enum cpu_idle_type idle,
3247 int *balance) 3299 int *balance)
3248{ 3300{
3249 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3301 int ld_moved, all_pinned = 0, active_balance = 0;
3250 struct sched_group *group; 3302 struct sched_group *group;
3251 unsigned long imbalance; 3303 unsigned long imbalance;
3252 struct rq *busiest; 3304 struct rq *busiest;
@@ -3255,20 +3307,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3255 3307
3256 cpumask_copy(cpus, cpu_active_mask); 3308 cpumask_copy(cpus, cpu_active_mask);
3257 3309
3258 /*
3259 * When power savings policy is enabled for the parent domain, idle
3260 * sibling can pick up load irrespective of busy siblings. In this case,
3261 * let the state of idle sibling percolate up as CPU_IDLE, instead of
3262 * portraying it as CPU_NOT_IDLE.
3263 */
3264 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
3265 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3266 sd_idle = 1;
3267
3268 schedstat_inc(sd, lb_count[idle]); 3310 schedstat_inc(sd, lb_count[idle]);
3269 3311
3270redo: 3312redo:
3271 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3313 group = find_busiest_group(sd, this_cpu, &imbalance, idle,
3272 cpus, balance); 3314 cpus, balance);
3273 3315
3274 if (*balance == 0) 3316 if (*balance == 0)
@@ -3330,8 +3372,7 @@ redo:
3330 if (idle != CPU_NEWLY_IDLE) 3372 if (idle != CPU_NEWLY_IDLE)
3331 sd->nr_balance_failed++; 3373 sd->nr_balance_failed++;
3332 3374
3333 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), 3375 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
3334 this_cpu)) {
3335 raw_spin_lock_irqsave(&busiest->lock, flags); 3376 raw_spin_lock_irqsave(&busiest->lock, flags);
3336 3377
3337 /* don't kick the active_load_balance_cpu_stop, 3378 /* don't kick the active_load_balance_cpu_stop,
@@ -3386,10 +3427,6 @@ redo:
3386 sd->balance_interval *= 2; 3427 sd->balance_interval *= 2;
3387 } 3428 }
3388 3429
3389 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3390 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3391 ld_moved = -1;
3392
3393 goto out; 3430 goto out;
3394 3431
3395out_balanced: 3432out_balanced:
@@ -3403,11 +3440,7 @@ out_one_pinned:
3403 (sd->balance_interval < sd->max_interval)) 3440 (sd->balance_interval < sd->max_interval))
3404 sd->balance_interval *= 2; 3441 sd->balance_interval *= 2;
3405 3442
3406 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3443 ld_moved = 0;
3407 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3408 ld_moved = -1;
3409 else
3410 ld_moved = 0;
3411out: 3444out:
3412 return ld_moved; 3445 return ld_moved;
3413} 3446}
@@ -3831,8 +3864,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3831 if (load_balance(cpu, rq, sd, idle, &balance)) { 3864 if (load_balance(cpu, rq, sd, idle, &balance)) {
3832 /* 3865 /*
3833 * We've pulled tasks over so either we're no 3866 * We've pulled tasks over so either we're no
3834 * longer idle, or one of our SMT siblings is 3867 * longer idle.
3835 * not idle.
3836 */ 3868 */
3837 idle = CPU_NOT_IDLE; 3869 idle = CPU_NOT_IDLE;
3838 } 3870 }
@@ -4079,33 +4111,62 @@ static void task_fork_fair(struct task_struct *p)
4079 * Priority of the task has changed. Check to see if we preempt 4111 * Priority of the task has changed. Check to see if we preempt
4080 * the current task. 4112 * the current task.
4081 */ 4113 */
4082static void prio_changed_fair(struct rq *rq, struct task_struct *p, 4114static void
4083 int oldprio, int running) 4115prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
4084{ 4116{
4117 if (!p->se.on_rq)
4118 return;
4119
4085 /* 4120 /*
4086 * Reschedule if we are currently running on this runqueue and 4121 * Reschedule if we are currently running on this runqueue and
4087 * our priority decreased, or if we are not currently running on 4122 * our priority decreased, or if we are not currently running on
4088 * this runqueue and our priority is higher than the current's 4123 * this runqueue and our priority is higher than the current's
4089 */ 4124 */
4090 if (running) { 4125 if (rq->curr == p) {
4091 if (p->prio > oldprio) 4126 if (p->prio > oldprio)
4092 resched_task(rq->curr); 4127 resched_task(rq->curr);
4093 } else 4128 } else
4094 check_preempt_curr(rq, p, 0); 4129 check_preempt_curr(rq, p, 0);
4095} 4130}
4096 4131
4132static void switched_from_fair(struct rq *rq, struct task_struct *p)
4133{
4134 struct sched_entity *se = &p->se;
4135 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4136
4137 /*
4138 * Ensure the task's vruntime is normalized, so that when its
4139 * switched back to the fair class the enqueue_entity(.flags=0) will
4140 * do the right thing.
4141 *
4142 * If it was on_rq, then the dequeue_entity(.flags=0) will already
4143 * have normalized the vruntime, if it was !on_rq, then only when
4144 * the task is sleeping will it still have non-normalized vruntime.
4145 */
4146 if (!se->on_rq && p->state != TASK_RUNNING) {
4147 /*
4148 * Fix up our vruntime so that the current sleep doesn't
4149 * cause 'unlimited' sleep bonus.
4150 */
4151 place_entity(cfs_rq, se, 0);
4152 se->vruntime -= cfs_rq->min_vruntime;
4153 }
4154}
4155
4097/* 4156/*
4098 * We switched to the sched_fair class. 4157 * We switched to the sched_fair class.
4099 */ 4158 */
4100static void switched_to_fair(struct rq *rq, struct task_struct *p, 4159static void switched_to_fair(struct rq *rq, struct task_struct *p)
4101 int running)
4102{ 4160{
4161 if (!p->se.on_rq)
4162 return;
4163
4103 /* 4164 /*
4104 * We were most likely switched from sched_rt, so 4165 * We were most likely switched from sched_rt, so
4105 * kick off the schedule if running, otherwise just see 4166 * kick off the schedule if running, otherwise just see
4106 * if we can still preempt the current task. 4167 * if we can still preempt the current task.
4107 */ 4168 */
4108 if (running) 4169 if (rq->curr == p)
4109 resched_task(rq->curr); 4170 resched_task(rq->curr);
4110 else 4171 else
4111 check_preempt_curr(rq, p, 0); 4172 check_preempt_curr(rq, p, 0);
@@ -4171,6 +4232,7 @@ static const struct sched_class fair_sched_class = {
4171 .enqueue_task = enqueue_task_fair, 4232 .enqueue_task = enqueue_task_fair,
4172 .dequeue_task = dequeue_task_fair, 4233 .dequeue_task = dequeue_task_fair,
4173 .yield_task = yield_task_fair, 4234 .yield_task = yield_task_fair,
4235 .yield_to_task = yield_to_task_fair,
4174 4236
4175 .check_preempt_curr = check_preempt_wakeup, 4237 .check_preempt_curr = check_preempt_wakeup,
4176 4238
@@ -4191,6 +4253,7 @@ static const struct sched_class fair_sched_class = {
4191 .task_fork = task_fork_fair, 4253 .task_fork = task_fork_fair,
4192 4254
4193 .prio_changed = prio_changed_fair, 4255 .prio_changed = prio_changed_fair,
4256 .switched_from = switched_from_fair,
4194 .switched_to = switched_to_fair, 4257 .switched_to = switched_to_fair,
4195 4258
4196 .get_rr_interval = get_rr_interval_fair, 4259 .get_rr_interval = get_rr_interval_fair,
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402c87c..c82f26c1b7c3 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq)
52{ 52{
53} 53}
54 54
55static void switched_to_idle(struct rq *rq, struct task_struct *p, 55static void switched_to_idle(struct rq *rq, struct task_struct *p)
56 int running)
57{ 56{
58 /* Can this actually happen?? */ 57 BUG();
59 if (running)
60 resched_task(rq->curr);
61 else
62 check_preempt_curr(rq, p, 0);
63} 58}
64 59
65static void prio_changed_idle(struct rq *rq, struct task_struct *p, 60static void
66 int oldprio, int running) 61prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
67{ 62{
68 /* This can happen for hot plug CPUS */ 63 BUG();
69
70 /*
71 * Reschedule if we are currently running on this runqueue and
72 * our priority decreased, or if we are not currently running on
73 * this runqueue and our priority is higher than the current's
74 */
75 if (running) {
76 if (p->prio > oldprio)
77 resched_task(rq->curr);
78 } else
79 check_preempt_curr(rq, p, 0);
80} 64}
81 65
82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 66static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 01f75a5f17af..db308cb08b75 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1599,8 +1599,7 @@ static void rq_offline_rt(struct rq *rq)
1599 * When switch from the rt queue, we bring ourselves to a position 1599 * When switch from the rt queue, we bring ourselves to a position
1600 * that we might want to pull RT tasks from other runqueues. 1600 * that we might want to pull RT tasks from other runqueues.
1601 */ 1601 */
1602static void switched_from_rt(struct rq *rq, struct task_struct *p, 1602static void switched_from_rt(struct rq *rq, struct task_struct *p)
1603 int running)
1604{ 1603{
1605 /* 1604 /*
1606 * If there are other RT tasks then we will reschedule 1605 * If there are other RT tasks then we will reschedule
@@ -1609,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1609 * we may need to handle the pulling of RT tasks 1608 * we may need to handle the pulling of RT tasks
1610 * now. 1609 * now.
1611 */ 1610 */
1612 if (!rq->rt.rt_nr_running) 1611 if (p->se.on_rq && !rq->rt.rt_nr_running)
1613 pull_rt_task(rq); 1612 pull_rt_task(rq);
1614} 1613}
1615 1614
@@ -1628,8 +1627,7 @@ static inline void init_sched_rt_class(void)
1628 * with RT tasks. In this case we try to push them off to 1627 * with RT tasks. In this case we try to push them off to
1629 * other runqueues. 1628 * other runqueues.
1630 */ 1629 */
1631static void switched_to_rt(struct rq *rq, struct task_struct *p, 1630static void switched_to_rt(struct rq *rq, struct task_struct *p)
1632 int running)
1633{ 1631{
1634 int check_resched = 1; 1632 int check_resched = 1;
1635 1633
@@ -1640,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1640 * If that current running task is also an RT task 1638 * If that current running task is also an RT task
1641 * then see if we can move to another run queue. 1639 * then see if we can move to another run queue.
1642 */ 1640 */
1643 if (!running) { 1641 if (p->se.on_rq && rq->curr != p) {
1644#ifdef CONFIG_SMP 1642#ifdef CONFIG_SMP
1645 if (rq->rt.overloaded && push_rt_task(rq) && 1643 if (rq->rt.overloaded && push_rt_task(rq) &&
1646 /* Don't resched if we changed runqueues */ 1644 /* Don't resched if we changed runqueues */
@@ -1656,10 +1654,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
1656 * Priority of the task has changed. This may cause 1654 * Priority of the task has changed. This may cause
1657 * us to initiate a push or pull. 1655 * us to initiate a push or pull.
1658 */ 1656 */
1659static void prio_changed_rt(struct rq *rq, struct task_struct *p, 1657static void
1660 int oldprio, int running) 1658prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1661{ 1659{
1662 if (running) { 1660 if (!p->se.on_rq)
1661 return;
1662
1663 if (rq->curr == p) {
1663#ifdef CONFIG_SMP 1664#ifdef CONFIG_SMP
1664 /* 1665 /*
1665 * If our priority decreases while running, we 1666 * If our priority decreases while running, we
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 2bf6b47058c1..84ec9bcf82d9 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq)
59{ 59{
60} 60}
61 61
62static void switched_to_stop(struct rq *rq, struct task_struct *p, 62static void switched_to_stop(struct rq *rq, struct task_struct *p)
63 int running)
64{ 63{
65 BUG(); /* its impossible to change to this class */ 64 BUG(); /* its impossible to change to this class */
66} 65}
67 66
68static void prio_changed_stop(struct rq *rq, struct task_struct *p, 67static void
69 int oldprio, int running) 68prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
70{ 69{
71 BUG(); /* how!?, what priority? */ 70 BUG(); /* how!?, what priority? */
72} 71}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5efec388..56e5dec837f0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat);
54 54
55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; 55static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
56 56
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
@@ -311,9 +311,21 @@ void irq_enter(void)
311} 311}
312 312
313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
314# define invoke_softirq() __do_softirq() 314static inline void invoke_softirq(void)
315{
316 if (!force_irqthreads)
317 __do_softirq();
318 else
319 wakeup_softirqd();
320}
315#else 321#else
316# define invoke_softirq() do_softirq() 322static inline void invoke_softirq(void)
323{
324 if (!force_irqthreads)
325 do_softirq();
326 else
327 wakeup_softirqd();
328}
317#endif 329#endif
318 330
319/* 331/*
@@ -721,7 +733,6 @@ static int run_ksoftirqd(void * __bind_cpu)
721{ 733{
722 set_current_state(TASK_INTERRUPTIBLE); 734 set_current_state(TASK_INTERRUPTIBLE);
723 735
724 current->flags |= PF_KSOFTIRQD;
725 while (!kthread_should_stop()) { 736 while (!kthread_should_stop()) {
726 preempt_disable(); 737 preempt_disable();
727 if (!local_softirq_pending()) { 738 if (!local_softirq_pending()) {
@@ -738,7 +749,10 @@ static int run_ksoftirqd(void * __bind_cpu)
738 don't process */ 749 don't process */
739 if (cpu_is_offline((long)__bind_cpu)) 750 if (cpu_is_offline((long)__bind_cpu))
740 goto wait_to_die; 751 goto wait_to_die;
741 do_softirq(); 752 local_irq_disable();
753 if (local_softirq_pending())
754 __do_softirq();
755 local_irq_enable();
742 preempt_enable_no_resched(); 756 preempt_enable_no_resched();
743 cond_resched(); 757 cond_resched();
744 preempt_disable(); 758 preempt_disable();
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c782fe9924c7..25cc41cd8f33 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -186,3 +186,8 @@ cond_syscall(sys_perf_event_open);
186/* fanotify! */ 186/* fanotify! */
187cond_syscall(sys_fanotify_init); 187cond_syscall(sys_fanotify_init);
188cond_syscall(sys_fanotify_mark); 188cond_syscall(sys_fanotify_mark);
189
190/* open by handle */
191cond_syscall(sys_name_to_handle_at);
192cond_syscall(sys_open_by_handle_at);
193cond_syscall(compat_sys_open_by_handle_at);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4eed0af5d144..51054fea5d99 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -361,20 +361,13 @@ static struct ctl_table kern_table[] = {
361 .mode = 0644, 361 .mode = 0644,
362 .proc_handler = sched_rt_handler, 362 .proc_handler = sched_rt_handler,
363 }, 363 },
364 {
365 .procname = "sched_compat_yield",
366 .data = &sysctl_sched_compat_yield,
367 .maxlen = sizeof(unsigned int),
368 .mode = 0644,
369 .proc_handler = proc_dointvec,
370 },
371#ifdef CONFIG_SCHED_AUTOGROUP 364#ifdef CONFIG_SCHED_AUTOGROUP
372 { 365 {
373 .procname = "sched_autogroup_enabled", 366 .procname = "sched_autogroup_enabled",
374 .data = &sysctl_sched_autogroup_enabled, 367 .data = &sysctl_sched_autogroup_enabled,
375 .maxlen = sizeof(unsigned int), 368 .maxlen = sizeof(unsigned int),
376 .mode = 0644, 369 .mode = 0644,
377 .proc_handler = proc_dointvec, 370 .proc_handler = proc_dointvec_minmax,
378 .extra1 = &zero, 371 .extra1 = &zero,
379 .extra2 = &one, 372 .extra2 = &one,
380 }, 373 },
@@ -948,7 +941,7 @@ static struct ctl_table kern_table[] = {
948 .data = &sysctl_perf_event_sample_rate, 941 .data = &sysctl_perf_event_sample_rate,
949 .maxlen = sizeof(sysctl_perf_event_sample_rate), 942 .maxlen = sizeof(sysctl_perf_event_sample_rate),
950 .mode = 0644, 943 .mode = 0644,
951 .proc_handler = proc_dointvec, 944 .proc_handler = perf_proc_update_handler,
952 }, 945 },
953#endif 946#endif
954#ifdef CONFIG_KMEMCHECK 947#ifdef CONFIG_KMEMCHECK
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b875bedf7c9a..3b8e028b9601 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1321 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1322{ 1322{
1323 const struct bin_table *table = NULL; 1323 const struct bin_table *table = NULL;
1324 struct nameidata nd;
1325 struct vfsmount *mnt; 1324 struct vfsmount *mnt;
1326 struct file *file; 1325 struct file *file;
1327 ssize_t result; 1326 ssize_t result;
1328 char *pathname; 1327 char *pathname;
1329 int flags; 1328 int flags;
1330 int acc_mode;
1331 1329
1332 pathname = sysctl_getname(name, nlen, &table); 1330 pathname = sysctl_getname(name, nlen, &table);
1333 result = PTR_ERR(pathname); 1331 result = PTR_ERR(pathname);
@@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen,
1337 /* How should the sysctl be accessed? */ 1335 /* How should the sysctl be accessed? */
1338 if (oldval && oldlen && newval && newlen) { 1336 if (oldval && oldlen && newval && newlen) {
1339 flags = O_RDWR; 1337 flags = O_RDWR;
1340 acc_mode = MAY_READ | MAY_WRITE;
1341 } else if (newval && newlen) { 1338 } else if (newval && newlen) {
1342 flags = O_WRONLY; 1339 flags = O_WRONLY;
1343 acc_mode = MAY_WRITE;
1344 } else if (oldval && oldlen) { 1340 } else if (oldval && oldlen) {
1345 flags = O_RDONLY; 1341 flags = O_RDONLY;
1346 acc_mode = MAY_READ;
1347 } else { 1342 } else {
1348 result = 0; 1343 result = 0;
1349 goto out_putname; 1344 goto out_putname;
1350 } 1345 }
1351 1346
1352 mnt = current->nsproxy->pid_ns->proc_mnt; 1347 mnt = current->nsproxy->pid_ns->proc_mnt;
1353 result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); 1348 file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
1354 if (result)
1355 goto out_putname;
1356
1357 result = may_open(&nd.path, acc_mode, flags);
1358 if (result)
1359 goto out_putpath;
1360
1361 file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
1362 result = PTR_ERR(file); 1349 result = PTR_ERR(file);
1363 if (IS_ERR(file)) 1350 if (IS_ERR(file))
1364 goto out_putname; 1351 goto out_putname;
@@ -1370,10 +1357,6 @@ out_putname:
1370 putname(pathname); 1357 putname(pathname);
1371out: 1358out:
1372 return result; 1359 return result;
1373
1374out_putpath:
1375 path_put(&nd.path);
1376 goto out_putname;
1377} 1360}
1378 1361
1379 1362
diff --git a/kernel/time.c b/kernel/time.c
index 32174359576f..8e8dc6d705c9 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -150,7 +150,7 @@ static inline void warp_clock(void)
150 * various programs will get confused when the clock gets warped. 150 * various programs will get confused when the clock gets warped.
151 */ 151 */
152 152
153int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) 153int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
154{ 154{
155 static int firsttime = 1; 155 static int firsttime = 1;
156 int error = 0; 156 int error = 0;
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x)
645} 645}
646 646
647/** 647/**
648 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies 648 * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
649 * 649 *
650 * @n: nsecs in u64 650 * @n: nsecs in u64
651 * 651 *
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x)
657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) 657 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years 658 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
659 */ 659 */
660unsigned long nsecs_to_jiffies(u64 n) 660u64 nsecs_to_jiffies64(u64 n)
661{ 661{
662#if (NSEC_PER_SEC % HZ) == 0 662#if (NSEC_PER_SEC % HZ) == 0
663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ 663 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
@@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n)
674#endif 674#endif
675} 675}
676 676
677#if (BITS_PER_LONG < 64) 677/**
678u64 get_jiffies_64(void) 678 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
679 *
680 * @n: nsecs in u64
681 *
682 * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
683 * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
684 * for scheduler, not for use in device drivers to calculate timeout value.
685 *
686 * note:
687 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
688 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
689 */
690unsigned long nsecs_to_jiffies(u64 n)
679{ 691{
680 unsigned long seq; 692 return (unsigned long)nsecs_to_jiffies64(n);
681 u64 ret;
682
683 do {
684 seq = read_seqbegin(&xtime_lock);
685 ret = jiffies_64;
686 } while (read_seqretry(&xtime_lock, seq));
687 return ret;
688} 693}
689EXPORT_SYMBOL(get_jiffies_64);
690#endif
691
692EXPORT_SYMBOL(jiffies);
693 694
694/* 695/*
695 * Add two timespec values and do a safety check for overflow. 696 * Add two timespec values and do a safety check for overflow.
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ee266620b06c..b0425991e9ac 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,5 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
2obj-y += timeconv.o posix-clock.o
2 3
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d7395fdfb9f3..0d74b9ba90c8 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,7 +18,6 @@
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 5404a8456909..b2fa506667c0 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -22,8 +22,11 @@
22************************************************************************/ 22************************************************************************/
23#include <linux/clocksource.h> 23#include <linux/clocksource.h>
24#include <linux/jiffies.h> 24#include <linux/jiffies.h>
25#include <linux/module.h>
25#include <linux/init.h> 26#include <linux/init.h>
26 27
28#include "tick-internal.h"
29
27/* The Jiffies based clocksource is the lowest common 30/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on 31 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as 32 * all systems. It has the same coarse resolution as
@@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = {
64 .shift = JIFFIES_SHIFT, 67 .shift = JIFFIES_SHIFT,
65}; 68};
66 69
70#if (BITS_PER_LONG < 64)
71u64 get_jiffies_64(void)
72{
73 unsigned long seq;
74 u64 ret;
75
76 do {
77 seq = read_seqbegin(&xtime_lock);
78 ret = jiffies_64;
79 } while (read_seqretry(&xtime_lock, seq));
80 return ret;
81}
82EXPORT_SYMBOL(get_jiffies_64);
83#endif
84
85EXPORT_SYMBOL(jiffies);
86
67static int __init init_jiffies_clocksource(void) 87static int __init init_jiffies_clocksource(void)
68{ 88{
69 return clocksource_register(&clocksource_jiffies); 89 return clocksource_register(&clocksource_jiffies);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5c00242fa921..5f1bb8e2008f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -16,6 +16,8 @@
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/module.h> 17#include <linux/module.h>
18 18
19#include "tick-internal.h"
20
19/* 21/*
20 * NTP timekeeping variables: 22 * NTP timekeeping variables:
21 */ 23 */
@@ -646,6 +648,17 @@ int do_adjtimex(struct timex *txc)
646 hrtimer_cancel(&leap_timer); 648 hrtimer_cancel(&leap_timer);
647 } 649 }
648 650
651 if (txc->modes & ADJ_SETOFFSET) {
652 struct timespec delta;
653 delta.tv_sec = txc->time.tv_sec;
654 delta.tv_nsec = txc->time.tv_usec;
655 if (!(txc->modes & ADJ_NANO))
656 delta.tv_nsec *= 1000;
657 result = timekeeping_inject_offset(&delta);
658 if (result)
659 return result;
660 }
661
649 getnstimeofday(&ts); 662 getnstimeofday(&ts);
650 663
651 write_seqlock_irq(&xtime_lock); 664 write_seqlock_irq(&xtime_lock);
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
new file mode 100644
index 000000000000..25028dd4fa18
--- /dev/null
+++ b/kernel/time/posix-clock.c
@@ -0,0 +1,451 @@
1/*
2 * posix-clock.c - support for dynamic clock devices
3 *
4 * Copyright (C) 2010 OMICRON electronics GmbH
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20#include <linux/device.h>
21#include <linux/file.h>
22#include <linux/mutex.h>
23#include <linux/posix-clock.h>
24#include <linux/slab.h>
25#include <linux/syscalls.h>
26#include <linux/uaccess.h>
27
28static void delete_clock(struct kref *kref);
29
30/*
31 * Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
32 */
33static struct posix_clock *get_posix_clock(struct file *fp)
34{
35 struct posix_clock *clk = fp->private_data;
36
37 mutex_lock(&clk->mutex);
38
39 if (!clk->zombie)
40 return clk;
41
42 mutex_unlock(&clk->mutex);
43
44 return NULL;
45}
46
47static void put_posix_clock(struct posix_clock *clk)
48{
49 mutex_unlock(&clk->mutex);
50}
51
52static ssize_t posix_clock_read(struct file *fp, char __user *buf,
53 size_t count, loff_t *ppos)
54{
55 struct posix_clock *clk = get_posix_clock(fp);
56 int err = -EINVAL;
57
58 if (!clk)
59 return -ENODEV;
60
61 if (clk->ops.read)
62 err = clk->ops.read(clk, fp->f_flags, buf, count);
63
64 put_posix_clock(clk);
65
66 return err;
67}
68
69static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
70{
71 struct posix_clock *clk = get_posix_clock(fp);
72 int result = 0;
73
74 if (!clk)
75 return -ENODEV;
76
77 if (clk->ops.poll)
78 result = clk->ops.poll(clk, fp, wait);
79
80 put_posix_clock(clk);
81
82 return result;
83}
84
85static int posix_clock_fasync(int fd, struct file *fp, int on)
86{
87 struct posix_clock *clk = get_posix_clock(fp);
88 int err = 0;
89
90 if (!clk)
91 return -ENODEV;
92
93 if (clk->ops.fasync)
94 err = clk->ops.fasync(clk, fd, fp, on);
95
96 put_posix_clock(clk);
97
98 return err;
99}
100
101static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
102{
103 struct posix_clock *clk = get_posix_clock(fp);
104 int err = -ENODEV;
105
106 if (!clk)
107 return -ENODEV;
108
109 if (clk->ops.mmap)
110 err = clk->ops.mmap(clk, vma);
111
112 put_posix_clock(clk);
113
114 return err;
115}
116
117static long posix_clock_ioctl(struct file *fp,
118 unsigned int cmd, unsigned long arg)
119{
120 struct posix_clock *clk = get_posix_clock(fp);
121 int err = -ENOTTY;
122
123 if (!clk)
124 return -ENODEV;
125
126 if (clk->ops.ioctl)
127 err = clk->ops.ioctl(clk, cmd, arg);
128
129 put_posix_clock(clk);
130
131 return err;
132}
133
134#ifdef CONFIG_COMPAT
135static long posix_clock_compat_ioctl(struct file *fp,
136 unsigned int cmd, unsigned long arg)
137{
138 struct posix_clock *clk = get_posix_clock(fp);
139 int err = -ENOTTY;
140
141 if (!clk)
142 return -ENODEV;
143
144 if (clk->ops.ioctl)
145 err = clk->ops.ioctl(clk, cmd, arg);
146
147 put_posix_clock(clk);
148
149 return err;
150}
151#endif
152
153static int posix_clock_open(struct inode *inode, struct file *fp)
154{
155 int err;
156 struct posix_clock *clk =
157 container_of(inode->i_cdev, struct posix_clock, cdev);
158
159 mutex_lock(&clk->mutex);
160
161 if (clk->zombie) {
162 err = -ENODEV;
163 goto out;
164 }
165 if (clk->ops.open)
166 err = clk->ops.open(clk, fp->f_mode);
167 else
168 err = 0;
169
170 if (!err) {
171 kref_get(&clk->kref);
172 fp->private_data = clk;
173 }
174out:
175 mutex_unlock(&clk->mutex);
176 return err;
177}
178
179static int posix_clock_release(struct inode *inode, struct file *fp)
180{
181 struct posix_clock *clk = fp->private_data;
182 int err = 0;
183
184 if (clk->ops.release)
185 err = clk->ops.release(clk);
186
187 kref_put(&clk->kref, delete_clock);
188
189 fp->private_data = NULL;
190
191 return err;
192}
193
194static const struct file_operations posix_clock_file_operations = {
195 .owner = THIS_MODULE,
196 .llseek = no_llseek,
197 .read = posix_clock_read,
198 .poll = posix_clock_poll,
199 .unlocked_ioctl = posix_clock_ioctl,
200 .open = posix_clock_open,
201 .release = posix_clock_release,
202 .fasync = posix_clock_fasync,
203 .mmap = posix_clock_mmap,
204#ifdef CONFIG_COMPAT
205 .compat_ioctl = posix_clock_compat_ioctl,
206#endif
207};
208
209int posix_clock_register(struct posix_clock *clk, dev_t devid)
210{
211 int err;
212
213 kref_init(&clk->kref);
214 mutex_init(&clk->mutex);
215
216 cdev_init(&clk->cdev, &posix_clock_file_operations);
217 clk->cdev.owner = clk->ops.owner;
218 err = cdev_add(&clk->cdev, devid, 1);
219 if (err)
220 goto no_cdev;
221
222 return err;
223no_cdev:
224 mutex_destroy(&clk->mutex);
225 return err;
226}
227EXPORT_SYMBOL_GPL(posix_clock_register);
228
229static void delete_clock(struct kref *kref)
230{
231 struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
232 mutex_destroy(&clk->mutex);
233 if (clk->release)
234 clk->release(clk);
235}
236
237void posix_clock_unregister(struct posix_clock *clk)
238{
239 cdev_del(&clk->cdev);
240
241 mutex_lock(&clk->mutex);
242 clk->zombie = true;
243 mutex_unlock(&clk->mutex);
244
245 kref_put(&clk->kref, delete_clock);
246}
247EXPORT_SYMBOL_GPL(posix_clock_unregister);
248
249struct posix_clock_desc {
250 struct file *fp;
251 struct posix_clock *clk;
252};
253
254static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
255{
256 struct file *fp = fget(CLOCKID_TO_FD(id));
257 int err = -EINVAL;
258
259 if (!fp)
260 return err;
261
262 if (fp->f_op->open != posix_clock_open || !fp->private_data)
263 goto out;
264
265 cd->fp = fp;
266 cd->clk = get_posix_clock(fp);
267
268 err = cd->clk ? 0 : -ENODEV;
269out:
270 if (err)
271 fput(fp);
272 return err;
273}
274
275static void put_clock_desc(struct posix_clock_desc *cd)
276{
277 put_posix_clock(cd->clk);
278 fput(cd->fp);
279}
280
281static int pc_clock_adjtime(clockid_t id, struct timex *tx)
282{
283 struct posix_clock_desc cd;
284 int err;
285
286 err = get_clock_desc(id, &cd);
287 if (err)
288 return err;
289
290 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
291 err = -EACCES;
292 goto out;
293 }
294
295 if (cd.clk->ops.clock_adjtime)
296 err = cd.clk->ops.clock_adjtime(cd.clk, tx);
297 else
298 err = -EOPNOTSUPP;
299out:
300 put_clock_desc(&cd);
301
302 return err;
303}
304
305static int pc_clock_gettime(clockid_t id, struct timespec *ts)
306{
307 struct posix_clock_desc cd;
308 int err;
309
310 err = get_clock_desc(id, &cd);
311 if (err)
312 return err;
313
314 if (cd.clk->ops.clock_gettime)
315 err = cd.clk->ops.clock_gettime(cd.clk, ts);
316 else
317 err = -EOPNOTSUPP;
318
319 put_clock_desc(&cd);
320
321 return err;
322}
323
324static int pc_clock_getres(clockid_t id, struct timespec *ts)
325{
326 struct posix_clock_desc cd;
327 int err;
328
329 err = get_clock_desc(id, &cd);
330 if (err)
331 return err;
332
333 if (cd.clk->ops.clock_getres)
334 err = cd.clk->ops.clock_getres(cd.clk, ts);
335 else
336 err = -EOPNOTSUPP;
337
338 put_clock_desc(&cd);
339
340 return err;
341}
342
343static int pc_clock_settime(clockid_t id, const struct timespec *ts)
344{
345 struct posix_clock_desc cd;
346 int err;
347
348 err = get_clock_desc(id, &cd);
349 if (err)
350 return err;
351
352 if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
353 err = -EACCES;
354 goto out;
355 }
356
357 if (cd.clk->ops.clock_settime)
358 err = cd.clk->ops.clock_settime(cd.clk, ts);
359 else
360 err = -EOPNOTSUPP;
361out:
362 put_clock_desc(&cd);
363
364 return err;
365}
366
367static int pc_timer_create(struct k_itimer *kit)
368{
369 clockid_t id = kit->it_clock;
370 struct posix_clock_desc cd;
371 int err;
372
373 err = get_clock_desc(id, &cd);
374 if (err)
375 return err;
376
377 if (cd.clk->ops.timer_create)
378 err = cd.clk->ops.timer_create(cd.clk, kit);
379 else
380 err = -EOPNOTSUPP;
381
382 put_clock_desc(&cd);
383
384 return err;
385}
386
387static int pc_timer_delete(struct k_itimer *kit)
388{
389 clockid_t id = kit->it_clock;
390 struct posix_clock_desc cd;
391 int err;
392
393 err = get_clock_desc(id, &cd);
394 if (err)
395 return err;
396
397 if (cd.clk->ops.timer_delete)
398 err = cd.clk->ops.timer_delete(cd.clk, kit);
399 else
400 err = -EOPNOTSUPP;
401
402 put_clock_desc(&cd);
403
404 return err;
405}
406
407static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
408{
409 clockid_t id = kit->it_clock;
410 struct posix_clock_desc cd;
411
412 if (get_clock_desc(id, &cd))
413 return;
414
415 if (cd.clk->ops.timer_gettime)
416 cd.clk->ops.timer_gettime(cd.clk, kit, ts);
417
418 put_clock_desc(&cd);
419}
420
421static int pc_timer_settime(struct k_itimer *kit, int flags,
422 struct itimerspec *ts, struct itimerspec *old)
423{
424 clockid_t id = kit->it_clock;
425 struct posix_clock_desc cd;
426 int err;
427
428 err = get_clock_desc(id, &cd);
429 if (err)
430 return err;
431
432 if (cd.clk->ops.timer_settime)
433 err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
434 else
435 err = -EOPNOTSUPP;
436
437 put_clock_desc(&cd);
438
439 return err;
440}
441
442struct k_clock clock_posix_dynamic = {
443 .clock_getres = pc_clock_getres,
444 .clock_set = pc_clock_settime,
445 .clock_get = pc_clock_gettime,
446 .clock_adj = pc_clock_adjtime,
447 .timer_create = pc_timer_create,
448 .timer_set = pc_timer_settime,
449 .timer_del = pc_timer_delete,
450 .timer_get = pc_timer_gettime,
451};
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index a3b5aff62606..da800ffa810c 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index ed228ef6f6b8..119528de8235 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include <asm/irq_regs.h> 22#include <asm/irq_regs.h>
24 23
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f65d3a723a64..1009b06d6f89 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4#include <linux/hrtimer.h>
5#include <linux/tick.h>
6
7#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
4 8
5#define TICK_DO_TIMER_NONE -1 9#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2 10#define TICK_DO_TIMER_BOOT -2
@@ -135,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
135{ 139{
136 return !(dev->features & CLOCK_EVT_FEAT_DUMMY); 140 return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
137} 141}
142
143#endif
144
145extern void do_timer(unsigned long ticks);
146extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 5cbc101f908b..2d04411a5f05 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -18,7 +18,6 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/tick.h>
22 21
23#include "tick-internal.h" 22#include "tick-internal.h"
24 23
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c55ea2433471..d5097c44b407 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,7 +19,6 @@
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/tick.h>
23#include <linux/module.h> 22#include <linux/module.h>
24 23
25#include <asm/irq_regs.h> 24#include <asm/irq_regs.h>
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d27c7562902c..3bd7e3d5c632 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday);
353 * 353 *
354 * Sets the time of day to the new time and update NTP and notify hrtimers 354 * Sets the time of day to the new time and update NTP and notify hrtimers
355 */ 355 */
356int do_settimeofday(struct timespec *tv) 356int do_settimeofday(const struct timespec *tv)
357{ 357{
358 struct timespec ts_delta; 358 struct timespec ts_delta;
359 unsigned long flags; 359 unsigned long flags;
@@ -387,6 +387,42 @@ int do_settimeofday(struct timespec *tv)
387 387
388EXPORT_SYMBOL(do_settimeofday); 388EXPORT_SYMBOL(do_settimeofday);
389 389
390
391/**
392 * timekeeping_inject_offset - Adds or subtracts from the current time.
393 * @tv: pointer to the timespec variable containing the offset
394 *
395 * Adds or subtracts an offset value from the current time.
396 */
397int timekeeping_inject_offset(struct timespec *ts)
398{
399 unsigned long flags;
400
401 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
402 return -EINVAL;
403
404 write_seqlock_irqsave(&xtime_lock, flags);
405
406 timekeeping_forward_now();
407
408 xtime = timespec_add(xtime, *ts);
409 wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
410
411 timekeeper.ntp_error = 0;
412 ntp_clear();
413
414 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
415 timekeeper.mult);
416
417 write_sequnlock_irqrestore(&xtime_lock, flags);
418
419 /* signal hrtimers about time change */
420 clock_was_set();
421
422 return 0;
423}
424EXPORT_SYMBOL(timekeeping_inject_offset);
425
390/** 426/**
391 * change_clocksource - Swaps clocksources if a new one is available 427 * change_clocksource - Swaps clocksources if a new one is available
392 * 428 *
@@ -779,7 +815,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
779 * 815 *
780 * Called from the timer interrupt, must hold a write on xtime_lock. 816 * Called from the timer interrupt, must hold a write on xtime_lock.
781 */ 817 */
782void update_wall_time(void) 818static void update_wall_time(void)
783{ 819{
784 struct clocksource *clock; 820 struct clocksource *clock;
785 cycle_t offset; 821 cycle_t offset;
@@ -871,7 +907,7 @@ void update_wall_time(void)
871 * getboottime - Return the real time of system boot. 907 * getboottime - Return the real time of system boot.
872 * @ts: pointer to the timespec to be set 908 * @ts: pointer to the timespec to be set
873 * 909 *
874 * Returns the time of day in a timespec. 910 * Returns the wall-time of boot in a timespec.
875 * 911 *
876 * This is based on the wall_to_monotonic offset and the total suspend 912 * This is based on the wall_to_monotonic offset and the total suspend
877 * time. Calls to settimeofday will affect the value returned (which 913 * time. Calls to settimeofday will affect the value returned (which
@@ -889,6 +925,55 @@ void getboottime(struct timespec *ts)
889} 925}
890EXPORT_SYMBOL_GPL(getboottime); 926EXPORT_SYMBOL_GPL(getboottime);
891 927
928
929/**
930 * get_monotonic_boottime - Returns monotonic time since boot
931 * @ts: pointer to the timespec to be set
932 *
933 * Returns the monotonic time since boot in a timespec.
934 *
935 * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also
936 * includes the time spent in suspend.
937 */
938void get_monotonic_boottime(struct timespec *ts)
939{
940 struct timespec tomono, sleep;
941 unsigned int seq;
942 s64 nsecs;
943
944 WARN_ON(timekeeping_suspended);
945
946 do {
947 seq = read_seqbegin(&xtime_lock);
948 *ts = xtime;
949 tomono = wall_to_monotonic;
950 sleep = total_sleep_time;
951 nsecs = timekeeping_get_ns();
952
953 } while (read_seqretry(&xtime_lock, seq));
954
955 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
956 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
957}
958EXPORT_SYMBOL_GPL(get_monotonic_boottime);
959
960/**
961 * ktime_get_boottime - Returns monotonic time since boot in a ktime
962 *
963 * Returns the monotonic time since boot in a ktime
964 *
965 * This is similar to CLOCK_MONTONIC/ktime_get, but also
966 * includes the time spent in suspend.
967 */
968ktime_t ktime_get_boottime(void)
969{
970 struct timespec ts;
971
972 get_monotonic_boottime(&ts);
973 return timespec_to_ktime(ts);
974}
975EXPORT_SYMBOL_GPL(ktime_get_boottime);
976
892/** 977/**
893 * monotonic_to_bootbased - Convert the monotonic time to boot based. 978 * monotonic_to_bootbased - Convert the monotonic time to boot based.
894 * @ts: pointer to the timespec to be converted 979 * @ts: pointer to the timespec to be converted
@@ -910,11 +995,6 @@ struct timespec __current_kernel_time(void)
910 return xtime; 995 return xtime;
911} 996}
912 997
913struct timespec __get_wall_to_monotonic(void)
914{
915 return wall_to_monotonic;
916}
917
918struct timespec current_kernel_time(void) 998struct timespec current_kernel_time(void)
919{ 999{
920 struct timespec now; 1000 struct timespec now;
@@ -946,3 +1026,48 @@ struct timespec get_monotonic_coarse(void)
946 now.tv_nsec + mono.tv_nsec); 1026 now.tv_nsec + mono.tv_nsec);
947 return now; 1027 return now;
948} 1028}
1029
1030/*
1031 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1032 * without sampling the sequence number in xtime_lock.
1033 * jiffies is defined in the linker script...
1034 */
1035void do_timer(unsigned long ticks)
1036{
1037 jiffies_64 += ticks;
1038 update_wall_time();
1039 calc_global_load(ticks);
1040}
1041
1042/**
1043 * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
1044 * and sleep offsets.
1045 * @xtim: pointer to timespec to be set with xtime
1046 * @wtom: pointer to timespec to be set with wall_to_monotonic
1047 * @sleep: pointer to timespec to be set with time in suspend
1048 */
1049void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1050 struct timespec *wtom, struct timespec *sleep)
1051{
1052 unsigned long seq;
1053
1054 do {
1055 seq = read_seqbegin(&xtime_lock);
1056 *xtim = xtime;
1057 *wtom = wall_to_monotonic;
1058 *sleep = total_sleep_time;
1059 } while (read_seqretry(&xtime_lock, seq));
1060}
1061
1062/**
1063 * xtime_update() - advances the timekeeping infrastructure
1064 * @ticks: number of ticks, that have elapsed since the last call.
1065 *
1066 * Must be called with interrupts disabled.
1067 */
1068void xtime_update(unsigned long ticks)
1069{
1070 write_seqlock(&xtime_lock);
1071 do_timer(ticks);
1072 write_sequnlock(&xtime_lock);
1073}
diff --git a/kernel/timer.c b/kernel/timer.c
index d6459923d245..fd6198692b57 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {}
404 404
405static struct debug_obj_descr timer_debug_descr; 405static struct debug_obj_descr timer_debug_descr;
406 406
407static void *timer_debug_hint(void *addr)
408{
409 return ((struct timer_list *) addr)->function;
410}
411
407/* 412/*
408 * fixup_init is called when: 413 * fixup_init is called when:
409 * - an active object is initialized 414 * - an active object is initialized
@@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
477 482
478static struct debug_obj_descr timer_debug_descr = { 483static struct debug_obj_descr timer_debug_descr = {
479 .name = "timer_list", 484 .name = "timer_list",
485 .debug_hint = timer_debug_hint,
480 .fixup_init = timer_fixup_init, 486 .fixup_init = timer_fixup_init,
481 .fixup_activate = timer_fixup_activate, 487 .fixup_activate = timer_fixup_activate,
482 .fixup_free = timer_fixup_free, 488 .fixup_free = timer_fixup_free,
@@ -964,6 +970,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
964 * add_timer_on(). Upon exit the timer is not queued and the handler is 970 * add_timer_on(). Upon exit the timer is not queued and the handler is
965 * not running on any CPU. 971 * not running on any CPU.
966 * 972 *
973 * Note: You must not hold locks that are held in interrupt context
974 * while calling this function. Even if the lock has nothing to do
975 * with the timer in question. Here's why:
976 *
977 * CPU0 CPU1
978 * ---- ----
979 * <SOFTIRQ>
980 * call_timer_fn();
981 * base->running_timer = mytimer;
982 * spin_lock_irq(somelock);
983 * <IRQ>
984 * spin_lock(somelock);
985 * del_timer_sync(mytimer);
986 * while (base->running_timer == mytimer);
987 *
988 * Now del_timer_sync() will never return and never release somelock.
989 * The interrupt on the other CPU is waiting to grab somelock but
990 * it has interrupted the softirq that CPU0 is waiting to finish.
991 *
967 * The function returns whether it has deactivated a pending timer or not. 992 * The function returns whether it has deactivated a pending timer or not.
968 */ 993 */
969int del_timer_sync(struct timer_list *timer) 994int del_timer_sync(struct timer_list *timer)
@@ -971,6 +996,10 @@ int del_timer_sync(struct timer_list *timer)
971#ifdef CONFIG_LOCKDEP 996#ifdef CONFIG_LOCKDEP
972 unsigned long flags; 997 unsigned long flags;
973 998
999 /*
1000 * If lockdep gives a backtrace here, please reference
1001 * the synchronization rules above.
1002 */
974 local_irq_save(flags); 1003 local_irq_save(flags);
975 lock_map_acquire(&timer->lockdep_map); 1004 lock_map_acquire(&timer->lockdep_map);
976 lock_map_release(&timer->lockdep_map); 1005 lock_map_release(&timer->lockdep_map);
@@ -1295,19 +1324,6 @@ void run_local_timers(void)
1295 raise_softirq(TIMER_SOFTIRQ); 1324 raise_softirq(TIMER_SOFTIRQ);
1296} 1325}
1297 1326
1298/*
1299 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1300 * without sampling the sequence number in xtime_lock.
1301 * jiffies is defined in the linker script...
1302 */
1303
1304void do_timer(unsigned long ticks)
1305{
1306 jiffies_64 += ticks;
1307 update_wall_time();
1308 calc_global_load(ticks);
1309}
1310
1311#ifdef __ARCH_WANT_SYS_ALARM 1327#ifdef __ARCH_WANT_SYS_ALARM
1312 1328
1313/* 1329/*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f3dadae83883..888b611897d3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3328,7 +3328,7 @@ static int start_graph_tracing(void)
3328 /* The cpu_boot init_task->ret_stack will never be freed */ 3328 /* The cpu_boot init_task->ret_stack will never be freed */
3329 for_each_online_cpu(cpu) { 3329 for_each_online_cpu(cpu) {
3330 if (!idle_task(cpu)->ret_stack) 3330 if (!idle_task(cpu)->ret_stack)
3331 ftrace_graph_init_task(idle_task(cpu)); 3331 ftrace_graph_init_idle_task(idle_task(cpu), cpu);
3332 } 3332 }
3333 3333
3334 do { 3334 do {
@@ -3418,6 +3418,49 @@ void unregister_ftrace_graph(void)
3418 mutex_unlock(&ftrace_lock); 3418 mutex_unlock(&ftrace_lock);
3419} 3419}
3420 3420
3421static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
3422
3423static void
3424graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
3425{
3426 atomic_set(&t->tracing_graph_pause, 0);
3427 atomic_set(&t->trace_overrun, 0);
3428 t->ftrace_timestamp = 0;
3429 /* make curr_ret_stack visable before we add the ret_stack */
3430 smp_wmb();
3431 t->ret_stack = ret_stack;
3432}
3433
3434/*
3435 * Allocate a return stack for the idle task. May be the first
3436 * time through, or it may be done by CPU hotplug online.
3437 */
3438void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
3439{
3440 t->curr_ret_stack = -1;
3441 /*
3442 * The idle task has no parent, it either has its own
3443 * stack or no stack at all.
3444 */
3445 if (t->ret_stack)
3446 WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
3447
3448 if (ftrace_graph_active) {
3449 struct ftrace_ret_stack *ret_stack;
3450
3451 ret_stack = per_cpu(idle_ret_stack, cpu);
3452 if (!ret_stack) {
3453 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
3454 * sizeof(struct ftrace_ret_stack),
3455 GFP_KERNEL);
3456 if (!ret_stack)
3457 return;
3458 per_cpu(idle_ret_stack, cpu) = ret_stack;
3459 }
3460 graph_init_task(t, ret_stack);
3461 }
3462}
3463
3421/* Allocate a return stack for newly created task */ 3464/* Allocate a return stack for newly created task */
3422void ftrace_graph_init_task(struct task_struct *t) 3465void ftrace_graph_init_task(struct task_struct *t)
3423{ 3466{
@@ -3433,12 +3476,7 @@ void ftrace_graph_init_task(struct task_struct *t)
3433 GFP_KERNEL); 3476 GFP_KERNEL);
3434 if (!ret_stack) 3477 if (!ret_stack)
3435 return; 3478 return;
3436 atomic_set(&t->tracing_graph_pause, 0); 3479 graph_init_task(t, ret_stack);
3437 atomic_set(&t->trace_overrun, 0);
3438 t->ftrace_timestamp = 0;
3439 /* make curr_ret_stack visable before we add the ret_stack */
3440 smp_wmb();
3441 t->ret_stack = ret_stack;
3442 } 3480 }
3443} 3481}
3444 3482
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bd1c35a4fbcc..db7b439d23ee 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5,7 +5,6 @@
5 */ 5 */
6#include <linux/ring_buffer.h> 6#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 7#include <linux/trace_clock.h>
8#include <linux/ftrace_irq.h>
9#include <linux/spinlock.h> 8#include <linux/spinlock.h>
10#include <linux/debugfs.h> 9#include <linux/debugfs.h>
11#include <linux/uaccess.h> 10#include <linux/uaccess.h>
@@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1429} 1428}
1430EXPORT_SYMBOL_GPL(ring_buffer_resize); 1429EXPORT_SYMBOL_GPL(ring_buffer_resize);
1431 1430
1431void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
1432{
1433 mutex_lock(&buffer->mutex);
1434 if (val)
1435 buffer->flags |= RB_FL_OVERWRITE;
1436 else
1437 buffer->flags &= ~RB_FL_OVERWRITE;
1438 mutex_unlock(&buffer->mutex);
1439}
1440EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
1441
1432static inline void * 1442static inline void *
1433__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) 1443__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
1434{ 1444{
@@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2162 if (likely(ts >= cpu_buffer->write_stamp)) { 2172 if (likely(ts >= cpu_buffer->write_stamp)) {
2163 delta = diff; 2173 delta = diff;
2164 if (unlikely(test_time_stamp(delta))) { 2174 if (unlikely(test_time_stamp(delta))) {
2175 int local_clock_stable = 1;
2176#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2177 local_clock_stable = sched_clock_stable;
2178#endif
2165 WARN_ONCE(delta > (1ULL << 59), 2179 WARN_ONCE(delta > (1ULL << 59),
2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", 2180 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
2167 (unsigned long long)delta, 2181 (unsigned long long)delta,
2168 (unsigned long long)ts, 2182 (unsigned long long)ts,
2169 (unsigned long long)cpu_buffer->write_stamp); 2183 (unsigned long long)cpu_buffer->write_stamp,
2184 local_clock_stable ? "" :
2185 "If you just came from a suspend/resume,\n"
2186 "please switch to the trace global clock:\n"
2187 " echo global > /sys/kernel/debug/tracing/trace_clock\n");
2170 add_timestamp = 1; 2188 add_timestamp = 1;
2171 } 2189 }
2172 } 2190 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dc53ecb80589..9541c27c1cf2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,8 +41,6 @@
41#include "trace.h" 41#include "trace.h"
42#include "trace_output.h" 42#include "trace_output.h"
43 43
44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
45
46/* 44/*
47 * On boot up, the ring buffer is set to the minimum size, so that 45 * On boot up, the ring buffer is set to the minimum size, so that
48 * we do not waste memory on systems that are not using tracing. 46 * we do not waste memory on systems that are not using tracing.
@@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
340/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
341unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
342 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
343 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
344 342
345static int trace_stop_count; 343static int trace_stop_count;
346static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_SPINLOCK(tracing_start_lock);
@@ -425,6 +423,7 @@ static const char *trace_options[] = {
425 "sleep-time", 423 "sleep-time",
426 "graph-time", 424 "graph-time",
427 "record-cmd", 425 "record-cmd",
426 "overwrite",
428 NULL 427 NULL
429}; 428};
430 429
@@ -780,6 +779,11 @@ __acquires(kernel_lock)
780 tracing_reset_online_cpus(tr); 779 tracing_reset_online_cpus(tr);
781 780
782 current_trace = type; 781 current_trace = type;
782
783 /* If we expanded the buffers, make sure the max is expanded too */
784 if (ring_buffer_expanded && type->use_max_tr)
785 ring_buffer_resize(max_tr.buffer, trace_buf_size);
786
783 /* the test is responsible for initializing and enabling */ 787 /* the test is responsible for initializing and enabling */
784 pr_info("Testing tracer %s: ", type->name); 788 pr_info("Testing tracer %s: ", type->name);
785 ret = type->selftest(type, tr); 789 ret = type->selftest(type, tr);
@@ -792,6 +796,10 @@ __acquires(kernel_lock)
792 /* Only reset on passing, to avoid touching corrupted buffers */ 796 /* Only reset on passing, to avoid touching corrupted buffers */
793 tracing_reset_online_cpus(tr); 797 tracing_reset_online_cpus(tr);
794 798
799 /* Shrink the max buffer again */
800 if (ring_buffer_expanded && type->use_max_tr)
801 ring_buffer_resize(max_tr.buffer, 1);
802
795 printk(KERN_CONT "PASSED\n"); 803 printk(KERN_CONT "PASSED\n");
796 } 804 }
797#endif 805#endif
@@ -1102,7 +1110,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1102 1110
1103 entry->preempt_count = pc & 0xff; 1111 entry->preempt_count = pc & 0xff;
1104 entry->pid = (tsk) ? tsk->pid : 0; 1112 entry->pid = (tsk) ? tsk->pid : 0;
1105 entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
1106 entry->flags = 1113 entry->flags =
1107#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1114#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1108 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1115 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1749,10 +1756,9 @@ static void print_lat_help_header(struct seq_file *m)
1749 seq_puts(m, "# | / _----=> need-resched \n"); 1756 seq_puts(m, "# | / _----=> need-resched \n");
1750 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1757 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1751 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1758 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1752 seq_puts(m, "# |||| /_--=> lock-depth \n"); 1759 seq_puts(m, "# |||| / delay \n");
1753 seq_puts(m, "# |||||/ delay \n"); 1760 seq_puts(m, "# cmd pid ||||| time | caller \n");
1754 seq_puts(m, "# cmd pid |||||| time | caller \n"); 1761 seq_puts(m, "# \\ / ||||| \\ | / \n");
1755 seq_puts(m, "# \\ / |||||| \\ | / \n");
1756} 1762}
1757 1763
1758static void print_func_help_header(struct seq_file *m) 1764static void print_func_help_header(struct seq_file *m)
@@ -2529,6 +2535,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2529 2535
2530 if (mask == TRACE_ITER_RECORD_CMD) 2536 if (mask == TRACE_ITER_RECORD_CMD)
2531 trace_event_enable_cmd_record(enabled); 2537 trace_event_enable_cmd_record(enabled);
2538
2539 if (mask == TRACE_ITER_OVERWRITE)
2540 ring_buffer_change_overwrite(global_trace.buffer, enabled);
2532} 2541}
2533 2542
2534static ssize_t 2543static ssize_t
@@ -2710,6 +2719,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2710 2719
2711 mutex_lock(&trace_types_lock); 2720 mutex_lock(&trace_types_lock);
2712 if (tracer_enabled ^ val) { 2721 if (tracer_enabled ^ val) {
2722
2723 /* Only need to warn if this is used to change the state */
2724 WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
2725
2713 if (val) { 2726 if (val) {
2714 tracer_enabled = 1; 2727 tracer_enabled = 1;
2715 if (current_trace->start) 2728 if (current_trace->start)
@@ -4551,9 +4564,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4551__init static int tracer_alloc_buffers(void) 4564__init static int tracer_alloc_buffers(void)
4552{ 4565{
4553 int ring_buf_size; 4566 int ring_buf_size;
4567 enum ring_buffer_flags rb_flags;
4554 int i; 4568 int i;
4555 int ret = -ENOMEM; 4569 int ret = -ENOMEM;
4556 4570
4571
4557 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) 4572 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
4558 goto out; 4573 goto out;
4559 4574
@@ -4566,12 +4581,13 @@ __init static int tracer_alloc_buffers(void)
4566 else 4581 else
4567 ring_buf_size = 1; 4582 ring_buf_size = 1;
4568 4583
4584 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
4585
4569 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4586 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4570 cpumask_copy(tracing_cpumask, cpu_all_mask); 4587 cpumask_copy(tracing_cpumask, cpu_all_mask);
4571 4588
4572 /* TODO: make the number of buffers hot pluggable with CPUS */ 4589 /* TODO: make the number of buffers hot pluggable with CPUS */
4573 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4590 global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
4574 TRACE_BUFFER_FLAGS);
4575 if (!global_trace.buffer) { 4591 if (!global_trace.buffer) {
4576 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 4592 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
4577 WARN_ON(1); 4593 WARN_ON(1);
@@ -4581,7 +4597,7 @@ __init static int tracer_alloc_buffers(void)
4581 4597
4582 4598
4583#ifdef CONFIG_TRACER_MAX_TRACE 4599#ifdef CONFIG_TRACER_MAX_TRACE
4584 max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); 4600 max_tr.buffer = ring_buffer_alloc(1, rb_flags);
4585 if (!max_tr.buffer) { 4601 if (!max_tr.buffer) {
4586 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4602 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
4587 WARN_ON(1); 4603 WARN_ON(1);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9021f8c0c0c3..5e9dfc6286dd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,8 +272,8 @@ struct tracer {
272 /* If you handled the flag setting, return 0 */ 272 /* If you handled the flag setting, return 0 */
273 int (*set_flag)(u32 old_flags, u32 bit, int set); 273 int (*set_flag)(u32 old_flags, u32 bit, int set);
274 struct tracer *next; 274 struct tracer *next;
275 int print_max;
276 struct tracer_flags *flags; 275 struct tracer_flags *flags;
276 int print_max;
277 int use_max_tr; 277 int use_max_tr;
278}; 278};
279 279
@@ -606,6 +606,7 @@ enum trace_iterator_flags {
606 TRACE_ITER_SLEEP_TIME = 0x40000, 606 TRACE_ITER_SLEEP_TIME = 0x40000,
607 TRACE_ITER_GRAPH_TIME = 0x80000, 607 TRACE_ITER_GRAPH_TIME = 0x80000,
608 TRACE_ITER_RECORD_CMD = 0x100000, 608 TRACE_ITER_RECORD_CMD = 0x100000,
609 TRACE_ITER_OVERWRITE = 0x200000,
609}; 610};
610 611
611/* 612/*
@@ -661,8 +662,10 @@ struct ftrace_event_field {
661}; 662};
662 663
663struct event_filter { 664struct event_filter {
664 int n_preds; 665 int n_preds; /* Number assigned */
665 struct filter_pred **preds; 666 int a_preds; /* allocated */
667 struct filter_pred *preds;
668 struct filter_pred *root;
666 char *filter_string; 669 char *filter_string;
667}; 670};
668 671
@@ -674,11 +677,23 @@ struct event_subsystem {
674 int nr_events; 677 int nr_events;
675}; 678};
676 679
680#define FILTER_PRED_INVALID ((unsigned short)-1)
681#define FILTER_PRED_IS_RIGHT (1 << 15)
682#define FILTER_PRED_FOLD (1 << 15)
683
684/*
685 * The max preds is the size of unsigned short with
686 * two flags at the MSBs. One bit is used for both the IS_RIGHT
687 * and FOLD flags. The other is reserved.
688 *
689 * 2^14 preds is way more than enough.
690 */
691#define MAX_FILTER_PRED 16384
692
677struct filter_pred; 693struct filter_pred;
678struct regex; 694struct regex;
679 695
680typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 696typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
681 int val1, int val2);
682 697
683typedef int (*regex_match_func)(char *str, struct regex *r, int len); 698typedef int (*regex_match_func)(char *str, struct regex *r, int len);
684 699
@@ -700,11 +715,23 @@ struct filter_pred {
700 filter_pred_fn_t fn; 715 filter_pred_fn_t fn;
701 u64 val; 716 u64 val;
702 struct regex regex; 717 struct regex regex;
703 char *field_name; 718 /*
719 * Leaf nodes use field_name, ops is used by AND and OR
720 * nodes. The field_name is always freed when freeing a pred.
721 * We can overload field_name for ops and have it freed
722 * as well.
723 */
724 union {
725 char *field_name;
726 unsigned short *ops;
727 };
704 int offset; 728 int offset;
705 int not; 729 int not;
706 int op; 730 int op;
707 int pop_n; 731 unsigned short index;
732 unsigned short parent;
733 unsigned short left;
734 unsigned short right;
708}; 735};
709 736
710extern struct list_head ftrace_common_fields; 737extern struct list_head ftrace_common_fields;
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 6cf223764be8..1516cb3ec549 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
109 */ 109 */
110#define FTRACE_CTX_FIELDS \ 110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \ 111 __field( unsigned int, prev_pid ) \
112 __field( unsigned int, next_pid ) \
113 __field( unsigned int, next_cpu ) \
112 __field( unsigned char, prev_prio ) \ 114 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \ 115 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \ 116 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \ 117 __field( unsigned char, next_state )
117 __field( unsigned int, next_cpu )
118 118
119FTRACE_ENTRY(context_switch, ctx_switch_entry, 119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120 120
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5f499e0438a4..e88f74fe1d4c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, lock_depth);
120 119
121 return ret; 120 return ret;
122} 121}
@@ -326,6 +325,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
326{ 325{
327 return __ftrace_set_clr_event(NULL, system, event, set); 326 return __ftrace_set_clr_event(NULL, system, event, set);
328} 327}
328EXPORT_SYMBOL_GPL(trace_set_clr_event);
329 329
330/* 128 should be much more than enough */ 330/* 128 should be much more than enough */
331#define EVENT_BUF_SIZE 127 331#define EVENT_BUF_SIZE 127
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 36d40104b17f..3249b4f77ef0 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -123,9 +123,13 @@ struct filter_parse_state {
123 } operand; 123 } operand;
124}; 124};
125 125
126struct pred_stack {
127 struct filter_pred **preds;
128 int index;
129};
130
126#define DEFINE_COMPARISON_PRED(type) \ 131#define DEFINE_COMPARISON_PRED(type) \
127static int filter_pred_##type(struct filter_pred *pred, void *event, \ 132static int filter_pred_##type(struct filter_pred *pred, void *event) \
128 int val1, int val2) \
129{ \ 133{ \
130 type *addr = (type *)(event + pred->offset); \ 134 type *addr = (type *)(event + pred->offset); \
131 type val = (type)pred->val; \ 135 type val = (type)pred->val; \
@@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \
152} 156}
153 157
154#define DEFINE_EQUALITY_PRED(size) \ 158#define DEFINE_EQUALITY_PRED(size) \
155static int filter_pred_##size(struct filter_pred *pred, void *event, \ 159static int filter_pred_##size(struct filter_pred *pred, void *event) \
156 int val1, int val2) \
157{ \ 160{ \
158 u##size *addr = (u##size *)(event + pred->offset); \ 161 u##size *addr = (u##size *)(event + pred->offset); \
159 u##size val = (u##size)pred->val; \ 162 u##size val = (u##size)pred->val; \
@@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32);
178DEFINE_EQUALITY_PRED(16); 181DEFINE_EQUALITY_PRED(16);
179DEFINE_EQUALITY_PRED(8); 182DEFINE_EQUALITY_PRED(8);
180 183
181static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
182 void *event __attribute((unused)),
183 int val1, int val2)
184{
185 return val1 && val2;
186}
187
188static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
189 void *event __attribute((unused)),
190 int val1, int val2)
191{
192 return val1 || val2;
193}
194
195/* Filter predicate for fixed sized arrays of characters */ 184/* Filter predicate for fixed sized arrays of characters */
196static int filter_pred_string(struct filter_pred *pred, void *event, 185static int filter_pred_string(struct filter_pred *pred, void *event)
197 int val1, int val2)
198{ 186{
199 char *addr = (char *)(event + pred->offset); 187 char *addr = (char *)(event + pred->offset);
200 int cmp, match; 188 int cmp, match;
@@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
207} 195}
208 196
209/* Filter predicate for char * pointers */ 197/* Filter predicate for char * pointers */
210static int filter_pred_pchar(struct filter_pred *pred, void *event, 198static int filter_pred_pchar(struct filter_pred *pred, void *event)
211 int val1, int val2)
212{ 199{
213 char **addr = (char **)(event + pred->offset); 200 char **addr = (char **)(event + pred->offset);
214 int cmp, match; 201 int cmp, match;
@@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
231 * and add it to the address of the entry, and at last we have 218 * and add it to the address of the entry, and at last we have
232 * the address of the string. 219 * the address of the string.
233 */ 220 */
234static int filter_pred_strloc(struct filter_pred *pred, void *event, 221static int filter_pred_strloc(struct filter_pred *pred, void *event)
235 int val1, int val2)
236{ 222{
237 u32 str_item = *(u32 *)(event + pred->offset); 223 u32 str_item = *(u32 *)(event + pred->offset);
238 int str_loc = str_item & 0xffff; 224 int str_loc = str_item & 0xffff;
@@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
247 return match; 233 return match;
248} 234}
249 235
250static int filter_pred_none(struct filter_pred *pred, void *event, 236static int filter_pred_none(struct filter_pred *pred, void *event)
251 int val1, int val2)
252{ 237{
253 return 0; 238 return 0;
254} 239}
@@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred)
377 pred->not ^= not; 362 pred->not ^= not;
378} 363}
379 364
365enum move_type {
366 MOVE_DOWN,
367 MOVE_UP_FROM_LEFT,
368 MOVE_UP_FROM_RIGHT
369};
370
371static struct filter_pred *
372get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
373 int index, enum move_type *move)
374{
375 if (pred->parent & FILTER_PRED_IS_RIGHT)
376 *move = MOVE_UP_FROM_RIGHT;
377 else
378 *move = MOVE_UP_FROM_LEFT;
379 pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
380
381 return pred;
382}
383
384/*
385 * A series of AND or ORs where found together. Instead of
386 * climbing up and down the tree branches, an array of the
387 * ops were made in order of checks. We can just move across
388 * the array and short circuit if needed.
389 */
390static int process_ops(struct filter_pred *preds,
391 struct filter_pred *op, void *rec)
392{
393 struct filter_pred *pred;
394 int type;
395 int match;
396 int i;
397
398 /*
399 * Micro-optimization: We set type to true if op
400 * is an OR and false otherwise (AND). Then we
401 * just need to test if the match is equal to
402 * the type, and if it is, we can short circuit the
403 * rest of the checks:
404 *
405 * if ((match && op->op == OP_OR) ||
406 * (!match && op->op == OP_AND))
407 * return match;
408 */
409 type = op->op == OP_OR;
410
411 for (i = 0; i < op->val; i++) {
412 pred = &preds[op->ops[i]];
413 match = pred->fn(pred, rec);
414 if (!!match == type)
415 return match;
416 }
417 return match;
418}
419
380/* return 1 if event matches, 0 otherwise (discard) */ 420/* return 1 if event matches, 0 otherwise (discard) */
381int filter_match_preds(struct event_filter *filter, void *rec) 421int filter_match_preds(struct event_filter *filter, void *rec)
382{ 422{
383 int match, top = 0, val1 = 0, val2 = 0; 423 int match = -1;
384 int stack[MAX_FILTER_PRED]; 424 enum move_type move = MOVE_DOWN;
425 struct filter_pred *preds;
385 struct filter_pred *pred; 426 struct filter_pred *pred;
386 int i; 427 struct filter_pred *root;
428 int n_preds;
429 int done = 0;
430
431 /* no filter is considered a match */
432 if (!filter)
433 return 1;
434
435 n_preds = filter->n_preds;
436
437 if (!n_preds)
438 return 1;
439
440 /*
441 * n_preds, root and filter->preds are protect with preemption disabled.
442 */
443 preds = rcu_dereference_sched(filter->preds);
444 root = rcu_dereference_sched(filter->root);
445 if (!root)
446 return 1;
447
448 pred = root;
387 449
388 for (i = 0; i < filter->n_preds; i++) { 450 /* match is currently meaningless */
389 pred = filter->preds[i]; 451 match = -1;
390 if (!pred->pop_n) { 452
391 match = pred->fn(pred, rec, val1, val2); 453 do {
392 stack[top++] = match; 454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460 /* keep going to down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
393 continue; 500 continue;
394 } 501 }
395 if (pred->pop_n > top) { 502 done = 1;
396 WARN_ON_ONCE(1); 503 } while (!done);
397 return 0;
398 }
399 val1 = stack[--top];
400 val2 = stack[--top];
401 match = pred->fn(pred, rec, val1, val2);
402 stack[top++] = match;
403 }
404 504
405 return stack[--top]; 505 return match;
406} 506}
407EXPORT_SYMBOL_GPL(filter_match_preds); 507EXPORT_SYMBOL_GPL(filter_match_preds);
408 508
@@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos)
414 514
415static void remove_filter_string(struct event_filter *filter) 515static void remove_filter_string(struct event_filter *filter)
416{ 516{
517 if (!filter)
518 return;
519
417 kfree(filter->filter_string); 520 kfree(filter->filter_string);
418 filter->filter_string = NULL; 521 filter->filter_string = NULL;
419} 522}
@@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps,
473 576
474void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) 577void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
475{ 578{
476 struct event_filter *filter = call->filter; 579 struct event_filter *filter;
477 580
478 mutex_lock(&event_mutex); 581 mutex_lock(&event_mutex);
582 filter = call->filter;
479 if (filter && filter->filter_string) 583 if (filter && filter->filter_string)
480 trace_seq_printf(s, "%s\n", filter->filter_string); 584 trace_seq_printf(s, "%s\n", filter->filter_string);
481 else 585 else
@@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
486void print_subsystem_event_filter(struct event_subsystem *system, 590void print_subsystem_event_filter(struct event_subsystem *system,
487 struct trace_seq *s) 591 struct trace_seq *s)
488{ 592{
489 struct event_filter *filter = system->filter; 593 struct event_filter *filter;
490 594
491 mutex_lock(&event_mutex); 595 mutex_lock(&event_mutex);
596 filter = system->filter;
492 if (filter && filter->filter_string) 597 if (filter && filter->filter_string)
493 trace_seq_printf(s, "%s\n", filter->filter_string); 598 trace_seq_printf(s, "%s\n", filter->filter_string);
494 else 599 else
@@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred)
539 pred->regex.len = 0; 644 pred->regex.len = 0;
540} 645}
541 646
542static int filter_set_pred(struct filter_pred *dest, 647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
648{
649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
650 if (!stack->preds)
651 return -ENOMEM;
652 stack->index = n_preds;
653 return 0;
654}
655
656static void __free_pred_stack(struct pred_stack *stack)
657{
658 kfree(stack->preds);
659 stack->index = 0;
660}
661
662static int __push_pred_stack(struct pred_stack *stack,
663 struct filter_pred *pred)
664{
665 int index = stack->index;
666
667 if (WARN_ON(index == 0))
668 return -ENOSPC;
669
670 stack->preds[--index] = pred;
671 stack->index = index;
672 return 0;
673}
674
675static struct filter_pred *
676__pop_pred_stack(struct pred_stack *stack)
677{
678 struct filter_pred *pred;
679 int index = stack->index;
680
681 pred = stack->preds[index++];
682 if (!pred)
683 return NULL;
684
685 stack->index = index;
686 return pred;
687}
688
689static int filter_set_pred(struct event_filter *filter,
690 int idx,
691 struct pred_stack *stack,
543 struct filter_pred *src, 692 struct filter_pred *src,
544 filter_pred_fn_t fn) 693 filter_pred_fn_t fn)
545{ 694{
695 struct filter_pred *dest = &filter->preds[idx];
696 struct filter_pred *left;
697 struct filter_pred *right;
698
546 *dest = *src; 699 *dest = *src;
547 if (src->field_name) { 700 if (src->field_name) {
548 dest->field_name = kstrdup(src->field_name, GFP_KERNEL); 701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
@@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest,
550 return -ENOMEM; 703 return -ENOMEM;
551 } 704 }
552 dest->fn = fn; 705 dest->fn = fn;
706 dest->index = idx;
553 707
554 return 0; 708 if (dest->op == OP_OR || dest->op == OP_AND) {
709 right = __pop_pred_stack(stack);
710 left = __pop_pred_stack(stack);
711 if (!left || !right)
712 return -EINVAL;
713 /*
714 * If both children can be folded
715 * and they are the same op as this op or a leaf,
716 * then this op can be folded.
717 */
718 if (left->index & FILTER_PRED_FOLD &&
719 (left->op == dest->op ||
720 left->left == FILTER_PRED_INVALID) &&
721 right->index & FILTER_PRED_FOLD &&
722 (right->op == dest->op ||
723 right->left == FILTER_PRED_INVALID))
724 dest->index |= FILTER_PRED_FOLD;
725
726 dest->left = left->index & ~FILTER_PRED_FOLD;
727 dest->right = right->index & ~FILTER_PRED_FOLD;
728 left->parent = dest->index & ~FILTER_PRED_FOLD;
729 right->parent = dest->index | FILTER_PRED_IS_RIGHT;
730 } else {
731 /*
732 * Make dest->left invalid to be used as a quick
733 * way to know this is a leaf node.
734 */
735 dest->left = FILTER_PRED_INVALID;
736
737 /* All leafs allow folding the parent ops. */
738 dest->index |= FILTER_PRED_FOLD;
739 }
740
741 return __push_pred_stack(stack, dest);
555} 742}
556 743
557static void filter_disable_preds(struct ftrace_event_call *call) 744static void __free_preds(struct event_filter *filter)
558{ 745{
559 struct event_filter *filter = call->filter;
560 int i; 746 int i;
561 747
562 call->flags &= ~TRACE_EVENT_FL_FILTERED; 748 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
751 kfree(filter->preds);
752 filter->preds = NULL;
753 }
754 filter->a_preds = 0;
563 filter->n_preds = 0; 755 filter->n_preds = 0;
564
565 for (i = 0; i < MAX_FILTER_PRED; i++)
566 filter->preds[i]->fn = filter_pred_none;
567} 756}
568 757
569static void __free_preds(struct event_filter *filter) 758static void filter_disable(struct ftrace_event_call *call)
570{ 759{
571 int i; 760 call->flags &= ~TRACE_EVENT_FL_FILTERED;
761}
572 762
763static void __free_filter(struct event_filter *filter)
764{
573 if (!filter) 765 if (!filter)
574 return; 766 return;
575 767
576 for (i = 0; i < MAX_FILTER_PRED; i++) { 768 __free_preds(filter);
577 if (filter->preds[i])
578 filter_free_pred(filter->preds[i]);
579 }
580 kfree(filter->preds);
581 kfree(filter->filter_string); 769 kfree(filter->filter_string);
582 kfree(filter); 770 kfree(filter);
583} 771}
584 772
773/*
774 * Called when destroying the ftrace_event_call.
775 * The call is being freed, so we do not need to worry about
776 * the call being currently used. This is for module code removing
777 * the tracepoints from within it.
778 */
585void destroy_preds(struct ftrace_event_call *call) 779void destroy_preds(struct ftrace_event_call *call)
586{ 780{
587 __free_preds(call->filter); 781 __free_filter(call->filter);
588 call->filter = NULL; 782 call->filter = NULL;
589 call->flags &= ~TRACE_EVENT_FL_FILTERED;
590} 783}
591 784
592static struct event_filter *__alloc_preds(void) 785static struct event_filter *__alloc_filter(void)
593{ 786{
594 struct event_filter *filter; 787 struct event_filter *filter;
788
789 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
790 return filter;
791}
792
793static int __alloc_preds(struct event_filter *filter, int n_preds)
794{
595 struct filter_pred *pred; 795 struct filter_pred *pred;
596 int i; 796 int i;
597 797
598 filter = kzalloc(sizeof(*filter), GFP_KERNEL); 798 if (filter->preds)
599 if (!filter) 799 __free_preds(filter);
600 return ERR_PTR(-ENOMEM);
601 800
602 filter->n_preds = 0; 801 filter->preds =
802 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
603 803
604 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
605 if (!filter->preds) 804 if (!filter->preds)
606 goto oom; 805 return -ENOMEM;
607 806
608 for (i = 0; i < MAX_FILTER_PRED; i++) { 807 filter->a_preds = n_preds;
609 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 808 filter->n_preds = 0;
610 if (!pred) 809
611 goto oom; 810 for (i = 0; i < n_preds; i++) {
811 pred = &filter->preds[i];
612 pred->fn = filter_pred_none; 812 pred->fn = filter_pred_none;
613 filter->preds[i] = pred;
614 } 813 }
615 814
616 return filter;
617
618oom:
619 __free_preds(filter);
620 return ERR_PTR(-ENOMEM);
621}
622
623static int init_preds(struct ftrace_event_call *call)
624{
625 if (call->filter)
626 return 0;
627
628 call->flags &= ~TRACE_EVENT_FL_FILTERED;
629 call->filter = __alloc_preds();
630 if (IS_ERR(call->filter))
631 return PTR_ERR(call->filter);
632
633 return 0; 815 return 0;
634} 816}
635 817
636static int init_subsystem_preds(struct event_subsystem *system) 818static void filter_free_subsystem_preds(struct event_subsystem *system)
637{ 819{
638 struct ftrace_event_call *call; 820 struct ftrace_event_call *call;
639 int err;
640 821
641 list_for_each_entry(call, &ftrace_events, list) { 822 list_for_each_entry(call, &ftrace_events, list) {
642 if (strcmp(call->class->system, system->name) != 0) 823 if (strcmp(call->class->system, system->name) != 0)
643 continue; 824 continue;
644 825
645 err = init_preds(call); 826 filter_disable(call);
646 if (err) 827 remove_filter_string(call->filter);
647 return err;
648 } 828 }
649
650 return 0;
651} 829}
652 830
653static void filter_free_subsystem_preds(struct event_subsystem *system) 831static void filter_free_subsystem_filters(struct event_subsystem *system)
654{ 832{
655 struct ftrace_event_call *call; 833 struct ftrace_event_call *call;
656 834
657 list_for_each_entry(call, &ftrace_events, list) { 835 list_for_each_entry(call, &ftrace_events, list) {
658 if (strcmp(call->class->system, system->name) != 0) 836 if (strcmp(call->class->system, system->name) != 0)
659 continue; 837 continue;
660 838 __free_filter(call->filter);
661 filter_disable_preds(call); 839 call->filter = NULL;
662 remove_filter_string(call->filter);
663 } 840 }
664} 841}
665 842
@@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
667 struct ftrace_event_call *call, 844 struct ftrace_event_call *call,
668 struct event_filter *filter, 845 struct event_filter *filter,
669 struct filter_pred *pred, 846 struct filter_pred *pred,
847 struct pred_stack *stack,
670 filter_pred_fn_t fn) 848 filter_pred_fn_t fn)
671{ 849{
672 int idx, err; 850 int idx, err;
673 851
674 if (filter->n_preds == MAX_FILTER_PRED) { 852 if (WARN_ON(filter->n_preds == filter->a_preds)) {
675 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
676 return -ENOSPC; 854 return -ENOSPC;
677 } 855 }
678 856
679 idx = filter->n_preds; 857 idx = filter->n_preds;
680 filter_clear_pred(filter->preds[idx]); 858 filter_clear_pred(&filter->preds[idx]);
681 err = filter_set_pred(filter->preds[idx], pred, fn); 859 err = filter_set_pred(filter, idx, stack, pred, fn);
682 if (err) 860 if (err)
683 return err; 861 return err;
684 862
@@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
763 struct ftrace_event_call *call, 941 struct ftrace_event_call *call,
764 struct event_filter *filter, 942 struct event_filter *filter,
765 struct filter_pred *pred, 943 struct filter_pred *pred,
944 struct pred_stack *stack,
766 bool dry_run) 945 bool dry_run)
767{ 946{
768 struct ftrace_event_field *field; 947 struct ftrace_event_field *field;
@@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
770 unsigned long long val; 949 unsigned long long val;
771 int ret; 950 int ret;
772 951
773 pred->fn = filter_pred_none; 952 fn = pred->fn = filter_pred_none;
774 953
775 if (pred->op == OP_AND) { 954 if (pred->op == OP_AND)
776 pred->pop_n = 2;
777 fn = filter_pred_and;
778 goto add_pred_fn; 955 goto add_pred_fn;
779 } else if (pred->op == OP_OR) { 956 else if (pred->op == OP_OR)
780 pred->pop_n = 2;
781 fn = filter_pred_or;
782 goto add_pred_fn; 957 goto add_pred_fn;
783 }
784 958
785 field = find_event_field(call, pred->field_name); 959 field = find_event_field(call, pred->field_name);
786 if (!field) { 960 if (!field) {
@@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
829 1003
830add_pred_fn: 1004add_pred_fn:
831 if (!dry_run) 1005 if (!dry_run)
832 return filter_add_pred_fn(ps, call, filter, pred, fn); 1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
833 return 0; 1007 return 0;
834} 1008}
835 1009
@@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps)
1187 return 0; 1361 return 0;
1188} 1362}
1189 1363
1364static int count_preds(struct filter_parse_state *ps)
1365{
1366 struct postfix_elt *elt;
1367 int n_preds = 0;
1368
1369 list_for_each_entry(elt, &ps->postfix, list) {
1370 if (elt->op == OP_NONE)
1371 continue;
1372 n_preds++;
1373 }
1374
1375 return n_preds;
1376}
1377
1378/*
1379 * The tree is walked at filtering of an event. If the tree is not correctly
1380 * built, it may cause an infinite loop. Check here that the tree does
1381 * indeed terminate.
1382 */
1383static int check_pred_tree(struct event_filter *filter,
1384 struct filter_pred *root)
1385{
1386 struct filter_pred *preds;
1387 struct filter_pred *pred;
1388 enum move_type move = MOVE_DOWN;
1389 int count = 0;
1390 int done = 0;
1391 int max;
1392
1393 /*
1394 * The max that we can hit a node is three times.
1395 * Once going down, once coming up from left, and
1396 * once coming up from right. This is more than enough
1397 * since leafs are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1400
1401 preds = filter->preds;
1402 if (!preds)
1403 return -EINVAL;
1404 pred = root;
1405
1406 do {
1407 if (WARN_ON(count++ > max))
1408 return -EINVAL;
1409
1410 switch (move) {
1411 case MOVE_DOWN:
1412 if (pred->left != FILTER_PRED_INVALID) {
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1435
1436 /* We are fine. */
1437 return 0;
1438}
1439
1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1441{
1442 struct filter_pred *pred;
1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1446
1447 pred = root;
1448
1449 do {
1450 switch (move) {
1451 case MOVE_DOWN:
1452 if (pred->left != FILTER_PRED_INVALID) {
1453 pred = &preds[pred->left];
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1476
1477 return count;
1478}
1479
1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1481{
1482 struct filter_pred *pred;
1483 enum move_type move = MOVE_DOWN;
1484 int count = 0;
1485 int children;
1486 int done = 0;
1487
1488 /* No need to keep the fold flag */
1489 root->index &= ~FILTER_PRED_FOLD;
1490
1491 /* If the root is a leaf then do nothing */
1492 if (root->left == FILTER_PRED_INVALID)
1493 return 0;
1494
1495 /* count the children */
1496 children = count_leafs(preds, &preds[root->left]);
1497 children += count_leafs(preds, &preds[root->right]);
1498
1499 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
1500 if (!root->ops)
1501 return -ENOMEM;
1502
1503 root->val = children;
1504
1505 pred = root;
1506 do {
1507 switch (move) {
1508 case MOVE_DOWN:
1509 if (pred->left != FILTER_PRED_INVALID) {
1510 pred = &preds[pred->left];
1511 continue;
1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1533
1534 return 0;
1535}
1536
1537/*
1538 * To optimize the processing of the ops, if we have several "ors" or
1539 * "ands" together, we can put them in an array and process them all
1540 * together speeding up the filter logic.
1541 */
1542static int fold_pred_tree(struct event_filter *filter,
1543 struct filter_pred *root)
1544{
1545 struct filter_pred *preds;
1546 struct filter_pred *pred;
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
1563 /* Folded nodes are like leafs */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1590}
1591
1190static int replace_preds(struct ftrace_event_call *call, 1592static int replace_preds(struct ftrace_event_call *call,
1191 struct event_filter *filter, 1593 struct event_filter *filter,
1192 struct filter_parse_state *ps, 1594 struct filter_parse_state *ps,
@@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call,
1195{ 1597{
1196 char *operand1 = NULL, *operand2 = NULL; 1598 char *operand1 = NULL, *operand2 = NULL;
1197 struct filter_pred *pred; 1599 struct filter_pred *pred;
1600 struct filter_pred *root;
1198 struct postfix_elt *elt; 1601 struct postfix_elt *elt;
1602 struct pred_stack stack = { }; /* init to NULL */
1199 int err; 1603 int err;
1200 int n_preds = 0; 1604 int n_preds = 0;
1201 1605
1606 n_preds = count_preds(ps);
1607 if (n_preds >= MAX_FILTER_PRED) {
1608 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1609 return -ENOSPC;
1610 }
1611
1202 err = check_preds(ps); 1612 err = check_preds(ps);
1203 if (err) 1613 if (err)
1204 return err; 1614 return err;
1205 1615
1616 if (!dry_run) {
1617 err = __alloc_pred_stack(&stack, n_preds);
1618 if (err)
1619 return err;
1620 err = __alloc_preds(filter, n_preds);
1621 if (err)
1622 goto fail;
1623 }
1624
1625 n_preds = 0;
1206 list_for_each_entry(elt, &ps->postfix, list) { 1626 list_for_each_entry(elt, &ps->postfix, list) {
1207 if (elt->op == OP_NONE) { 1627 if (elt->op == OP_NONE) {
1208 if (!operand1) 1628 if (!operand1)
@@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call,
1211 operand2 = elt->operand; 1631 operand2 = elt->operand;
1212 else { 1632 else {
1213 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); 1633 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
1214 return -EINVAL; 1634 err = -EINVAL;
1635 goto fail;
1215 } 1636 }
1216 continue; 1637 continue;
1217 } 1638 }
1218 1639
1219 if (n_preds++ == MAX_FILTER_PRED) { 1640 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1220 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 1641 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1221 return -ENOSPC; 1642 err = -ENOSPC;
1643 goto fail;
1222 } 1644 }
1223 1645
1224 if (elt->op == OP_AND || elt->op == OP_OR) { 1646 if (elt->op == OP_AND || elt->op == OP_OR) {
@@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call,
1228 1650
1229 if (!operand1 || !operand2) { 1651 if (!operand1 || !operand2) {
1230 parse_error(ps, FILT_ERR_MISSING_FIELD, 0); 1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1231 return -EINVAL; 1653 err = -EINVAL;
1654 goto fail;
1232 } 1655 }
1233 1656
1234 pred = create_pred(elt->op, operand1, operand2); 1657 pred = create_pred(elt->op, operand1, operand2);
1235add_pred: 1658add_pred:
1236 if (!pred) 1659 if (!pred) {
1237 return -ENOMEM; 1660 err = -ENOMEM;
1238 err = filter_add_pred(ps, call, filter, pred, dry_run); 1661 goto fail;
1662 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1239 filter_free_pred(pred); 1664 filter_free_pred(pred);
1240 if (err) 1665 if (err)
1241 return err; 1666 goto fail;
1242 1667
1243 operand1 = operand2 = NULL; 1668 operand1 = operand2 = NULL;
1244 } 1669 }
1245 1670
1246 return 0; 1671 if (!dry_run) {
1672 /* We should have one item left on the stack */
1673 pred = __pop_pred_stack(&stack);
1674 if (!pred)
1675 return -EINVAL;
1676 /* This item is where we start from in matching */
1677 root = pred;
1678 /* Make sure the stack is empty */
1679 pred = __pop_pred_stack(&stack);
1680 if (WARN_ON(pred)) {
1681 err = -EINVAL;
1682 filter->root = NULL;
1683 goto fail;
1684 }
1685 err = check_pred_tree(filter, root);
1686 if (err)
1687 goto fail;
1688
1689 /* Optimize the tree */
1690 err = fold_pred_tree(filter, root);
1691 if (err)
1692 goto fail;
1693
1694 /* We don't set root until we know it works */
1695 barrier();
1696 filter->root = root;
1697 }
1698
1699 err = 0;
1700fail:
1701 __free_pred_stack(&stack);
1702 return err;
1247} 1703}
1248 1704
1705struct filter_list {
1706 struct list_head list;
1707 struct event_filter *filter;
1708};
1709
1249static int replace_system_preds(struct event_subsystem *system, 1710static int replace_system_preds(struct event_subsystem *system,
1250 struct filter_parse_state *ps, 1711 struct filter_parse_state *ps,
1251 char *filter_string) 1712 char *filter_string)
1252{ 1713{
1253 struct ftrace_event_call *call; 1714 struct ftrace_event_call *call;
1715 struct filter_list *filter_item;
1716 struct filter_list *tmp;
1717 LIST_HEAD(filter_list);
1254 bool fail = true; 1718 bool fail = true;
1255 int err; 1719 int err;
1256 1720
1257 list_for_each_entry(call, &ftrace_events, list) { 1721 list_for_each_entry(call, &ftrace_events, list) {
1258 struct event_filter *filter = call->filter;
1259 1722
1260 if (strcmp(call->class->system, system->name) != 0) 1723 if (strcmp(call->class->system, system->name) != 0)
1261 continue; 1724 continue;
1262 1725
1263 /* try to see if the filter can be applied */ 1726 /*
1264 err = replace_preds(call, filter, ps, filter_string, true); 1727 * Try to see if the filter can be applied
1728 * (filter arg is ignored on dry_run)
1729 */
1730 err = replace_preds(call, NULL, ps, filter_string, true);
1265 if (err) 1731 if (err)
1732 goto fail;
1733 }
1734
1735 list_for_each_entry(call, &ftrace_events, list) {
1736 struct event_filter *filter;
1737
1738 if (strcmp(call->class->system, system->name) != 0)
1266 continue; 1739 continue;
1267 1740
1268 /* really apply the filter */ 1741 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1269 filter_disable_preds(call); 1742 if (!filter_item)
1270 err = replace_preds(call, filter, ps, filter_string, false); 1743 goto fail_mem;
1744
1745 list_add_tail(&filter_item->list, &filter_list);
1746
1747 filter_item->filter = __alloc_filter();
1748 if (!filter_item->filter)
1749 goto fail_mem;
1750 filter = filter_item->filter;
1751
1752 /* Can only fail on no memory */
1753 err = replace_filter_string(filter, filter_string);
1271 if (err) 1754 if (err)
1272 filter_disable_preds(call); 1755 goto fail_mem;
1273 else { 1756
1757 err = replace_preds(call, filter, ps, filter_string, false);
1758 if (err) {
1759 filter_disable(call);
1760 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1761 append_filter_err(ps, filter);
1762 } else
1274 call->flags |= TRACE_EVENT_FL_FILTERED; 1763 call->flags |= TRACE_EVENT_FL_FILTERED;
1275 replace_filter_string(filter, filter_string); 1764 /*
1276 } 1765 * Regardless of if this returned an error, we still
1766 * replace the filter for the call.
1767 */
1768 filter = call->filter;
1769 call->filter = filter_item->filter;
1770 filter_item->filter = filter;
1771
1277 fail = false; 1772 fail = false;
1278 } 1773 }
1279 1774
1280 if (fail) { 1775 if (fail)
1281 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1776 goto fail;
1282 return -EINVAL; 1777
1778 /*
1779 * The calls can still be using the old filters.
1780 * Do a synchronize_sched() to ensure all calls are
1781 * done with them before we free them.
1782 */
1783 synchronize_sched();
1784 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1785 __free_filter(filter_item->filter);
1786 list_del(&filter_item->list);
1787 kfree(filter_item);
1283 } 1788 }
1284 return 0; 1789 return 0;
1790 fail:
1791 /* No call succeeded */
1792 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1793 list_del(&filter_item->list);
1794 kfree(filter_item);
1795 }
1796 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1797 return -EINVAL;
1798 fail_mem:
1799 /* If any call succeeded, we still need to sync */
1800 if (!fail)
1801 synchronize_sched();
1802 list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
1803 __free_filter(filter_item->filter);
1804 list_del(&filter_item->list);
1805 kfree(filter_item);
1806 }
1807 return -ENOMEM;
1285} 1808}
1286 1809
1287int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1810int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1288{ 1811{
1289 int err;
1290 struct filter_parse_state *ps; 1812 struct filter_parse_state *ps;
1813 struct event_filter *filter;
1814 struct event_filter *tmp;
1815 int err = 0;
1291 1816
1292 mutex_lock(&event_mutex); 1817 mutex_lock(&event_mutex);
1293 1818
1294 err = init_preds(call);
1295 if (err)
1296 goto out_unlock;
1297
1298 if (!strcmp(strstrip(filter_string), "0")) { 1819 if (!strcmp(strstrip(filter_string), "0")) {
1299 filter_disable_preds(call); 1820 filter_disable(call);
1300 remove_filter_string(call->filter); 1821 filter = call->filter;
1822 if (!filter)
1823 goto out_unlock;
1824 call->filter = NULL;
1825 /* Make sure the filter is not being used */
1826 synchronize_sched();
1827 __free_filter(filter);
1301 goto out_unlock; 1828 goto out_unlock;
1302 } 1829 }
1303 1830
@@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1306 if (!ps) 1833 if (!ps)
1307 goto out_unlock; 1834 goto out_unlock;
1308 1835
1309 filter_disable_preds(call); 1836 filter = __alloc_filter();
1310 replace_filter_string(call->filter, filter_string); 1837 if (!filter) {
1838 kfree(ps);
1839 goto out_unlock;
1840 }
1841
1842 replace_filter_string(filter, filter_string);
1311 1843
1312 parse_init(ps, filter_ops, filter_string); 1844 parse_init(ps, filter_ops, filter_string);
1313 err = filter_parse(ps); 1845 err = filter_parse(ps);
1314 if (err) { 1846 if (err) {
1315 append_filter_err(ps, call->filter); 1847 append_filter_err(ps, filter);
1316 goto out; 1848 goto out;
1317 } 1849 }
1318 1850
1319 err = replace_preds(call, call->filter, ps, filter_string, false); 1851 err = replace_preds(call, filter, ps, filter_string, false);
1320 if (err) 1852 if (err) {
1321 append_filter_err(ps, call->filter); 1853 filter_disable(call);
1322 else 1854 append_filter_err(ps, filter);
1855 } else
1323 call->flags |= TRACE_EVENT_FL_FILTERED; 1856 call->flags |= TRACE_EVENT_FL_FILTERED;
1324out: 1857out:
1858 /*
1859 * Always swap the call filter with the new filter
1860 * even if there was an error. If there was an error
1861 * in the filter, we disable the filter and show the error
1862 * string
1863 */
1864 tmp = call->filter;
1865 call->filter = filter;
1866 if (tmp) {
1867 /* Make sure the call is done with the filter */
1868 synchronize_sched();
1869 __free_filter(tmp);
1870 }
1325 filter_opstack_clear(ps); 1871 filter_opstack_clear(ps);
1326 postfix_clear(ps); 1872 postfix_clear(ps);
1327 kfree(ps); 1873 kfree(ps);
@@ -1334,18 +1880,21 @@ out_unlock:
1334int apply_subsystem_event_filter(struct event_subsystem *system, 1880int apply_subsystem_event_filter(struct event_subsystem *system,
1335 char *filter_string) 1881 char *filter_string)
1336{ 1882{
1337 int err;
1338 struct filter_parse_state *ps; 1883 struct filter_parse_state *ps;
1884 struct event_filter *filter;
1885 int err = 0;
1339 1886
1340 mutex_lock(&event_mutex); 1887 mutex_lock(&event_mutex);
1341 1888
1342 err = init_subsystem_preds(system);
1343 if (err)
1344 goto out_unlock;
1345
1346 if (!strcmp(strstrip(filter_string), "0")) { 1889 if (!strcmp(strstrip(filter_string), "0")) {
1347 filter_free_subsystem_preds(system); 1890 filter_free_subsystem_preds(system);
1348 remove_filter_string(system->filter); 1891 remove_filter_string(system->filter);
1892 filter = system->filter;
1893 system->filter = NULL;
1894 /* Ensure all filters are no longer used */
1895 synchronize_sched();
1896 filter_free_subsystem_filters(system);
1897 __free_filter(filter);
1349 goto out_unlock; 1898 goto out_unlock;
1350 } 1899 }
1351 1900
@@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1354 if (!ps) 1903 if (!ps)
1355 goto out_unlock; 1904 goto out_unlock;
1356 1905
1357 replace_filter_string(system->filter, filter_string); 1906 filter = __alloc_filter();
1907 if (!filter)
1908 goto out;
1909
1910 replace_filter_string(filter, filter_string);
1911 /*
1912 * No event actually uses the system filter
1913 * we can free it without synchronize_sched().
1914 */
1915 __free_filter(system->filter);
1916 system->filter = filter;
1358 1917
1359 parse_init(ps, filter_ops, filter_string); 1918 parse_init(ps, filter_ops, filter_string);
1360 err = filter_parse(ps); 1919 err = filter_parse(ps);
@@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event)
1384 struct event_filter *filter = event->filter; 1943 struct event_filter *filter = event->filter;
1385 1944
1386 event->filter = NULL; 1945 event->filter = NULL;
1387 __free_preds(filter); 1946 __free_filter(filter);
1388} 1947}
1389 1948
1390int ftrace_profile_set_filter(struct perf_event *event, int event_id, 1949int ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1410 if (event->filter) 1969 if (event->filter)
1411 goto out_unlock; 1970 goto out_unlock;
1412 1971
1413 filter = __alloc_preds(); 1972 filter = __alloc_filter();
1414 if (IS_ERR(filter)) { 1973 if (!filter) {
1415 err = PTR_ERR(filter); 1974 err = PTR_ERR(filter);
1416 goto out_unlock; 1975 goto out_unlock;
1417 } 1976 }
@@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1419 err = -ENOMEM; 1978 err = -ENOMEM;
1420 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 1979 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1421 if (!ps) 1980 if (!ps)
1422 goto free_preds; 1981 goto free_filter;
1423 1982
1424 parse_init(ps, filter_ops, filter_str); 1983 parse_init(ps, filter_ops, filter_str);
1425 err = filter_parse(ps); 1984 err = filter_parse(ps);
@@ -1435,9 +1994,9 @@ free_ps:
1435 postfix_clear(ps); 1994 postfix_clear(ps);
1436 kfree(ps); 1995 kfree(ps);
1437 1996
1438free_preds: 1997free_filter:
1439 if (err) 1998 if (err)
1440 __free_preds(filter); 1999 __free_filter(filter);
1441 2000
1442out_unlock: 2001out_unlock:
1443 mutex_unlock(&event_mutex); 2002 mutex_unlock(&event_mutex);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2dec9bcde8b4..8435b43b1782 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
353 kfree(data); 353 kfree(data);
354} 354}
355 355
356/* Bitfield fetch function */
357struct bitfield_fetch_param {
358 struct fetch_param orig;
359 unsigned char hi_shift;
360 unsigned char low_shift;
361};
362
363#define DEFINE_FETCH_bitfield(type) \
364static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
365 void *data, void *dest) \
366{ \
367 struct bitfield_fetch_param *bprm = data; \
368 type buf = 0; \
369 call_fetch(&bprm->orig, regs, &buf); \
370 if (buf) { \
371 buf <<= bprm->hi_shift; \
372 buf >>= bprm->low_shift; \
373 } \
374 *(type *)dest = buf; \
375}
376DEFINE_BASIC_FETCH_FUNCS(bitfield)
377#define fetch_bitfield_string NULL
378#define fetch_bitfield_string_size NULL
379
380static __kprobes void
381free_bitfield_fetch_param(struct bitfield_fetch_param *data)
382{
383 /*
384 * Don't check the bitfield itself, because this must be the
385 * last fetch function.
386 */
387 if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
388 free_deref_fetch_param(data->orig.data);
389 else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
390 free_symbol_cache(data->orig.data);
391 kfree(data);
392}
356/* Default (unsigned long) fetch type */ 393/* Default (unsigned long) fetch type */
357#define __DEFAULT_FETCH_TYPE(t) u##t 394#define __DEFAULT_FETCH_TYPE(t) u##t
358#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) 395#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -367,6 +404,7 @@ enum {
367 FETCH_MTD_memory, 404 FETCH_MTD_memory,
368 FETCH_MTD_symbol, 405 FETCH_MTD_symbol,
369 FETCH_MTD_deref, 406 FETCH_MTD_deref,
407 FETCH_MTD_bitfield,
370 FETCH_MTD_END, 408 FETCH_MTD_END,
371}; 409};
372 410
@@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \
387ASSIGN_FETCH_FUNC(memory, ftype), \ 425ASSIGN_FETCH_FUNC(memory, ftype), \
388ASSIGN_FETCH_FUNC(symbol, ftype), \ 426ASSIGN_FETCH_FUNC(symbol, ftype), \
389ASSIGN_FETCH_FUNC(deref, ftype), \ 427ASSIGN_FETCH_FUNC(deref, ftype), \
428ASSIGN_FETCH_FUNC(bitfield, ftype), \
390 } \ 429 } \
391 } 430 }
392 431
@@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type)
430 if (!type) 469 if (!type)
431 type = DEFAULT_FETCH_TYPE_STR; 470 type = DEFAULT_FETCH_TYPE_STR;
432 471
472 /* Special case: bitfield */
473 if (*type == 'b') {
474 unsigned long bs;
475 type = strchr(type, '/');
476 if (!type)
477 goto fail;
478 type++;
479 if (strict_strtoul(type, 0, &bs))
480 goto fail;
481 switch (bs) {
482 case 8:
483 return find_fetch_type("u8");
484 case 16:
485 return find_fetch_type("u16");
486 case 32:
487 return find_fetch_type("u32");
488 case 64:
489 return find_fetch_type("u64");
490 default:
491 goto fail;
492 }
493 }
494
433 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) 495 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
434 if (strcmp(type, fetch_type_table[i].name) == 0) 496 if (strcmp(type, fetch_type_table[i].name) == 0)
435 return &fetch_type_table[i]; 497 return &fetch_type_table[i];
498fail:
436 return NULL; 499 return NULL;
437} 500}
438 501
@@ -586,7 +649,9 @@ error:
586 649
587static void free_probe_arg(struct probe_arg *arg) 650static void free_probe_arg(struct probe_arg *arg)
588{ 651{
589 if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) 652 if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
653 free_bitfield_fetch_param(arg->fetch.data);
654 else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
590 free_deref_fetch_param(arg->fetch.data); 655 free_deref_fetch_param(arg->fetch.data);
591 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) 656 else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
592 free_symbol_cache(arg->fetch.data); 657 free_symbol_cache(arg->fetch.data);
@@ -767,16 +832,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
767 } 832 }
768 break; 833 break;
769 case '+': /* deref memory */ 834 case '+': /* deref memory */
835 arg++; /* Skip '+', because strict_strtol() rejects it. */
770 case '-': 836 case '-':
771 tmp = strchr(arg, '('); 837 tmp = strchr(arg, '(');
772 if (!tmp) 838 if (!tmp)
773 break; 839 break;
774 *tmp = '\0'; 840 *tmp = '\0';
775 ret = strict_strtol(arg + 1, 0, &offset); 841 ret = strict_strtol(arg, 0, &offset);
776 if (ret) 842 if (ret)
777 break; 843 break;
778 if (arg[0] == '-')
779 offset = -offset;
780 arg = tmp + 1; 844 arg = tmp + 1;
781 tmp = strrchr(arg, ')'); 845 tmp = strrchr(arg, ')');
782 if (tmp) { 846 if (tmp) {
@@ -807,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
807 return ret; 871 return ret;
808} 872}
809 873
874#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
875
876/* Bitfield type needs to be parsed into a fetch function */
877static int __parse_bitfield_probe_arg(const char *bf,
878 const struct fetch_type *t,
879 struct fetch_param *f)
880{
881 struct bitfield_fetch_param *bprm;
882 unsigned long bw, bo;
883 char *tail;
884
885 if (*bf != 'b')
886 return 0;
887
888 bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
889 if (!bprm)
890 return -ENOMEM;
891 bprm->orig = *f;
892 f->fn = t->fetch[FETCH_MTD_bitfield];
893 f->data = (void *)bprm;
894
895 bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
896 if (bw == 0 || *tail != '@')
897 return -EINVAL;
898
899 bf = tail + 1;
900 bo = simple_strtoul(bf, &tail, 0);
901 if (tail == bf || *tail != '/')
902 return -EINVAL;
903
904 bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
905 bprm->low_shift = bprm->hi_shift + bo;
906 return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
907}
908
810/* String length checking wrapper */ 909/* String length checking wrapper */
811static int parse_probe_arg(char *arg, struct trace_probe *tp, 910static int parse_probe_arg(char *arg, struct trace_probe *tp,
812 struct probe_arg *parg, int is_return) 911 struct probe_arg *parg, int is_return)
@@ -836,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
836 parg->offset = tp->size; 935 parg->offset = tp->size;
837 tp->size += parg->type->size; 936 tp->size += parg->type->size;
838 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); 937 ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
938 if (ret >= 0 && t != NULL)
939 ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
839 if (ret >= 0) { 940 if (ret >= 0) {
840 parg->fetch_size.fn = get_fetch_size_function(parg->type, 941 parg->fetch_size.fn = get_fetch_size_function(parg->type,
841 parg->fetch.fn); 942 parg->fetch.fn);
@@ -1130,7 +1231,7 @@ static int command_trace_probe(const char *buf)
1130 return ret; 1231 return ret;
1131} 1232}
1132 1233
1133#define WRITE_BUFSIZE 128 1234#define WRITE_BUFSIZE 4096
1134 1235
1135static ssize_t probes_write(struct file *file, const char __user *buffer, 1236static ssize_t probes_write(struct file *file, const char __user *buffer,
1136 size_t count, loff_t *ppos) 1237 size_t count, loff_t *ppos)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02272baa2206..456be9063c2d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -529,24 +529,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
529 * @entry: The trace entry field from the ring buffer 529 * @entry: The trace entry field from the ring buffer
530 * 530 *
531 * Prints the generic fields of irqs off, in hard or softirq, preempt 531 * Prints the generic fields of irqs off, in hard or softirq, preempt
532 * count and lock depth. 532 * count.
533 */ 533 */
534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) 534int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
535{ 535{
536 int hardirq, softirq; 536 char hardsoft_irq;
537 char need_resched;
538 char irqs_off;
539 int hardirq;
540 int softirq;
537 int ret; 541 int ret;
538 542
539 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 543 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
540 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 544 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
541 545
546 irqs_off =
547 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
548 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
549 '.';
550 need_resched =
551 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
552 hardsoft_irq =
553 (hardirq && softirq) ? 'H' :
554 hardirq ? 'h' :
555 softirq ? 's' :
556 '.';
557
542 if (!trace_seq_printf(s, "%c%c%c", 558 if (!trace_seq_printf(s, "%c%c%c",
543 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 559 irqs_off, need_resched, hardsoft_irq))
544 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
545 'X' : '.',
546 (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
547 'N' : '.',
548 (hardirq && softirq) ? 'H' :
549 hardirq ? 'h' : softirq ? 's' : '.'))
550 return 0; 560 return 0;
551 561
552 if (entry->preempt_count) 562 if (entry->preempt_count)
@@ -554,13 +564,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
554 else 564 else
555 ret = trace_seq_putc(s, '.'); 565 ret = trace_seq_putc(s, '.');
556 566
557 if (!ret) 567 return ret;
558 return 0;
559
560 if (entry->lock_depth < 0)
561 return trace_seq_putc(s, '.');
562
563 return trace_seq_printf(s, "%d", entry->lock_depth);
564} 568}
565 569
566static int 570static int
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8f758d070c43..7e62c0a18456 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
247 ctx_trace = tr; 247 ctx_trace = tr;
248} 248}
249 249
250static void stop_sched_trace(struct trace_array *tr)
251{
252 tracing_stop_sched_switch_record();
253}
254
255static int sched_switch_trace_init(struct trace_array *tr)
256{
257 ctx_trace = tr;
258 tracing_reset_online_cpus(tr);
259 tracing_start_sched_switch_record();
260 return 0;
261}
262
263static void sched_switch_trace_reset(struct trace_array *tr)
264{
265 if (sched_ref)
266 stop_sched_trace(tr);
267}
268
269static void sched_switch_trace_start(struct trace_array *tr)
270{
271 sched_stopped = 0;
272}
273
274static void sched_switch_trace_stop(struct trace_array *tr)
275{
276 sched_stopped = 1;
277}
278
279static struct tracer sched_switch_trace __read_mostly =
280{
281 .name = "sched_switch",
282 .init = sched_switch_trace_init,
283 .reset = sched_switch_trace_reset,
284 .start = sched_switch_trace_start,
285 .stop = sched_switch_trace_stop,
286 .wait_pipe = poll_wait_pipe,
287#ifdef CONFIG_FTRACE_SELFTEST
288 .selftest = trace_selftest_startup_sched_switch,
289#endif
290};
291
292__init static int init_sched_switch_trace(void)
293{
294 return register_tracer(&sched_switch_trace);
295}
296device_initcall(init_sched_switch_trace);
297
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5c9fe08d2093..ee7b5a0bb9f8 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[];
60 60
61static struct syscall_metadata **syscalls_metadata; 61static struct syscall_metadata **syscalls_metadata;
62 62
63#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
64static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
65{
66 /*
67 * Only compare after the "sys" prefix. Archs that use
68 * syscall wrappers may have syscalls symbols aliases prefixed
69 * with "SyS" instead of "sys", leading to an unwanted
70 * mismatch.
71 */
72 return !strcmp(sym + 3, name + 3);
73}
74#endif
75
63static __init struct syscall_metadata * 76static __init struct syscall_metadata *
64find_syscall_meta(unsigned long syscall) 77find_syscall_meta(unsigned long syscall)
65{ 78{
@@ -72,14 +85,11 @@ find_syscall_meta(unsigned long syscall)
72 stop = __stop_syscalls_metadata; 85 stop = __stop_syscalls_metadata;
73 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 86 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
74 87
88 if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
89 return NULL;
90
75 for ( ; start < stop; start++) { 91 for ( ; start < stop; start++) {
76 /* 92 if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
77 * Only compare after the "sys" prefix. Archs that use
78 * syscall wrappers may have syscalls symbols aliases prefixed
79 * with "SyS" instead of "sys", leading to an unwanted
80 * mismatch.
81 */
82 if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
83 return *start; 93 return *start;
84 } 94 }
85 return NULL; 95 return NULL;
@@ -359,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
359 int num; 369 int num;
360 370
361 num = ((struct syscall_metadata *)call->data)->syscall_nr; 371 num = ((struct syscall_metadata *)call->data)->syscall_nr;
362 if (num < 0 || num >= NR_syscalls) 372 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
363 return -ENOSYS; 373 return -ENOSYS;
364 mutex_lock(&syscall_trace_lock); 374 mutex_lock(&syscall_trace_lock);
365 if (!sys_refcount_enter) 375 if (!sys_refcount_enter)
@@ -377,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
377 int num; 387 int num;
378 388
379 num = ((struct syscall_metadata *)call->data)->syscall_nr; 389 num = ((struct syscall_metadata *)call->data)->syscall_nr;
380 if (num < 0 || num >= NR_syscalls) 390 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
381 return; 391 return;
382 mutex_lock(&syscall_trace_lock); 392 mutex_lock(&syscall_trace_lock);
383 sys_refcount_enter--; 393 sys_refcount_enter--;
@@ -393,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
393 int num; 403 int num;
394 404
395 num = ((struct syscall_metadata *)call->data)->syscall_nr; 405 num = ((struct syscall_metadata *)call->data)->syscall_nr;
396 if (num < 0 || num >= NR_syscalls) 406 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
397 return -ENOSYS; 407 return -ENOSYS;
398 mutex_lock(&syscall_trace_lock); 408 mutex_lock(&syscall_trace_lock);
399 if (!sys_refcount_exit) 409 if (!sys_refcount_exit)
@@ -411,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
411 int num; 421 int num;
412 422
413 num = ((struct syscall_metadata *)call->data)->syscall_nr; 423 num = ((struct syscall_metadata *)call->data)->syscall_nr;
414 if (num < 0 || num >= NR_syscalls) 424 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
415 return; 425 return;
416 mutex_lock(&syscall_trace_lock); 426 mutex_lock(&syscall_trace_lock);
417 sys_refcount_exit--; 427 sys_refcount_exit--;
@@ -424,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
424int init_syscall_trace(struct ftrace_event_call *call) 434int init_syscall_trace(struct ftrace_event_call *call)
425{ 435{
426 int id; 436 int id;
437 int num;
438
439 num = ((struct syscall_metadata *)call->data)->syscall_nr;
440 if (num < 0 || num >= NR_syscalls) {
441 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
442 ((struct syscall_metadata *)call->data)->name);
443 return -ENOSYS;
444 }
427 445
428 if (set_syscall_print_fmt(call) < 0) 446 if (set_syscall_print_fmt(call) < 0)
429 return -ENOMEM; 447 return -ENOMEM;
@@ -438,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call)
438 return id; 456 return id;
439} 457}
440 458
441unsigned long __init arch_syscall_addr(int nr) 459unsigned long __init __weak arch_syscall_addr(int nr)
442{ 460{
443 return (unsigned long)sys_call_table[nr]; 461 return (unsigned long)sys_call_table[nr];
444} 462}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee6578b578ad..b5fe4c00eb3c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -316,6 +316,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
316 316
317static struct debug_obj_descr work_debug_descr; 317static struct debug_obj_descr work_debug_descr;
318 318
319static void *work_debug_hint(void *addr)
320{
321 return ((struct work_struct *) addr)->func;
322}
323
319/* 324/*
320 * fixup_init is called when: 325 * fixup_init is called when:
321 * - an active object is initialized 326 * - an active object is initialized
@@ -387,6 +392,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
387 392
388static struct debug_obj_descr work_debug_descr = { 393static struct debug_obj_descr work_debug_descr = {
389 .name = "work_struct", 394 .name = "work_struct",
395 .debug_hint = work_debug_hint,
390 .fixup_init = work_fixup_init, 396 .fixup_init = work_fixup_init,
391 .fixup_activate = work_fixup_activate, 397 .fixup_activate = work_fixup_activate,
392 .fixup_free = work_fixup_free, 398 .fixup_free = work_fixup_free,