author	Thomas Gleixner <tglx@linutronix.de>	2008-10-20 07:28:56 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2008-10-20 07:28:56 -0400
commit	b6a4b7de4cb45ccf7157fc58de09c96f84d67108 (patch)
tree	016630974bdcb00fe529b673f96d389e0fd6dc94 /kernel
parent	2778d0d51dd5007c4909c1d9874f5e9097785a7a (diff)
parent	651dab4264e4ba0e563f5ff56f748127246e9065 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/arjan/linux-2.6-hrtimer into timers/range-hrtimers
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c  2
-rw-r--r--  kernel/auditsc.c  9
-rw-r--r--  kernel/cgroup.c  9
-rw-r--r--  kernel/compat.c  58
-rw-r--r--  kernel/cpu.c  24
-rw-r--r--  kernel/cpuset.c  351
-rw-r--r--  kernel/dma-coherent.c  2
-rw-r--r--  kernel/dma.c  2
-rw-r--r--  kernel/exit.c  21
-rw-r--r--  kernel/fork.c  5
-rw-r--r--  kernel/hrtimer.c  95
-rw-r--r--  kernel/irq/manage.c  9
-rw-r--r--  kernel/kallsyms.c  1
-rw-r--r--  kernel/kexec.c  8
-rw-r--r--  kernel/kgdb.c  13
-rw-r--r--  kernel/kmod.c  67
-rw-r--r--  kernel/kprobes.c  2
-rw-r--r--  kernel/ksysfs.c  35
-rw-r--r--  kernel/module.c  45
-rw-r--r--  kernel/panic.c  65
-rw-r--r--  kernel/posix-timers.c  2
-rw-r--r--  kernel/power/disk.c  11
-rw-r--r--  kernel/power/main.c  7
-rw-r--r--  kernel/power/user.c  10
-rw-r--r--  kernel/printk.c  42
-rw-r--r--  kernel/profile.c  41
-rw-r--r--  kernel/rcuclassic.c  337
-rw-r--r--  kernel/rcupreempt.c  8
-rw-r--r--  kernel/rcupreempt_trace.c  7
-rw-r--r--  kernel/resource.c  150
-rw-r--r--  kernel/sched.c  408
-rw-r--r--  kernel/sched_clock.c  6
-rw-r--r--  kernel/sched_debug.c  2
-rw-r--r--  kernel/sched_fair.c  234
-rw-r--r--  kernel/sched_features.h  1
-rw-r--r--  kernel/sched_idletask.c  6
-rw-r--r--  kernel/sched_rt.c  58
-rw-r--r--  kernel/softirq.c  13
-rw-r--r--  kernel/softlockup.c  2
-rw-r--r--  kernel/sys.c  38
-rw-r--r--  kernel/sys_ni.c  6
-rw-r--r--  kernel/sysctl.c  123
-rw-r--r--  kernel/time/Kconfig  1
-rw-r--r--  kernel/time/clockevents.c  15
-rw-r--r--  kernel/time/ntp.c  2
-rw-r--r--  kernel/time/tick-broadcast.c  99
-rw-r--r--  kernel/time/tick-common.c  15
-rw-r--r--  kernel/time/tick-internal.h  11
-rw-r--r--  kernel/time/tick-oneshot.c  44
-rw-r--r--  kernel/time/tick-sched.c  26
-rw-r--r--  kernel/timer.c  1
-rw-r--r--  kernel/trace/trace_sysprof.c  2
-rw-r--r--  kernel/user.c  4
-rw-r--r--  kernel/utsname_sysctl.c  5
-rw-r--r--  kernel/wait.c  14
-rw-r--r--  kernel/workqueue.c  2
56 files changed, 1676 insertions(+), 900 deletions(-)
diff --git a/kernel/acct.c b/kernel/acct.c
index dd68b9059418..f6006a60df5d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -548,7 +548,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 #endif
 
 	spin_lock_irq(&current->sighand->siglock);
-	tty = current->signal->tty;
+	tty = current->signal->tty;	/* Safe as we hold the siglock */
 	ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
 	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
 	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 59cedfb040e7..cf5bc2f5f9c3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -246,8 +246,8 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
 	unsigned n;
 	if (unlikely(!ctx))
 		return 0;
-
 	n = ctx->major;
+
 	switch (audit_classify_syscall(ctx->arch, n)) {
 	case 0: /* native */
 		if ((mask & AUDIT_PERM_WRITE) &&
@@ -1204,13 +1204,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
 		 context->return_code);
 
-	mutex_lock(&tty_mutex);
-	read_lock(&tasklist_lock);
+	spin_lock_irq(&tsk->sighand->siglock);
 	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
 		tty = tsk->signal->tty->name;
 	else
 		tty = "(none)";
-	read_unlock(&tasklist_lock);
+	spin_unlock_irq(&tsk->sighand->siglock);
+
 	audit_log_format(ab,
 		 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
 		 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
@@ -1230,7 +1230,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		 context->egid, context->sgid, context->fsgid, tty,
 		 tsk->sessionid);
 
-	mutex_unlock(&tty_mutex);
 
 	audit_log_task_info(ab, tsk);
 	if (context->filterkey) {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 13932abde159..8c6e1c17e6d3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2735,21 +2735,24 @@ void cgroup_fork_callbacks(struct task_struct *child)
  * Called on every change to mm->owner. mm_init_owner() does not
  * invoke this routine, since it assigns the mm->owner the first time
  * and does not change it.
+ *
+ * The callbacks are invoked with mmap_sem held in read mode.
  */
 void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
 {
-	struct cgroup *oldcgrp, *newcgrp;
+	struct cgroup *oldcgrp, *newcgrp = NULL;
 
 	if (need_mm_owner_callback) {
 		int i;
 		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 			oldcgrp = task_cgroup(old, ss->subsys_id);
-			newcgrp = task_cgroup(new, ss->subsys_id);
+			if (new)
+				newcgrp = task_cgroup(new, ss->subsys_id);
 			if (oldcgrp == newcgrp)
 				continue;
 			if (ss->mm_owner_changed)
-				ss->mm_owner_changed(ss, oldcgrp, newcgrp);
+				ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
 		}
 	}
 }
diff --git a/kernel/compat.c b/kernel/compat.c
index 32c254a8ab9a..143990e48cb9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -26,6 +26,64 @@
 
 #include <asm/uaccess.h>
 
+/*
+ * Note that the native side is already converted to a timespec, because
+ * that's what we want anyway.
+ */
+static int compat_get_timeval(struct timespec *o,
+		struct compat_timeval __user *i)
+{
+	long usec;
+
+	if (get_user(o->tv_sec, &i->tv_sec) ||
+	    get_user(usec, &i->tv_usec))
+		return -EFAULT;
+	o->tv_nsec = usec * 1000;
+	return 0;
+}
+
+static int compat_put_timeval(struct compat_timeval __user *o,
+		struct timeval *i)
+{
+	return (put_user(i->tv_sec, &o->tv_sec) ||
+		put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
+}
+
+asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
+		struct timezone __user *tz)
+{
+	if (tv) {
+		struct timeval ktv;
+		do_gettimeofday(&ktv);
+		if (compat_put_timeval(tv, &ktv))
+			return -EFAULT;
+	}
+	if (tz) {
+		if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
+		struct timezone __user *tz)
+{
+	struct timespec kts;
+	struct timezone ktz;
+
+	if (tv) {
+		if (compat_get_timeval(&kts, tv))
+			return -EFAULT;
+	}
+	if (tz) {
+		if (copy_from_user(&ktz, tz, sizeof(ktz)))
+			return -EFAULT;
+	}
+
+	return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
+}
+
 int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
 {
 	return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
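The new compat wrappers are mostly a unit conversion: the 32-bit userspace timeval carries microseconds, while the native path wants a timespec in nanoseconds, so compat_get_timeval() copies the seconds and scales the microseconds by 1000 before handing the result to do_sys_settimeofday(). A minimal userspace sketch of that conversion is below; the struct and function names are illustrative stand-ins, not the kernel's compat_timeval/timespec types.

#include <stdio.h>

/* Illustrative only: mirrors the unit conversion done by compat_get_timeval()
 * above (seconds copied, microseconds scaled to nanoseconds). */
struct demo_timeval  { long tv_sec; long tv_usec; };
struct demo_timespec { long tv_sec; long tv_nsec; };

static void demo_get_timeval(struct demo_timespec *o, const struct demo_timeval *i)
{
	o->tv_sec  = i->tv_sec;
	o->tv_nsec = i->tv_usec * 1000;	/* 1 microsecond = 1000 nanoseconds */
}

int main(void)
{
	struct demo_timeval tv = { .tv_sec = 1224505736, .tv_usec = 250000 };
	struct demo_timespec ts;

	demo_get_timeval(&ts, &tv);
	printf("%ld s + %ld ns\n", ts.tv_sec, ts.tv_nsec);
	return 0;
}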
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f17e9854c246..86d49045daed 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param)
 	struct take_cpu_down_param *param = _param;
 	int err;
 
-	raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
-				param->hcpu);
 	/* Ensure this CPU doesn't handle any more interrupts. */
 	err = __cpu_disable();
 	if (err < 0)
 		return err;
 
+	raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
+				param->hcpu);
+
 	/* Force idle task to run as soon as we yield: it should
 	   immediately notice cpu is offline and die quickly. */
 	sched_idle_next();
@@ -453,6 +454,25 @@ out:
 }
 #endif /* CONFIG_PM_SLEEP_SMP */
 
+/**
+ * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
+ * @cpu: cpu that just started
+ *
+ * This function calls the cpu_chain notifiers with CPU_STARTING.
+ * It must be called by the arch code on the new cpu, before the new cpu
+ * enables interrupts and before the "boot" cpu returns from __cpu_up().
+ */
+void notify_cpu_starting(unsigned int cpu)
+{
+	unsigned long val = CPU_STARTING;
+
+#ifdef CONFIG_PM_SLEEP_SMP
+	if (cpu_isset(cpu, frozen_cpus))
+		val = CPU_STARTING_FROZEN;
+#endif /* CONFIG_PM_SLEEP_SMP */
+	raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
+}
+
 #endif /* CONFIG_SMP */
 
 /*
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5ab79cf516d..eab7bd6628e0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
  *  2003-10-22 Updates by Stephen Hemminger.
  *  2004 May-July Rework by Paul Jackson.
  *  2006 Rework by Paul Menage to use generic cgroups
+ *  2008 Rework of the scheduler domains and CPU hotplug handling
+ *       by Max Krasnyansky
  *
  *  This file is subject to the terms and conditions of the GNU General Public
  *  License.  See the file COPYING in the main directory of the Linux
@@ -236,9 +238,11 @@ static struct cpuset top_cpuset = {
 
 static DEFINE_MUTEX(callback_mutex);
 
-/* This is ugly, but preserves the userspace API for existing cpuset
+/*
+ * This is ugly, but preserves the userspace API for existing cpuset
  * users. If someone tries to mount the "cpuset" filesystem, we
- * silently switch it to mount "cgroup" instead */
+ * silently switch it to mount "cgroup" instead
+ */
 static int cpuset_get_sb(struct file_system_type *fs_type,
 			 int flags, const char *unused_dev_name,
 			 void *data, struct vfsmount *mnt)
@@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 }
 
 /*
- * Helper routine for rebuild_sched_domains().
+ * Helper routine for generate_sched_domains().
  * Do cpusets a, b have overlapping cpus_allowed masks?
  */
-
 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 {
 	return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 }
 
 /*
- * rebuild_sched_domains()
+ * generate_sched_domains()
  *
- * This routine will be called to rebuild the scheduler's dynamic
- * sched domains:
- * - if the flag 'sched_load_balance' of any cpuset with non-empty
- *   'cpus' changes,
- * - or if the 'cpus' allowed changes in any cpuset which has that
- *   flag enabled,
- * - or if the 'sched_relax_domain_level' of any cpuset which has
- *   that flag enabled and with non-empty 'cpus' changes,
- * - or if any cpuset with non-empty 'cpus' is removed,
- * - or if a cpu gets offlined.
- *
- * This routine builds a partial partition of the systems CPUs
- * (the set of non-overlappping cpumask_t's in the array 'part'
- * below), and passes that partial partition to the kernel/sched.c
- * partition_sched_domains() routine, which will rebuild the
- * schedulers load balancing domains (sched domains) as specified
- * by that partial partition.  A 'partial partition' is a set of
- * non-overlapping subsets whose union is a subset of that set.
+ * This function builds a partial partition of the systems CPUs
+ * A 'partial partition' is a set of non-overlapping subsets whose
+ * union is a subset of that set.
+ * The output of this function needs to be passed to kernel/sched.c
+ * partition_sched_domains() routine, which will rebuild the scheduler's
+ * load balancing domains (sched domains) as specified by that partial
+ * partition.
  *
  * See "What is sched_load_balance" in Documentation/cpusets.txt
  * for a background explanation of this.
@@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
  * domains when operating in the severe memory shortage situations
  * that could cause allocation failures below.
  *
- * Call with cgroup_mutex held.  May take callback_mutex during
- * call due to the kfifo_alloc() and kmalloc() calls.  May nest
- * a call to the get_online_cpus()/put_online_cpus() pair.
- * Must not be called holding callback_mutex, because we must not
- * call get_online_cpus() while holding callback_mutex.  Elsewhere
- * the kernel nests callback_mutex inside get_online_cpus() calls.
- * So the reverse nesting would risk an ABBA deadlock.
+ * Must be called with cgroup_lock held.
  *
  * The three key local variables below are:
  *  q  - a linked-list queue of cpuset pointers, used to implement a
@@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
  * element of the partition (one sched domain) to be passed to
  * partition_sched_domains().
  */
-
-void rebuild_sched_domains(void)
+static int generate_sched_domains(cpumask_t **domains,
+			struct sched_domain_attr **attributes)
 {
-	LIST_HEAD(q);		/* queue of cpusets to be scanned*/
+	LIST_HEAD(q);		/* queue of cpusets to be scanned */
 	struct cpuset *cp;	/* scans q */
 	struct cpuset **csa;	/* array of all cpuset ptrs */
 	int csn;		/* how many cpuset ptrs in csa so far */
@@ -601,23 +587,26 @@ void rebuild_sched_domains(void)
 	int ndoms;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] cpumask_t slot */
 
-	csa = NULL;
+	ndoms = 0;
 	doms = NULL;
 	dattr = NULL;
+	csa = NULL;
 
 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
-		ndoms = 1;
 		doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
 		if (!doms)
-			goto rebuild;
+			goto done;
+
 		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
 		if (dattr) {
 			*dattr = SD_ATTR_INIT;
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
 		*doms = top_cpuset.cpus_allowed;
-		goto rebuild;
+
+		ndoms = 1;
+		goto done;
 	}
 
 	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
@@ -680,61 +669,141 @@ restart:
 		}
 	}
 
-	/* Convert <csn, csa> to <ndoms, doms> */
+	/*
+	 * Now we know how many domains to create.
+	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
+	 */
 	doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
-	if (!doms)
-		goto rebuild;
+	if (!doms) {
+		ndoms = 0;
+		goto done;
+	}
+
+	/*
+	 * The rest of the code, including the scheduler, can deal with
+	 * dattr==NULL case. No need to abort if alloc fails.
+	 */
 	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
 
 	for (nslot = 0, i = 0; i < csn; i++) {
 		struct cpuset *a = csa[i];
+		cpumask_t *dp;
 		int apn = a->pn;
 
-		if (apn >= 0) {
-			cpumask_t *dp = doms + nslot;
-
-			if (nslot == ndoms) {
-				static int warnings = 10;
-				if (warnings) {
-					printk(KERN_WARNING
-					 "rebuild_sched_domains confused:"
-					 " nslot %d, ndoms %d, csn %d, i %d,"
-					 " apn %d\n",
-					 nslot, ndoms, csn, i, apn);
-					warnings--;
-				}
-				continue;
+		if (apn < 0) {
+			/* Skip completed partitions */
+			continue;
+		}
+
+		dp = doms + nslot;
+
+		if (nslot == ndoms) {
+			static int warnings = 10;
+			if (warnings) {
+				printk(KERN_WARNING
+				 "rebuild_sched_domains confused:"
+				 " nslot %d, ndoms %d, csn %d, i %d,"
+				 " apn %d\n",
+				 nslot, ndoms, csn, i, apn);
+				warnings--;
 			}
+			continue;
+		}
 
 		cpus_clear(*dp);
 		if (dattr)
 			*(dattr + nslot) = SD_ATTR_INIT;
 		for (j = i; j < csn; j++) {
 			struct cpuset *b = csa[j];
 
 			if (apn == b->pn) {
 				cpus_or(*dp, *dp, b->cpus_allowed);
-				b->pn = -1;
-				if (dattr)
-					update_domain_attr_tree(dattr
-							+ nslot, b);
-			}
+				if (dattr)
+					update_domain_attr_tree(dattr + nslot, b);
+
+				/* Done with this partition */
+				b->pn = -1;
 			}
-			nslot++;
 		}
+		nslot++;
 	}
 	BUG_ON(nslot != ndoms);
 
-rebuild:
-	/* Have scheduler rebuild sched domains */
+done:
+	kfree(csa);
+
+	*domains    = doms;
+	*attributes = dattr;
+	return ndoms;
+}
+
+/*
+ * Rebuild scheduler domains.
+ *
+ * Call with neither cgroup_mutex held nor within get_online_cpus().
+ * Takes both cgroup_mutex and get_online_cpus().
+ *
+ * Cannot be directly called from cpuset code handling changes
+ * to the cpuset pseudo-filesystem, because it cannot be called
+ * from code that already holds cgroup_mutex.
+ */
+static void do_rebuild_sched_domains(struct work_struct *unused)
+{
+	struct sched_domain_attr *attr;
+	cpumask_t *doms;
+	int ndoms;
+
 	get_online_cpus();
-	partition_sched_domains(ndoms, doms, dattr);
+
+	/* Generate domain masks and attrs */
+	cgroup_lock();
+	ndoms = generate_sched_domains(&doms, &attr);
+	cgroup_unlock();
+
+	/* Have scheduler rebuild the domains */
+	partition_sched_domains(ndoms, doms, attr);
+
 	put_online_cpus();
+}
 
-done:
-	kfree(csa);
-	/* Don't kfree(doms) -- partition_sched_domains() does that. */
-	/* Don't kfree(dattr) -- partition_sched_domains() does that. */
+static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
+
+/*
+ * Rebuild scheduler domains, asynchronously via workqueue.
+ *
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
+ *
+ * The rebuild_sched_domains() and partition_sched_domains()
+ * routines must nest cgroup_lock() inside get_online_cpus(),
+ * but such cpuset changes as these must nest that locking the
+ * other way, holding cgroup_lock() for much of the code.
+ *
+ * So in order to avoid an ABBA deadlock, the cpuset code handling
+ * these user changes delegates the actual sched domain rebuilding
+ * to a separate workqueue thread, which ends up processing the
+ * above do_rebuild_sched_domains() function.
+ */
+static void async_rebuild_sched_domains(void)
+{
+	schedule_work(&rebuild_sched_domains_work);
+}
+
+/*
+ * Accomplishes the same scheduler domain rebuild as the above
+ * async_rebuild_sched_domains(), however it directly calls the
+ * rebuild routine synchronously rather than calling it via an
+ * asynchronous work thread.
+ *
+ * This can only be called from code that is not holding
+ * cgroup_mutex (not nested in a cgroup_lock() call.)
+ */
+void rebuild_sched_domains(void)
+{
+	do_rebuild_sched_domains(NULL);
 }
 
 /**
@@ -774,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
 /**
  * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
  *
  * Called with cgroup_mutex held
  *
  * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
  * calling callback functions for each.
  *
- * Return 0 if successful, -errno if not.
+ * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
+ * if @heap != NULL.
  */
-static int update_tasks_cpumask(struct cpuset *cs)
+static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
 {
 	struct cgroup_scanner scan;
-	struct ptr_heap heap;
-	int retval;
-
-	/*
-	 * cgroup_scan_tasks() will initialize heap->gt for us.
-	 * heap_init() is still needed here for we should not change
-	 * cs->cpus_allowed when heap_init() fails.
-	 */
-	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
-	if (retval)
-		return retval;
 
 	scan.cg = cs->css.cgroup;
 	scan.test_task = cpuset_test_cpumask;
 	scan.process_task = cpuset_change_cpumask;
-	scan.heap = &heap;
-	retval = cgroup_scan_tasks(&scan);
-
-	heap_free(&heap);
-	return retval;
+	scan.heap = heap;
+	cgroup_scan_tasks(&scan);
 }
 
 /**
@@ -814,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs)
  */
 static int update_cpumask(struct cpuset *cs, const char *buf)
 {
+	struct ptr_heap heap;
 	struct cpuset trialcs;
 	int retval;
 	int is_load_balanced;
@@ -848,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
 	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
 		return 0;
 
+	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+	if (retval)
+		return retval;
+
 	is_load_balanced = is_sched_load_balance(&trialcs);
 
 	mutex_lock(&callback_mutex);
@@ -858,12 +920,12 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
 	 * Scan tasks in the cpuset, and update the cpumasks of any
 	 * that need an update.
 	 */
-	retval = update_tasks_cpumask(cs);
-	if (retval < 0)
-		return retval;
+	update_tasks_cpumask(cs, &heap);
+
+	heap_free(&heap);
 
 	if (is_load_balanced)
-		rebuild_sched_domains();
+		async_rebuild_sched_domains();
 	return 0;
 }
 
@@ -1090,7 +1152,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 	if (val != cs->relax_domain_level) {
 		cs->relax_domain_level = val;
 		if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
-			rebuild_sched_domains();
+			async_rebuild_sched_domains();
 	}
 
 	return 0;
@@ -1131,7 +1193,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	mutex_unlock(&callback_mutex);
 
 	if (cpus_nonempty && balance_flag_changed)
-		rebuild_sched_domains();
+		async_rebuild_sched_domains();
 
 	return 0;
 }
@@ -1492,6 +1554,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
 	default:
 		BUG();
 	}
+
+	/* Unreachable but makes gcc happy */
+	return 0;
 }
 
 static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1504,6 +1569,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
 	default:
 		BUG();
 	}
+
+	/* Unrechable but makes gcc happy */
+	return 0;
 }
 
 
@@ -1692,15 +1760,9 @@ static struct cgroup_subsys_state *cpuset_create(
 }
 
 /*
- * Locking note on the strange update_flag() call below:
- *
  * If the cpuset being removed has its flag 'sched_load_balance'
  * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains().  The get_online_cpus()
- * call in rebuild_sched_domains() must not be made while holding
- * callback_mutex.  Elsewhere the kernel nests callback_mutex inside
- * get_online_cpus() calls.  So the reverse nesting would risk an
- * ABBA deadlock.
+ * will call async_rebuild_sched_domains().
  */
 
 static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1719,7 +1781,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 struct cgroup_subsys cpuset_subsys = {
 	.name = "cpuset",
 	.create = cpuset_create,
-	.destroy  = cpuset_destroy,
+	.destroy = cpuset_destroy,
 	.can_attach = cpuset_can_attach,
 	.attach = cpuset_attach,
 	.populate = cpuset_populate,
@@ -1811,7 +1873,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
 }
 
 /*
- * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
  * or memory nodes, we need to walk over the cpuset hierarchy,
  * removing that CPU or node from all cpusets.  If this removes the
  * last CPU or node from a cpuset, then move the tasks in the empty
@@ -1859,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  * that has tasks along with an empty 'mems'.  But if we did see such
  * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
  */
-static void scan_for_empty_cpusets(const struct cpuset *root)
+static void scan_for_empty_cpusets(struct cpuset *root)
 {
 	LIST_HEAD(queue);
 	struct cpuset *cp;	/* scans cpusets being updated */
@@ -1896,42 +1958,13 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
 		    nodes_empty(cp->mems_allowed))
 			remove_tasks_in_empty_cpuset(cp);
 		else {
-			update_tasks_cpumask(cp);
+			update_tasks_cpumask(cp, NULL);
 			update_tasks_nodemask(cp, &oldmems);
 		}
 	}
 }
 
 /*
- * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
- * cpu_online_map and node_states[N_HIGH_MEMORY].  Force the top cpuset to
- * track what's online after any CPU or memory node hotplug or unplug event.
- *
- * Since there are two callers of this routine, one for CPU hotplug
- * events and one for memory node hotplug events, we could have coded
- * two separate routines here.  We code it as a single common routine
- * in order to minimize text size.
- */
-
-static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
-{
-	cgroup_lock();
-
-	top_cpuset.cpus_allowed = cpu_online_map;
-	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-	scan_for_empty_cpusets(&top_cpuset);
-
-	/*
-	 * Scheduler destroys domains on hotplug events.
-	 * Rebuild them based on the current settings.
-	 */
-	if (rebuild_sd)
-		rebuild_sched_domains();
-
-	cgroup_unlock();
-}
-
-/*
  * The top_cpuset tracks what CPUs and Memory Nodes are online,
  * period.  This is necessary in order to make cpusets transparent
  * (of no affect) on systems that are actively using CPU hotplug
@@ -1939,40 +1972,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
  *
  * This routine ensures that top_cpuset.cpus_allowed tracks
  * cpu_online_map on each CPU hotplug (cpuhp) event.
+ *
+ * Called within get_online_cpus().  Needs to call cgroup_lock()
+ * before calling generate_sched_domains().
  */
-
-static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
+static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 				unsigned long phase, void *unused_cpu)
 {
+	struct sched_domain_attr *attr;
+	cpumask_t *doms;
+	int ndoms;
+
 	switch (phase) {
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-		common_cpu_mem_hotplug_unplug(1);
 		break;
+
 	default:
 		return NOTIFY_DONE;
 	}
 
+	cgroup_lock();
+	top_cpuset.cpus_allowed = cpu_online_map;
+	scan_for_empty_cpusets(&top_cpuset);
+	ndoms = generate_sched_domains(&doms, &attr);
+	cgroup_unlock();
+
+	/* Have scheduler rebuild the domains */
+	partition_sched_domains(ndoms, doms, attr);
+
 	return NOTIFY_OK;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
  * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
- * Call this routine anytime after you change
- * node_states[N_HIGH_MEMORY].
- * See also the previous routine cpuset_handle_cpuhp().
+ * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
+ * See also the previous routine cpuset_track_online_cpus().
  */
-
 void cpuset_track_online_nodes(void)
 {
-	common_cpu_mem_hotplug_unplug(0);
+	cgroup_lock();
+	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+	scan_for_empty_cpusets(&top_cpuset);
+	cgroup_unlock();
 }
 #endif
 
@@ -1987,7 +2032,7 @@ void __init cpuset_init_smp(void)
 	top_cpuset.cpus_allowed = cpu_online_map;
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 
-	hotcpu_notifier(cpuset_handle_cpuhp, 0);
+	hotcpu_notifier(cpuset_track_online_cpus, 0);
 }
 
 /**
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index c1d4d5b4c61c..f013a0c2e111 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -124,6 +124,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
 	}
 	return (mem != NULL);
 }
+EXPORT_SYMBOL(dma_alloc_from_coherent);
 
 /**
  * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
@@ -151,3 +152,4 @@ int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(dma_release_from_coherent);
diff --git a/kernel/dma.c b/kernel/dma.c
index d2c60a822790..f903189c5304 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -1,4 +1,4 @@
-/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $
+/*
  * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
  *
  * Written by Hennus Bergman, 1992.
diff --git a/kernel/exit.c b/kernel/exit.c
index 16395644a98f..0ef4673e351b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -583,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
 	 * If there are other users of the mm and the owner (us) is exiting
 	 * we need to find a new owner to take on the responsibility.
 	 */
-	if (!mm)
-		return 0;
 	if (atomic_read(&mm->mm_users) <= 1)
 		return 0;
 	if (mm->owner != p)
@@ -627,29 +625,38 @@ retry:
 	} while_each_thread(g, c);
 
 	read_unlock(&tasklist_lock);
+	/*
+	 * We found no owner yet mm_users > 1: this implies that we are
+	 * most likely racing with swapoff (try_to_unuse()) or /proc or
+	 * ptrace or page migration (get_task_mm()). Mark owner as NULL,
+	 * so that subsystems can understand the callback and take action.
+	 */
+	down_write(&mm->mmap_sem);
+	cgroup_mm_owner_callbacks(mm->owner, NULL);
+	mm->owner = NULL;
+	up_write(&mm->mmap_sem);
 	return;
 
 assign_new_owner:
 	BUG_ON(c == p);
 	get_task_struct(c);
+	read_unlock(&tasklist_lock);
+	down_write(&mm->mmap_sem);
 	/*
 	 * The task_lock protects c->mm from changing.
 	 * We always want mm->owner->mm == mm
 	 */
 	task_lock(c);
-	/*
-	 * Delay read_unlock() till we have the task_lock()
-	 * to ensure that c does not slip away underneath us
-	 */
-	read_unlock(&tasklist_lock);
 	if (c->mm != mm) {
 		task_unlock(c);
+		up_write(&mm->mmap_sem);
 		put_task_struct(c);
 		goto retry;
 	}
 	cgroup_mm_owner_callbacks(mm->owner, c);
 	mm->owner = c;
 	task_unlock(c);
+	up_write(&mm->mmap_sem);
 	put_task_struct(c);
 }
 #endif /* CONFIG_MM_OWNER */
diff --git a/kernel/fork.c b/kernel/fork.c
index 4308d75f0fa5..37b3e150ae39 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -802,6 +802,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 
 	sig->leader = 0;	/* session leadership doesn't inherit */
 	sig->tty_old_pgrp = NULL;
+	sig->tty = NULL;
 
 	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 	sig->gtime = cputime_zero;
@@ -838,6 +839,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 void __cleanup_signal(struct signal_struct *sig)
 {
 	exit_thread_group_keys(sig);
+	tty_kref_put(sig->tty);
 	kmem_cache_free(signal_cachep, sig);
 }
 
@@ -1229,7 +1231,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 			p->nsproxy->pid_ns->child_reaper = p;
 
 		p->signal->leader_pid = pid;
-		p->signal->tty = current->signal->tty;
+		tty_kref_put(p->signal->tty);
+		p->signal->tty = tty_kref_get(current->signal->tty);
 		set_task_pgrp(p, task_pgrp_nr(current));
 		set_task_session(p, task_session_nr(current));
 		attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2bd230be1cb5..51ee90bca2de 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 		 */
 		BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
 		return 1;
-	case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
+	case HRTIMER_CB_IRQSAFE_PERCPU:
+	case HRTIMER_CB_IRQSAFE_UNLOCKED:
 		/*
 		 * This is solely for the sched tick emulation with
 		 * dynamic tick support to ensure that we do not
 		 * restart the tick right on the edge and end up with
 		 * the tick timer in the softirq ! The calling site
-		 * takes care of this.
+		 * takes care of this. Also used for hrtimer sleeper !
 		 */
 		debug_hrtimer_deactivate(timer);
 		return 1;
@@ -1266,7 +1267,8 @@ static void __run_hrtimer(struct hrtimer *timer)
 	timer_stats_account_hrtimer(timer);
 
 	fn = timer->function;
-	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+	    timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
 		/*
 		 * Used for scheduler timers, avoid lock inversion with
 		 * rq->lock and tasklist_lock.
@@ -1517,7 +1519,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
 	sl->timer.function = hrtimer_wakeup;
 	sl->task = task;
 #ifdef CONFIG_HIGH_RES_TIMERS
-	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 #endif
 }
 
@@ -1661,29 +1663,95 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
-static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
-				struct hrtimer_clock_base *new_base)
+static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
+				struct hrtimer_clock_base *new_base, int dcpu)
 {
 	struct hrtimer *timer;
 	struct rb_node *node;
+	int raise = 0;
 
 	while ((node = rb_first(&old_base->active))) {
 		timer = rb_entry(node, struct hrtimer, node);
 		BUG_ON(hrtimer_callback_running(timer));
 		debug_hrtimer_deactivate(timer);
-		__remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
+
+		/*
+		 * Should not happen. Per CPU timers should be
+		 * canceled _before_ the migration code is called
+		 */
+		if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
+			__remove_hrtimer(timer, old_base,
+					 HRTIMER_STATE_INACTIVE, 0);
+			WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
+			     timer, timer->function, dcpu);
+			continue;
+		}
+
+		/*
+		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+		 * timer could be seen as !active and just vanish away
+		 * under us on another CPU
+		 */
+		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
 		timer->base = new_base;
 		/*
 		 * Enqueue the timer. Allow reprogramming of the event device
 		 */
 		enqueue_hrtimer(timer, new_base, 1);
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+		/*
+		 * Happens with high res enabled when the timer was
+		 * already expired and the callback mode is
+		 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
+		 * enqueue code does not move them to the soft irq
+		 * pending list for performance/latency reasons, but
+		 * in the migration state, we need to do that
+		 * otherwise we end up with a stale timer.
+		 */
+		if (timer->state == HRTIMER_STATE_MIGRATE) {
+			timer->state = HRTIMER_STATE_PENDING;
+			list_add_tail(&timer->cb_entry,
+				      &new_base->cpu_base->cb_pending);
+			raise = 1;
+		}
+#endif
+		/* Clear the migration state bit */
+		timer->state &= ~HRTIMER_STATE_MIGRATE;
+	}
+	return raise;
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+				   struct hrtimer_cpu_base *new_base)
+{
+	struct hrtimer *timer;
+	int raise = 0;
+
+	while (!list_empty(&old_base->cb_pending)) {
+		timer = list_entry(old_base->cb_pending.next,
+				   struct hrtimer, cb_entry);
+
+		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
+		timer->base = &new_base->clock_base[timer->base->index];
+		list_add_tail(&timer->cb_entry, &new_base->cb_pending);
+		raise = 1;
 	}
+	return raise;
+}
+#else
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+				   struct hrtimer_cpu_base *new_base)
+{
+	return 0;
 }
+#endif
 
 static void migrate_hrtimers(int cpu)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
-	int i;
+	int i, raise = 0;
 
 	BUG_ON(cpu_online(cpu));
 	old_base = &per_cpu(hrtimer_bases, cpu);
@@ -1696,14 +1764,21 @@ static void migrate_hrtimers(int cpu)
 	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-		migrate_hrtimer_list(&old_base->clock_base[i],
-				     &new_base->clock_base[i]);
+		if (migrate_hrtimer_list(&old_base->clock_base[i],
+					 &new_base->clock_base[i], cpu))
+			raise = 1;
 	}
 
+	if (migrate_hrtimer_pending(old_base, new_base))
+		raise = 1;
+
 	spin_unlock(&old_base->lock);
 	spin_unlock(&new_base->lock);
 	local_irq_enable();
 	put_cpu_var(hrtimer_bases);
+
+	if (raise)
+		hrtimer_raise_softirq();
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0314074fa232..60c49e324390 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -89,7 +89,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
 	set_balance_irq_affinity(irq, cpumask);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	set_pending_irq(irq, cpumask);
+	if (desc->status & IRQ_MOVE_PCNTXT) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&desc->lock, flags);
+		desc->chip->set_affinity(irq, cpumask);
+		spin_unlock_irqrestore(&desc->lock, flags);
+	} else
+		set_pending_irq(irq, cpumask);
 #else
 	desc->affinity = cpumask;
 	desc->chip->set_affinity(irq, cpumask);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 38fc10ac7541..5072cf1685a2 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -260,7 +260,6 @@ const char *kallsyms_lookup(unsigned long addr,
 	/* see if it's in a module */
 	return module_address_lookup(addr, symbolsize, offset, modname,
 				     namebuf);
-	return NULL;
 }
 
 int lookup_symbol_name(unsigned long addr, char *symname)
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 59f3f0df35d4..aef265325cd3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -753,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image,
 		*old = addr | (*old & ~PAGE_MASK);
 
 		/* The old page I have found cannot be a
-		 * destination page, so return it.
+		 * destination page, so return it if it's
+		 * gfp_flags honor the ones passed in.
 		 */
+		if (!(gfp_mask & __GFP_HIGHMEM) &&
+				PageHighMem(old_page)) {
+			kimage_free_pages(old_page);
+			continue;
+		}
 		addr = old_addr;
 		page = old_page;
 		break;
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index eaa21fc9ad1d..e4dcfb2272a4 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -488,7 +488,7 @@ static int write_mem_msg(int binary)
 		if (err)
 			return err;
 		if (CACHE_FLUSH_IS_SAFE)
-			flush_icache_range(addr, addr + length + 1);
+			flush_icache_range(addr, addr + length);
 		return 0;
 	}
 
@@ -590,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs)
 
 	/* Signal the primary CPU that we are done: */
 	atomic_set(&cpu_in_kgdb[cpu], 0);
+	touch_softlockup_watchdog();
 	clocksource_touch_watchdog();
 	local_irq_restore(flags);
 }
@@ -1432,6 +1433,7 @@ acquirelock:
 	    atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
 
 		atomic_set(&kgdb_active, -1);
+		touch_softlockup_watchdog();
 		clocksource_touch_watchdog();
 		local_irq_restore(flags);
 
@@ -1462,7 +1464,7 @@ acquirelock:
 	 * Get the passive CPU lock which will hold all the non-primary
 	 * CPU in a spin state while the debugger is active
 	 */
-	if (!kgdb_single_step || !kgdb_contthread) {
+	if (!kgdb_single_step) {
 		for (i = 0; i < NR_CPUS; i++)
 			atomic_set(&passive_cpu_wait[i], 1);
 	}
@@ -1475,7 +1477,7 @@ acquirelock:
 
 #ifdef CONFIG_SMP
 	/* Signal the other CPUs to enter kgdb_wait() */
-	if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup)
+	if ((!kgdb_single_step) && kgdb_do_roundup)
 		kgdb_roundup_cpus(flags);
 #endif
 
@@ -1494,7 +1496,7 @@ acquirelock:
 	kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
 	kgdb_deactivate_sw_breakpoints();
 	kgdb_single_step = 0;
-	kgdb_contthread = NULL;
+	kgdb_contthread = current;
 	exception_level = 0;
 
 	/* Talk to debugger with gdbserial protocol */
@@ -1508,7 +1510,7 @@ acquirelock:
 	kgdb_info[ks->cpu].task = NULL;
 	atomic_set(&cpu_in_kgdb[ks->cpu], 0);
 
-	if (!kgdb_single_step || !kgdb_contthread) {
+	if (!kgdb_single_step) {
 		for (i = NR_CPUS-1; i >= 0; i--)
 			atomic_set(&passive_cpu_wait[i], 0);
 		/*
@@ -1524,6 +1526,7 @@ acquirelock:
 kgdb_restore:
 	/* Free kgdb_active */
 	atomic_set(&kgdb_active, -1);
+	touch_softlockup_watchdog();
 	clocksource_touch_watchdog();
 	local_irq_restore(flags);
 
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2456d1a0befb..3d3c3ea3a023 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -113,7 +113,7 @@ int request_module(const char *fmt, ...)
 	return ret;
 }
 EXPORT_SYMBOL(request_module);
-#endif /* CONFIG_KMOD */
+#endif /* CONFIG_MODULES */
 
 struct subprocess_info {
 	struct work_struct work;
@@ -265,7 +265,7 @@ static void __call_usermodehelper(struct work_struct *work)
 	}
 }
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
 /*
  * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
  * (used for preventing user land processes from being created after the user
@@ -288,39 +288,37 @@ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
  */
 #define RUNNING_HELPERS_TIMEOUT	(5 * HZ)
 
-static int usermodehelper_pm_callback(struct notifier_block *nfb,
-					unsigned long action,
-					void *ignored)
+/**
+ * usermodehelper_disable - prevent new helpers from being started
+ */
+int usermodehelper_disable(void)
 {
 	long retval;
 
-	switch (action) {
-	case PM_HIBERNATION_PREPARE:
-	case PM_SUSPEND_PREPARE:
-		usermodehelper_disabled = 1;
-		smp_mb();
-		/*
-		 * From now on call_usermodehelper_exec() won't start any new
-		 * helpers, so it is sufficient if running_helpers turns out to
-		 * be zero at one point (it may be increased later, but that
-		 * doesn't matter).
-		 */
-		retval = wait_event_timeout(running_helpers_waitq,
+	usermodehelper_disabled = 1;
+	smp_mb();
+	/*
+	 * From now on call_usermodehelper_exec() won't start any new
+	 * helpers, so it is sufficient if running_helpers turns out to
+	 * be zero at one point (it may be increased later, but that
+	 * doesn't matter).
+	 */
+	retval = wait_event_timeout(running_helpers_waitq,
 					atomic_read(&running_helpers) == 0,
 					RUNNING_HELPERS_TIMEOUT);
-		if (retval) {
-			return NOTIFY_OK;
-		} else {
-			usermodehelper_disabled = 0;
-			return NOTIFY_BAD;
-		}
-	case PM_POST_HIBERNATION:
-	case PM_POST_SUSPEND:
-		usermodehelper_disabled = 0;
-		return NOTIFY_OK;
-	}
+	if (retval)
+		return 0;
 
-	return NOTIFY_DONE;
+	usermodehelper_disabled = 0;
+	return -EAGAIN;
+}
+
+/**
+ * usermodehelper_enable - allow new helpers to be started again
+ */
+void usermodehelper_enable(void)
+{
+	usermodehelper_disabled = 0;
 }
 
 static void helper_lock(void)
@@ -334,18 +332,12 @@ static void helper_unlock(void)
334 if (atomic_dec_and_test(&running_helpers)) 332 if (atomic_dec_and_test(&running_helpers))
335 wake_up(&running_helpers_waitq); 333 wake_up(&running_helpers_waitq);
336} 334}
337 335#else /* CONFIG_PM_SLEEP */
338static void register_pm_notifier_callback(void)
339{
340 pm_notifier(usermodehelper_pm_callback, 0);
341}
342#else /* CONFIG_PM */
343#define usermodehelper_disabled 0 336#define usermodehelper_disabled 0
344 337
345static inline void helper_lock(void) {} 338static inline void helper_lock(void) {}
346static inline void helper_unlock(void) {} 339static inline void helper_unlock(void) {}
347static inline void register_pm_notifier_callback(void) {} 340#endif /* CONFIG_PM_SLEEP */
348#endif /* CONFIG_PM */
349 341
350/** 342/**
351 * call_usermodehelper_setup - prepare to call a usermode helper 343 * call_usermodehelper_setup - prepare to call a usermode helper
@@ -515,5 +507,4 @@ void __init usermodehelper_init(void)
515{ 507{
516 khelper_wq = create_singlethread_workqueue("khelper"); 508 khelper_wq = create_singlethread_workqueue("khelper");
517 BUG_ON(!khelper_wq); 509 BUG_ON(!khelper_wq);
518 register_pm_notifier_callback();
519} 510}
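The kmod.c change above replaces the PM-notifier callback with an explicit pair: usermodehelper_disable() sets the flag, waits up to RUNNING_HELPERS_TIMEOUT for in-flight helpers to drain and returns -EAGAIN if they do not, while usermodehelper_enable() simply clears the flag again. Below is a minimal user-space sketch of that shape, not the kernel implementation: a pthread condition variable stands in for the kernel wait queue, and helpers_disable()/helpers_enable() are made-up names mirroring the new API (build with -pthread).

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

static atomic_int helpers_disabled;   /* checked before spawning a helper (not modelled) */
static atomic_int running_helpers;    /* bumped/dropped around each helper (not modelled) */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  all_done = PTHREAD_COND_INITIALIZER;

/* Block new helpers, then wait (bounded) for the in-flight ones to finish. */
static int helpers_disable(int timeout_sec)
{
    struct timespec deadline;
    int err = 0;

    atomic_store(&helpers_disabled, 1);          /* refuse new helpers first */
    clock_gettime(CLOCK_REALTIME, &deadline);
    deadline.tv_sec += timeout_sec;

    pthread_mutex_lock(&lock);
    while (!err && atomic_load(&running_helpers) != 0)
        err = pthread_cond_timedwait(&all_done, &lock, &deadline);
    pthread_mutex_unlock(&lock);

    if (err) {                                   /* timed out: undo, like -EAGAIN */
        atomic_store(&helpers_disabled, 0);
        return -1;
    }
    return 0;
}

static void helpers_enable(void)
{
    atomic_store(&helpers_disabled, 0);
}

int main(void)
{
    printf("disable -> %d\n", helpers_disable(5)); /* 0: nothing is running */
    helpers_enable();
    return 0;
}

The power-management hunks further down call the real pair around process freezing in exactly this set-flag-then-drain order.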
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 75bc2cd9ebc6..8b57a2597f21 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -404,7 +404,7 @@ void kretprobe_hash_lock(struct task_struct *tsk,
404 spin_lock_irqsave(hlist_lock, *flags); 404 spin_lock_irqsave(hlist_lock, *flags);
405} 405}
406 406
407void kretprobe_table_lock(unsigned long hash, unsigned long *flags) 407static void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
408{ 408{
409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
410 spin_lock_irqsave(hlist_lock, *flags); 410 spin_lock_irqsave(hlist_lock, *flags);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e53bc30e9ba5..08dd8ed86c77 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
18 19
19#define KERNEL_ATTR_RO(_name) \ 20#define KERNEL_ATTR_RO(_name) \
@@ -53,6 +54,37 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
53KERNEL_ATTR_RW(uevent_helper); 54KERNEL_ATTR_RW(uevent_helper);
54#endif 55#endif
55 56
57#ifdef CONFIG_PROFILING
58static ssize_t profiling_show(struct kobject *kobj,
59 struct kobj_attribute *attr, char *buf)
60{
61 return sprintf(buf, "%d\n", prof_on);
62}
63static ssize_t profiling_store(struct kobject *kobj,
64 struct kobj_attribute *attr,
65 const char *buf, size_t count)
66{
67 int ret;
68
69 if (prof_on)
70 return -EEXIST;
71 /*
72 * This eventually calls into get_option() which
73 * has a ton of callers and is not const. It is
74 * easiest to cast it away here.
75 */
76 profile_setup((char *)buf);
77 ret = profile_init();
78 if (ret)
79 return ret;
80 ret = create_proc_profile();
81 if (ret)
82 return ret;
83 return count;
84}
85KERNEL_ATTR_RW(profiling);
86#endif
87
56#ifdef CONFIG_KEXEC 88#ifdef CONFIG_KEXEC
57static ssize_t kexec_loaded_show(struct kobject *kobj, 89static ssize_t kexec_loaded_show(struct kobject *kobj,
58 struct kobj_attribute *attr, char *buf) 90 struct kobj_attribute *attr, char *buf)
@@ -109,6 +141,9 @@ static struct attribute * kernel_attrs[] = {
109 &uevent_seqnum_attr.attr, 141 &uevent_seqnum_attr.attr,
110 &uevent_helper_attr.attr, 142 &uevent_helper_attr.attr,
111#endif 143#endif
144#ifdef CONFIG_PROFILING
145 &profiling_attr.attr,
146#endif
112#ifdef CONFIG_KEXEC 147#ifdef CONFIG_KEXEC
113 &kexec_loaded_attr.attr, 148 &kexec_loaded_attr.attr,
114 &kexec_crash_loaded_attr.attr, 149 &kexec_crash_loaded_attr.attr,
diff --git a/kernel/module.c b/kernel/module.c
index 9db11911e04b..b7205f67cfaf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -100,7 +100,7 @@ static inline int strong_try_module_get(struct module *mod)
100static inline void add_taint_module(struct module *mod, unsigned flag) 100static inline void add_taint_module(struct module *mod, unsigned flag)
101{ 101{
102 add_taint(flag); 102 add_taint(flag);
103 mod->taints |= flag; 103 mod->taints |= (1U << flag);
104} 104}
105 105
106/* 106/*
@@ -784,6 +784,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
784 mutex_lock(&module_mutex); 784 mutex_lock(&module_mutex);
785 /* Store the name of the last unloaded module for diagnostic purposes */ 785 /* Store the name of the last unloaded module for diagnostic purposes */
786 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 786 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
787 unregister_dynamic_debug_module(mod->name);
787 free_module(mod); 788 free_module(mod);
788 789
789 out: 790 out:
@@ -923,7 +924,7 @@ static const char vermagic[] = VERMAGIC_STRING;
923static int try_to_force_load(struct module *mod, const char *symname) 924static int try_to_force_load(struct module *mod, const char *symname)
924{ 925{
925#ifdef CONFIG_MODULE_FORCE_LOAD 926#ifdef CONFIG_MODULE_FORCE_LOAD
926 if (!(tainted & TAINT_FORCED_MODULE)) 927 if (!test_taint(TAINT_FORCED_MODULE))
927 printk("%s: no version for \"%s\" found: kernel tainted.\n", 928 printk("%s: no version for \"%s\" found: kernel tainted.\n",
928 mod->name, symname); 929 mod->name, symname);
929 add_taint_module(mod, TAINT_FORCED_MODULE); 930 add_taint_module(mod, TAINT_FORCED_MODULE);
@@ -1033,7 +1034,7 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
1033 const unsigned long *crc; 1034 const unsigned long *crc;
1034 1035
1035 ret = find_symbol(name, &owner, &crc, 1036 ret = find_symbol(name, &owner, &crc,
1036 !(mod->taints & TAINT_PROPRIETARY_MODULE), true); 1037 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
1037 if (!IS_ERR_VALUE(ret)) { 1038 if (!IS_ERR_VALUE(ret)) {
1038 /* use_module can fail due to OOM, 1039 /* use_module can fail due to OOM,
1039 or module initialization or unloading */ 1040 or module initialization or unloading */
@@ -1173,7 +1174,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
1173 while (i-- > 0) 1174 while (i-- > 0)
1174 sysfs_remove_bin_file(notes_attrs->dir, 1175 sysfs_remove_bin_file(notes_attrs->dir,
1175 &notes_attrs->attrs[i]); 1176 &notes_attrs->attrs[i]);
1176 kobject_del(notes_attrs->dir); 1177 kobject_put(notes_attrs->dir);
1177 } 1178 }
1178 kfree(notes_attrs); 1179 kfree(notes_attrs);
1179} 1180}
@@ -1634,7 +1635,7 @@ static void set_license(struct module *mod, const char *license)
1634 license = "unspecified"; 1635 license = "unspecified";
1635 1636
1636 if (!license_is_gpl_compatible(license)) { 1637 if (!license_is_gpl_compatible(license)) {
1637 if (!(tainted & TAINT_PROPRIETARY_MODULE)) 1638 if (!test_taint(TAINT_PROPRIETARY_MODULE))
1638 printk(KERN_WARNING "%s: module license '%s' taints " 1639 printk(KERN_WARNING "%s: module license '%s' taints "
1639 "kernel.\n", mod->name, license); 1640 "kernel.\n", mod->name, license);
1640 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1641 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
@@ -1783,6 +1784,33 @@ static inline void add_kallsyms(struct module *mod,
1783} 1784}
1784#endif /* CONFIG_KALLSYMS */ 1785#endif /* CONFIG_KALLSYMS */
1785 1786
1787#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
1788static void dynamic_printk_setup(Elf_Shdr *sechdrs, unsigned int verboseindex)
1789{
1790 struct mod_debug *debug_info;
1791 unsigned long pos, end;
1792 unsigned int num_verbose;
1793
1794 pos = sechdrs[verboseindex].sh_addr;
1795 num_verbose = sechdrs[verboseindex].sh_size /
1796 sizeof(struct mod_debug);
1797 end = pos + (num_verbose * sizeof(struct mod_debug));
1798
1799 for (; pos < end; pos += sizeof(struct mod_debug)) {
1800 debug_info = (struct mod_debug *)pos;
1801 register_dynamic_debug_module(debug_info->modname,
1802 debug_info->type, debug_info->logical_modname,
1803 debug_info->flag_names, debug_info->hash,
1804 debug_info->hash2);
1805 }
1806}
1807#else
1808static inline void dynamic_printk_setup(Elf_Shdr *sechdrs,
1809 unsigned int verboseindex)
1810{
1811}
1812#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
1813
1786static void *module_alloc_update_bounds(unsigned long size) 1814static void *module_alloc_update_bounds(unsigned long size)
1787{ 1815{
1788 void *ret = module_alloc(size); 1816 void *ret = module_alloc(size);
@@ -1831,6 +1859,7 @@ static noinline struct module *load_module(void __user *umod,
1831#endif 1859#endif
1832 unsigned int markersindex; 1860 unsigned int markersindex;
1833 unsigned int markersstringsindex; 1861 unsigned int markersstringsindex;
1862 unsigned int verboseindex;
1834 struct module *mod; 1863 struct module *mod;
1835 long err = 0; 1864 long err = 0;
1836 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1865 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -2117,6 +2146,7 @@ static noinline struct module *load_module(void __user *umod,
2117 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); 2146 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
2118 markersstringsindex = find_sec(hdr, sechdrs, secstrings, 2147 markersstringsindex = find_sec(hdr, sechdrs, secstrings,
2119 "__markers_strings"); 2148 "__markers_strings");
2149 verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose");
2120 2150
2121 /* Now do relocations. */ 2151 /* Now do relocations. */
2122 for (i = 1; i < hdr->e_shnum; i++) { 2152 for (i = 1; i < hdr->e_shnum; i++) {
@@ -2167,6 +2197,7 @@ static noinline struct module *load_module(void __user *umod,
2167 marker_update_probe_range(mod->markers, 2197 marker_update_probe_range(mod->markers,
2168 mod->markers + mod->num_markers); 2198 mod->markers + mod->num_markers);
2169#endif 2199#endif
2200 dynamic_printk_setup(sechdrs, verboseindex);
2170 err = module_finalize(hdr, sechdrs, mod); 2201 err = module_finalize(hdr, sechdrs, mod);
2171 if (err < 0) 2202 if (err < 0)
2172 goto cleanup; 2203 goto cleanup;
@@ -2552,9 +2583,9 @@ static char *module_flags(struct module *mod, char *buf)
2552 mod->state == MODULE_STATE_GOING || 2583 mod->state == MODULE_STATE_GOING ||
2553 mod->state == MODULE_STATE_COMING) { 2584 mod->state == MODULE_STATE_COMING) {
2554 buf[bx++] = '('; 2585 buf[bx++] = '(';
2555 if (mod->taints & TAINT_PROPRIETARY_MODULE) 2586 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
2556 buf[bx++] = 'P'; 2587 buf[bx++] = 'P';
2557 if (mod->taints & TAINT_FORCED_MODULE) 2588 if (mod->taints & (1 << TAINT_FORCED_MODULE))
2558 buf[bx++] = 'F'; 2589 buf[bx++] = 'F';
2559 /* 2590 /*
2560 * TAINT_FORCED_RMMOD: could be added. 2591 * TAINT_FORCED_RMMOD: could be added.
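A detail worth noting in the module.c and panic.c hunks: the TAINT_* constants are now treated as bit numbers rather than pre-shifted mask values, which is why add_taint_module(), resolve_symbol() and module_flags() gain explicit 1 << flag shifts while panic.c switches to set_bit()/test_bit(). A tiny stand-alone illustration of the difference; the numeric values below are assumptions for the example only.

#include <stdio.h>

#define TAINT_PROPRIETARY_MODULE 0   /* bit numbers, as in the new scheme */
#define TAINT_FORCED_MODULE      1

int main(void)
{
    unsigned int taints = 0;

    taints |= (1U << TAINT_FORCED_MODULE);   /* correct: sets bit 1 */
    /* taints |= TAINT_FORCED_MODULE; would silently set bit 0 instead */

    printf("P=%d F=%d\n",
           !!(taints & (1U << TAINT_PROPRIETARY_MODULE)),
           !!(taints & (1U << TAINT_FORCED_MODULE)));
    return 0;
}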
diff --git a/kernel/panic.c b/kernel/panic.c
index 12c5a0a6c89b..f290e8e866f6 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,7 +23,7 @@
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24 24
25int panic_on_oops; 25int panic_on_oops;
26int tainted; 26static unsigned long tainted_mask;
27static int pause_on_oops; 27static int pause_on_oops;
28static int pause_on_oops_flag; 28static int pause_on_oops_flag;
29static DEFINE_SPINLOCK(pause_on_oops_lock); 29static DEFINE_SPINLOCK(pause_on_oops_lock);
@@ -143,6 +143,26 @@ NORET_TYPE void panic(const char * fmt, ...)
143 143
144EXPORT_SYMBOL(panic); 144EXPORT_SYMBOL(panic);
145 145
146
147struct tnt {
148 u8 bit;
149 char true;
150 char false;
151};
152
153static const struct tnt tnts[] = {
154 { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
155 { TAINT_FORCED_MODULE, 'F', ' ' },
156 { TAINT_UNSAFE_SMP, 'S', ' ' },
157 { TAINT_FORCED_RMMOD, 'R', ' ' },
158 { TAINT_MACHINE_CHECK, 'M', ' ' },
159 { TAINT_BAD_PAGE, 'B', ' ' },
160 { TAINT_USER, 'U', ' ' },
161 { TAINT_DIE, 'D', ' ' },
162 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
163 { TAINT_WARN, 'W', ' ' },
164};
165
146/** 166/**
147 * print_tainted - return a string to represent the kernel taint state. 167 * print_tainted - return a string to represent the kernel taint state.
148 * 168 *
@@ -158,32 +178,41 @@ EXPORT_SYMBOL(panic);
158 * 178 *
159 * The string is overwritten by the next call to print_taint(). 179 * The string is overwritten by the next call to print_taint().
160 */ 180 */
161
162const char *print_tainted(void) 181const char *print_tainted(void)
163{ 182{
164 static char buf[20]; 183 static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1];
165 if (tainted) { 184
166 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", 185 if (tainted_mask) {
167 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 186 char *s;
168 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 187 int i;
169 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 188
170 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', 189 s = buf + sprintf(buf, "Tainted: ");
171 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', 190 for (i = 0; i < ARRAY_SIZE(tnts); i++) {
172 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 191 const struct tnt *t = &tnts[i];
173 tainted & TAINT_USER ? 'U' : ' ', 192 *s++ = test_bit(t->bit, &tainted_mask) ?
174 tainted & TAINT_DIE ? 'D' : ' ', 193 t->true : t->false;
175 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', 194 }
176 tainted & TAINT_WARN ? 'W' : ' '); 195 *s = 0;
177 } 196 } else
178 else
179 snprintf(buf, sizeof(buf), "Not tainted"); 197 snprintf(buf, sizeof(buf), "Not tainted");
180 return(buf); 198 return(buf);
181} 199}
182 200
201int test_taint(unsigned flag)
202{
203 return test_bit(flag, &tainted_mask);
204}
205EXPORT_SYMBOL(test_taint);
206
207unsigned long get_taint(void)
208{
209 return tainted_mask;
210}
211
183void add_taint(unsigned flag) 212void add_taint(unsigned flag)
184{ 213{
185 debug_locks = 0; /* can't trust the integrity of the kernel anymore */ 214 debug_locks = 0; /* can't trust the integrity of the kernel anymore */
186 tainted |= flag; 215 set_bit(flag, &tainted_mask);
187} 216}
188EXPORT_SYMBOL(add_taint); 217EXPORT_SYMBOL(add_taint);
189 218
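The rewritten print_tainted() is driven entirely by the tnts[] table, so adding a taint flag only requires a new table entry and a one-byte-larger buffer. A compilable user-space rendition of the same table-driven idea; the bit numbers are made up and the struct members are renamed t/f here.

#include <stdio.h>
#include <string.h>

struct tnt { unsigned bit; char t; char f; };

static const struct tnt tnts[] = {
    { 0, 'P', 'G' }, { 1, 'F', ' ' }, { 2, 'S', ' ' },
};

static const char *taint_string(unsigned long mask)
{
    static char buf[sizeof("Tainted: ") + sizeof(tnts) / sizeof(tnts[0]) + 1];
    char *s;
    size_t i;

    if (!mask)
        return "Not tainted";
    s = buf + sprintf(buf, "Tainted: ");
    for (i = 0; i < sizeof(tnts) / sizeof(tnts[0]); i++)
        *s++ = (mask & (1UL << tnts[i].bit)) ? tnts[i].t : tnts[i].f;
    *s = '\0';
    return buf;
}

int main(void)
{
    printf("%s\n", taint_string(1UL << 1));  /* prints "Tainted: GF " */
    printf("%s\n", taint_string(0));         /* prints "Not tainted" */
    return 0;
}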
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index f85efcdcab2d..ee204586149a 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -441,7 +441,7 @@ static struct k_itimer * alloc_posix_timer(void)
441 return tmr; 441 return tmr;
442 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { 442 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
443 kmem_cache_free(posix_timers_cache, tmr); 443 kmem_cache_free(posix_timers_cache, tmr);
444 tmr = NULL; 444 return NULL;
445 } 445 }
446 memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); 446 memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
447 return tmr; 447 return tmr;
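The posix-timers hunk is a one-line use-after-free fix: on sigqueue_alloc() failure the old code freed tmr but then fell through to the memset on tmr->sigq->info; returning NULL immediately avoids touching freed memory. The shape of the corrected allocation path, sketched with made-up user-space types:

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct timer { void *sigq; char info[16]; };   /* made-up stand-ins */

static struct timer *alloc_timer(void)
{
    struct timer *t = calloc(1, sizeof(*t));

    if (!t)
        return NULL;
    t->sigq = malloc(32);              /* stands in for sigqueue_alloc() */
    if (!t->sigq) {
        free(t);
        return NULL;                   /* old code fell through and reused 't' here */
    }
    memset(t->info, 0, sizeof(t->info));
    return t;
}

int main(void)
{
    struct timer *t = alloc_timer();

    printf("timer %sallocated\n", t ? "" : "not ");
    if (t) {
        free(t->sigq);
        free(t);
    }
    return 0;
}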
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index bbd85c60f741..331f9836383f 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -14,6 +14,7 @@
14#include <linux/reboot.h> 14#include <linux/reboot.h>
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/device.h> 16#include <linux/device.h>
17#include <linux/kmod.h>
17#include <linux/delay.h> 18#include <linux/delay.h>
18#include <linux/fs.h> 19#include <linux/fs.h>
19#include <linux/mount.h> 20#include <linux/mount.h>
@@ -520,6 +521,10 @@ int hibernate(void)
520 if (error) 521 if (error)
521 goto Exit; 522 goto Exit;
522 523
524 error = usermodehelper_disable();
525 if (error)
526 goto Exit;
527
523 /* Allocate memory management structures */ 528 /* Allocate memory management structures */
524 error = create_basic_memory_bitmaps(); 529 error = create_basic_memory_bitmaps();
525 if (error) 530 if (error)
@@ -558,6 +563,7 @@ int hibernate(void)
558 thaw_processes(); 563 thaw_processes();
559 Finish: 564 Finish:
560 free_basic_memory_bitmaps(); 565 free_basic_memory_bitmaps();
566 usermodehelper_enable();
561 Exit: 567 Exit:
562 pm_notifier_call_chain(PM_POST_HIBERNATION); 568 pm_notifier_call_chain(PM_POST_HIBERNATION);
563 pm_restore_console(); 569 pm_restore_console();
@@ -634,6 +640,10 @@ static int software_resume(void)
634 if (error) 640 if (error)
635 goto Finish; 641 goto Finish;
636 642
643 error = usermodehelper_disable();
644 if (error)
645 goto Finish;
646
637 error = create_basic_memory_bitmaps(); 647 error = create_basic_memory_bitmaps();
638 if (error) 648 if (error)
639 goto Finish; 649 goto Finish;
@@ -656,6 +666,7 @@ static int software_resume(void)
656 thaw_processes(); 666 thaw_processes();
657 Done: 667 Done:
658 free_basic_memory_bitmaps(); 668 free_basic_memory_bitmaps();
669 usermodehelper_enable();
659 Finish: 670 Finish:
660 pm_notifier_call_chain(PM_POST_RESTORE); 671 pm_notifier_call_chain(PM_POST_RESTORE);
661 pm_restore_console(); 672 pm_restore_console();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 540b16b68565..19122cf6d827 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -14,6 +14,7 @@
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/delay.h> 15#include <linux/delay.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/kmod.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/console.h> 19#include <linux/console.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
@@ -237,6 +238,10 @@ static int suspend_prepare(void)
237 if (error) 238 if (error)
238 goto Finish; 239 goto Finish;
239 240
241 error = usermodehelper_disable();
242 if (error)
243 goto Finish;
244
240 if (suspend_freeze_processes()) { 245 if (suspend_freeze_processes()) {
241 error = -EAGAIN; 246 error = -EAGAIN;
242 goto Thaw; 247 goto Thaw;
@@ -256,6 +261,7 @@ static int suspend_prepare(void)
256 261
257 Thaw: 262 Thaw:
258 suspend_thaw_processes(); 263 suspend_thaw_processes();
264 usermodehelper_enable();
259 Finish: 265 Finish:
260 pm_notifier_call_chain(PM_POST_SUSPEND); 266 pm_notifier_call_chain(PM_POST_SUSPEND);
261 pm_restore_console(); 267 pm_restore_console();
@@ -376,6 +382,7 @@ int suspend_devices_and_enter(suspend_state_t state)
376static void suspend_finish(void) 382static void suspend_finish(void)
377{ 383{
378 suspend_thaw_processes(); 384 suspend_thaw_processes();
385 usermodehelper_enable();
379 pm_notifier_call_chain(PM_POST_SUSPEND); 386 pm_notifier_call_chain(PM_POST_SUSPEND);
380 pm_restore_console(); 387 pm_restore_console();
381} 388}
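Taken together, the power-management hunks bracket process freezing with the new helper calls: usermode helpers are disabled right after the PM notifier runs, and re-enabled on every unwind path. A rough stand-alone model of the ordering in suspend_prepare(); all *_stub functions below are placeholders, not kernel APIs.

#include <stdio.h>

static int  usermodehelper_disable_stub(void) { return 0; }
static void usermodehelper_enable_stub(void)  { }
static int  freeze_processes_stub(void)       { return -1; } /* pretend it fails */
static void thaw_processes_stub(void)         { }

static int suspend_prepare_model(void)
{
    int error;

    error = usermodehelper_disable_stub();
    if (error)
        return error;                 /* nothing to undo yet */

    error = freeze_processes_stub();
    if (error) {
        thaw_processes_stub();        /* unwind in reverse order */
        usermodehelper_enable_stub();
    }
    return error;
}

int main(void)
{
    printf("suspend_prepare_model() = %d\n", suspend_prepare_model());
    return 0;
}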
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a6332a313262..005b93d839ba 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -212,13 +212,20 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
212 case SNAPSHOT_FREEZE: 212 case SNAPSHOT_FREEZE:
213 if (data->frozen) 213 if (data->frozen)
214 break; 214 break;
215
215 printk("Syncing filesystems ... "); 216 printk("Syncing filesystems ... ");
216 sys_sync(); 217 sys_sync();
217 printk("done.\n"); 218 printk("done.\n");
218 219
219 error = freeze_processes(); 220 error = usermodehelper_disable();
220 if (error) 221 if (error)
222 break;
223
224 error = freeze_processes();
225 if (error) {
221 thaw_processes(); 226 thaw_processes();
227 usermodehelper_enable();
228 }
222 if (!error) 229 if (!error)
223 data->frozen = 1; 230 data->frozen = 1;
224 break; 231 break;
@@ -227,6 +234,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
227 if (!data->frozen || data->ready) 234 if (!data->frozen || data->ready)
228 break; 235 break;
229 thaw_processes(); 236 thaw_processes();
237 usermodehelper_enable();
230 data->frozen = 0; 238 data->frozen = 0;
231 break; 239 break;
232 240
diff --git a/kernel/printk.c b/kernel/printk.c
index b51b1567bb55..6341af77eb65 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -13,7 +13,7 @@
13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul 13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul
14 * manfred@colorfullife.com 14 * manfred@colorfullife.com
15 * Rewrote bits to get rid of console_lock 15 * Rewrote bits to get rid of console_lock
16 * 01Mar01 Andrew Morton <andrewm@uow.edu.au> 16 * 01Mar01 Andrew Morton
17 */ 17 */
18 18
19#include <linux/kernel.h> 19#include <linux/kernel.h>
@@ -577,9 +577,6 @@ static int have_callable_console(void)
577 * @fmt: format string 577 * @fmt: format string
578 * 578 *
579 * This is printk(). It can be called from any context. We want it to work. 579 * This is printk(). It can be called from any context. We want it to work.
580 * Be aware of the fact that if oops_in_progress is not set, we might try to
581 * wake klogd up which could deadlock on runqueue lock if printk() is called
582 * from scheduler code.
583 * 580 *
584 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 581 * We try to grab the console_sem. If we succeed, it's easy - we log the output and
585 * call the console drivers. If we fail to get the semaphore we place the output 582 * call the console drivers. If we fail to get the semaphore we place the output
@@ -593,6 +590,8 @@ static int have_callable_console(void)
593 * 590 *
594 * See also: 591 * See also:
595 * printf(3) 592 * printf(3)
593 *
594 * See the vsnprintf() documentation for format string extensions over C99.
596 */ 595 */
597 596
598asmlinkage int printk(const char *fmt, ...) 597asmlinkage int printk(const char *fmt, ...)
@@ -982,10 +981,25 @@ int is_console_locked(void)
982 return console_locked; 981 return console_locked;
983} 982}
984 983
985void wake_up_klogd(void) 984static DEFINE_PER_CPU(int, printk_pending);
985
986void printk_tick(void)
986{ 987{
987 if (!oops_in_progress && waitqueue_active(&log_wait)) 988 if (__get_cpu_var(printk_pending)) {
989 __get_cpu_var(printk_pending) = 0;
988 wake_up_interruptible(&log_wait); 990 wake_up_interruptible(&log_wait);
991 }
992}
993
994int printk_needs_cpu(int cpu)
995{
996 return per_cpu(printk_pending, cpu);
997}
998
999void wake_up_klogd(void)
1000{
1001 if (waitqueue_active(&log_wait))
1002 __raw_get_cpu_var(printk_pending) = 1;
989} 1003}
990 1004
991/** 1005/**
@@ -1291,22 +1305,6 @@ static int __init disable_boot_consoles(void)
1291} 1305}
1292late_initcall(disable_boot_consoles); 1306late_initcall(disable_boot_consoles);
1293 1307
1294/**
1295 * tty_write_message - write a message to a certain tty, not just the console.
1296 * @tty: the destination tty_struct
1297 * @msg: the message to write
1298 *
1299 * This is used for messages that need to be redirected to a specific tty.
1300 * We don't put it into the syslog queue right now maybe in the future if
1301 * really needed.
1302 */
1303void tty_write_message(struct tty_struct *tty, char *msg)
1304{
1305 if (tty && tty->ops->write)
1306 tty->ops->write(tty, msg, strlen(msg));
1307 return;
1308}
1309
1310#if defined CONFIG_PRINTK 1308#if defined CONFIG_PRINTK
1311 1309
1312/* 1310/*
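The printk.c change removes the direct klogd wake-up from printk() itself: wake_up_klogd() now only sets a per-CPU printk_pending flag, and printk_tick(), run from the timer tick, performs the real wake_up_interruptible(), so printk() no longer risks taking scheduler locks from arbitrary contexts. A user-space model of that deferral, with a plain int standing in for the per-CPU variable:

#include <stdio.h>

static int printk_pending;     /* one instance per CPU in the kernel */

static void wake_up_klogd_deferred(void)
{
    printk_pending = 1;        /* safe from any context: just a store */
}

static void printk_tick_model(void)
{
    if (printk_pending) {
        printk_pending = 0;
        puts("waking log consumer");   /* stands in for wake_up_interruptible() */
    }
}

int main(void)
{
    wake_up_klogd_deferred();  /* e.g. called from printk() in atomic context */
    printk_tick_model();       /* the next timer tick drains the flag */
    return 0;
}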
diff --git a/kernel/profile.c b/kernel/profile.c
index cd26bed4cc26..a9e422df6bf6 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -22,6 +22,8 @@
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <linux/slab.h>
26#include <linux/vmalloc.h>
25#include <asm/sections.h> 27#include <asm/sections.h>
26#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
27#include <asm/ptrace.h> 29#include <asm/ptrace.h>
@@ -50,11 +52,11 @@ static DEFINE_PER_CPU(int, cpu_profile_flip);
50static DEFINE_MUTEX(profile_flip_mutex); 52static DEFINE_MUTEX(profile_flip_mutex);
51#endif /* CONFIG_SMP */ 53#endif /* CONFIG_SMP */
52 54
53static int __init profile_setup(char *str) 55int profile_setup(char *str)
54{ 56{
55 static char __initdata schedstr[] = "schedule"; 57 static char schedstr[] = "schedule";
56 static char __initdata sleepstr[] = "sleep"; 58 static char sleepstr[] = "sleep";
57 static char __initdata kvmstr[] = "kvm"; 59 static char kvmstr[] = "kvm";
58 int par; 60 int par;
59 61
60 if (!strncmp(str, sleepstr, strlen(sleepstr))) { 62 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
@@ -100,14 +102,33 @@ static int __init profile_setup(char *str)
100__setup("profile=", profile_setup); 102__setup("profile=", profile_setup);
101 103
102 104
103void __init profile_init(void) 105int profile_init(void)
104{ 106{
107 int buffer_bytes;
105 if (!prof_on) 108 if (!prof_on)
106 return; 109 return 0;
107 110
108 /* only text is profiled */ 111 /* only text is profiled */
109 prof_len = (_etext - _stext) >> prof_shift; 112 prof_len = (_etext - _stext) >> prof_shift;
110 prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes);
116 return 0;
117 }
118
119 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
120 if (prof_buffer)
121 return 0;
122
123 prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO);
124 if (prof_buffer)
125 return 0;
126
127 prof_buffer = vmalloc(buffer_bytes);
128 if (prof_buffer)
129 return 0;
130
131 return -ENOMEM;
111} 132}
112 133
113/* Profile event notifications */ 134/* Profile event notifications */
@@ -527,7 +548,7 @@ static void __init profile_nop(void *unused)
527{ 548{
528} 549}
529 550
530static int __init create_hash_tables(void) 551static int create_hash_tables(void)
531{ 552{
532 int cpu; 553 int cpu;
533 554
@@ -575,14 +596,14 @@ out_cleanup:
575#define create_hash_tables() ({ 0; }) 596#define create_hash_tables() ({ 0; })
576#endif 597#endif
577 598
578static int __init create_proc_profile(void) 599int create_proc_profile(void)
579{ 600{
580 struct proc_dir_entry *entry; 601 struct proc_dir_entry *entry;
581 602
582 if (!prof_on) 603 if (!prof_on)
583 return 0; 604 return 0;
584 if (create_hash_tables()) 605 if (create_hash_tables())
585 return -1; 606 return -ENOMEM;
586 entry = proc_create("profile", S_IWUSR | S_IRUGO, 607 entry = proc_create("profile", S_IWUSR | S_IRUGO,
587 NULL, &proc_profile_operations); 608 NULL, &proc_profile_operations);
588 if (!entry) 609 if (!entry)
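Because the new sysfs knob can enable profiling long after boot, profile_init() now returns an error code and falls back from the boot-time allocator to kzalloc(), then alloc_pages_exact(), then vmalloc(), failing with -ENOMEM only when all of them do. The control flow in miniature; the user-space allocators below do not really differ, the point is the return-on-first-success chain.

#include <stdlib.h>
#include <stdio.h>

static void *alloc_profile_buffer(size_t bytes)
{
    void *buf = calloc(1, bytes);   /* stand-in for kzalloc(): zeroed memory */
    if (buf)
        return buf;
    buf = malloc(bytes);            /* stand-in for alloc_pages_exact()/vmalloc() */
    if (buf)
        return buf;
    return NULL;                    /* every allocator failed: -ENOMEM in the kernel */
}

int main(void)
{
    void *p = alloc_profile_buffer(1 << 20);

    printf("buffer %sallocated\n", p ? "" : "not ");
    free(p);
    return 0;
}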
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index aad93cdc9f68..37f72e551542 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -47,6 +47,7 @@
47#include <linux/notifier.h> 47#include <linux/notifier.h>
48#include <linux/cpu.h> 48#include <linux/cpu.h>
49#include <linux/mutex.h> 49#include <linux/mutex.h>
50#include <linux/time.h>
50 51
51#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
52static struct lock_class_key rcu_lock_key; 53static struct lock_class_key rcu_lock_key;
@@ -60,12 +61,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
60static struct rcu_ctrlblk rcu_ctrlblk = { 61static struct rcu_ctrlblk rcu_ctrlblk = {
61 .cur = -300, 62 .cur = -300,
62 .completed = -300, 63 .completed = -300,
64 .pending = -300,
63 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), 65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
64 .cpumask = CPU_MASK_NONE, 66 .cpumask = CPU_MASK_NONE,
65}; 67};
66static struct rcu_ctrlblk rcu_bh_ctrlblk = { 68static struct rcu_ctrlblk rcu_bh_ctrlblk = {
67 .cur = -300, 69 .cur = -300,
68 .completed = -300, 70 .completed = -300,
71 .pending = -300,
69 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), 72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
70 .cpumask = CPU_MASK_NONE, 73 .cpumask = CPU_MASK_NONE,
71}; 74};
@@ -83,7 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
83{ 86{
84 int cpu; 87 int cpu;
85 cpumask_t cpumask; 88 cpumask_t cpumask;
89 unsigned long flags;
90
86 set_need_resched(); 91 set_need_resched();
92 spin_lock_irqsave(&rcp->lock, flags);
87 if (unlikely(!rcp->signaled)) { 93 if (unlikely(!rcp->signaled)) {
88 rcp->signaled = 1; 94 rcp->signaled = 1;
89 /* 95 /*
@@ -109,6 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
109 for_each_cpu_mask_nr(cpu, cpumask) 115 for_each_cpu_mask_nr(cpu, cpumask)
110 smp_send_reschedule(cpu); 116 smp_send_reschedule(cpu);
111 } 117 }
118 spin_unlock_irqrestore(&rcp->lock, flags);
112} 119}
113#else 120#else
114static inline void force_quiescent_state(struct rcu_data *rdp, 121static inline void force_quiescent_state(struct rcu_data *rdp,
@@ -118,6 +125,126 @@ static inline void force_quiescent_state(struct rcu_data *rdp,
118} 125}
119#endif 126#endif
120 127
128static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
129 struct rcu_data *rdp)
130{
131 long batch;
132
133 head->next = NULL;
134 smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
135
136 /*
137 * Determine the batch number of this callback.
138 *
139 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
 140 * local variable "batch" and emits code like this:
141 * 1) rdp->batch = rcp->cur + 1 # gets old value
142 * ......
143 * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
144 * then [*nxttail[0], *nxttail[1]) may contain callbacks
145 * that batch# = rdp->batch, see the comment of struct rcu_data.
146 */
147 batch = ACCESS_ONCE(rcp->cur) + 1;
148
149 if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
150 /* process callbacks */
151 rdp->nxttail[0] = rdp->nxttail[1];
152 rdp->nxttail[1] = rdp->nxttail[2];
153 if (rcu_batch_after(batch - 1, rdp->batch))
154 rdp->nxttail[0] = rdp->nxttail[2];
155 }
156
157 rdp->batch = batch;
158 *rdp->nxttail[2] = head;
159 rdp->nxttail[2] = &head->next;
160
161 if (unlikely(++rdp->qlen > qhimark)) {
162 rdp->blimit = INT_MAX;
163 force_quiescent_state(rdp, &rcu_ctrlblk);
164 }
165}
166
167#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
168
169static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
170{
171 rcp->gp_start = jiffies;
172 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
173}
174
175static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
176{
177 int cpu;
178 long delta;
179 unsigned long flags;
180
181 /* Only let one CPU complain about others per time interval. */
182
183 spin_lock_irqsave(&rcp->lock, flags);
184 delta = jiffies - rcp->jiffies_stall;
185 if (delta < 2 || rcp->cur != rcp->completed) {
186 spin_unlock_irqrestore(&rcp->lock, flags);
187 return;
188 }
189 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
190 spin_unlock_irqrestore(&rcp->lock, flags);
191
192 /* OK, time to rat on our buddy... */
193
194 printk(KERN_ERR "RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask))
197 printk(" %d", cpu);
198 }
199 printk(" (detected by %d, t=%ld jiffies)\n",
200 smp_processor_id(), (long)(jiffies - rcp->gp_start));
201}
202
203static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{
205 unsigned long flags;
206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start);
210 dump_stack();
211 spin_lock_irqsave(&rcp->lock, flags);
212 if ((long)(jiffies - rcp->jiffies_stall) >= 0)
213 rcp->jiffies_stall =
214 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
215 spin_unlock_irqrestore(&rcp->lock, flags);
216 set_need_resched(); /* kick ourselves to get things going. */
217}
218
219static void check_cpu_stall(struct rcu_ctrlblk *rcp)
220{
221 long delta;
222
223 delta = jiffies - rcp->jiffies_stall;
224 if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
225
226 /* We haven't checked in, so go dump stack. */
227 print_cpu_stall(rcp);
228
229 } else if (rcp->cur != rcp->completed && delta >= 2) {
230
231 /* They had two seconds to dump stack, so complain. */
232 print_other_cpu_stall(rcp);
233 }
234}
235
236#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237
238static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
239{
240}
241
242static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
243{
244}
245
246#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
247
121/** 248/**
122 * call_rcu - Queue an RCU callback for invocation after a grace period. 249 * call_rcu - Queue an RCU callback for invocation after a grace period.
123 * @head: structure to be used for queueing the RCU updates. 250 * @head: structure to be used for queueing the RCU updates.
@@ -133,18 +260,10 @@ void call_rcu(struct rcu_head *head,
133 void (*func)(struct rcu_head *rcu)) 260 void (*func)(struct rcu_head *rcu))
134{ 261{
135 unsigned long flags; 262 unsigned long flags;
136 struct rcu_data *rdp;
137 263
138 head->func = func; 264 head->func = func;
139 head->next = NULL;
140 local_irq_save(flags); 265 local_irq_save(flags);
141 rdp = &__get_cpu_var(rcu_data); 266 __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
142 *rdp->nxttail = head;
143 rdp->nxttail = &head->next;
144 if (unlikely(++rdp->qlen > qhimark)) {
145 rdp->blimit = INT_MAX;
146 force_quiescent_state(rdp, &rcu_ctrlblk);
147 }
148 local_irq_restore(flags); 267 local_irq_restore(flags);
149} 268}
150EXPORT_SYMBOL_GPL(call_rcu); 269EXPORT_SYMBOL_GPL(call_rcu);
@@ -169,20 +288,10 @@ void call_rcu_bh(struct rcu_head *head,
169 void (*func)(struct rcu_head *rcu)) 288 void (*func)(struct rcu_head *rcu))
170{ 289{
171 unsigned long flags; 290 unsigned long flags;
172 struct rcu_data *rdp;
173 291
174 head->func = func; 292 head->func = func;
175 head->next = NULL;
176 local_irq_save(flags); 293 local_irq_save(flags);
177 rdp = &__get_cpu_var(rcu_bh_data); 294 __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
178 *rdp->nxttail = head;
179 rdp->nxttail = &head->next;
180
181 if (unlikely(++rdp->qlen > qhimark)) {
182 rdp->blimit = INT_MAX;
183 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
184 }
185
186 local_irq_restore(flags); 295 local_irq_restore(flags);
187} 296}
188EXPORT_SYMBOL_GPL(call_rcu_bh); 297EXPORT_SYMBOL_GPL(call_rcu_bh);
@@ -211,12 +320,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
211static inline void raise_rcu_softirq(void) 320static inline void raise_rcu_softirq(void)
212{ 321{
213 raise_softirq(RCU_SOFTIRQ); 322 raise_softirq(RCU_SOFTIRQ);
214 /*
215 * The smp_mb() here is required to ensure that this cpu's
216 * __rcu_process_callbacks() reads the most recently updated
217 * value of rcu->cur.
218 */
219 smp_mb();
220} 323}
221 324
222/* 325/*
@@ -225,6 +328,7 @@ static inline void raise_rcu_softirq(void)
225 */ 328 */
226static void rcu_do_batch(struct rcu_data *rdp) 329static void rcu_do_batch(struct rcu_data *rdp)
227{ 330{
331 unsigned long flags;
228 struct rcu_head *next, *list; 332 struct rcu_head *next, *list;
229 int count = 0; 333 int count = 0;
230 334
@@ -239,9 +343,9 @@ static void rcu_do_batch(struct rcu_data *rdp)
239 } 343 }
240 rdp->donelist = list; 344 rdp->donelist = list;
241 345
242 local_irq_disable(); 346 local_irq_save(flags);
243 rdp->qlen -= count; 347 rdp->qlen -= count;
244 local_irq_enable(); 348 local_irq_restore(flags);
245 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) 349 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
246 rdp->blimit = blimit; 350 rdp->blimit = blimit;
247 351
@@ -269,6 +373,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
269 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace 373 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
270 * period (if necessary). 374 * period (if necessary).
271 */ 375 */
376
272/* 377/*
273 * Register a new batch of callbacks, and start it up if there is currently no 378 * Register a new batch of callbacks, and start it up if there is currently no
274 * active batch and the batch to be registered has not already occurred. 379 * active batch and the batch to be registered has not already occurred.
@@ -276,15 +381,10 @@ static void rcu_do_batch(struct rcu_data *rdp)
276 */ 381 */
277static void rcu_start_batch(struct rcu_ctrlblk *rcp) 382static void rcu_start_batch(struct rcu_ctrlblk *rcp)
278{ 383{
279 if (rcp->next_pending && 384 if (rcp->cur != rcp->pending &&
280 rcp->completed == rcp->cur) { 385 rcp->completed == rcp->cur) {
281 rcp->next_pending = 0;
282 /*
283 * next_pending == 0 must be visible in
284 * __rcu_process_callbacks() before it can see new value of cur.
285 */
286 smp_wmb();
287 rcp->cur++; 386 rcp->cur++;
387 record_gp_stall_check_time(rcp);
288 388
289 /* 389 /*
290 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a 390 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -322,6 +422,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
322static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, 422static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
323 struct rcu_data *rdp) 423 struct rcu_data *rdp)
324{ 424{
425 unsigned long flags;
426
325 if (rdp->quiescbatch != rcp->cur) { 427 if (rdp->quiescbatch != rcp->cur) {
326 /* start new grace period: */ 428 /* start new grace period: */
327 rdp->qs_pending = 1; 429 rdp->qs_pending = 1;
@@ -345,7 +447,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
345 return; 447 return;
346 rdp->qs_pending = 0; 448 rdp->qs_pending = 0;
347 449
348 spin_lock(&rcp->lock); 450 spin_lock_irqsave(&rcp->lock, flags);
349 /* 451 /*
350 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync 452 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
351 * during cpu startup. Ignore the quiescent state. 453 * during cpu startup. Ignore the quiescent state.
@@ -353,7 +455,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
353 if (likely(rdp->quiescbatch == rcp->cur)) 455 if (likely(rdp->quiescbatch == rcp->cur))
354 cpu_quiet(rdp->cpu, rcp); 456 cpu_quiet(rdp->cpu, rcp);
355 457
356 spin_unlock(&rcp->lock); 458 spin_unlock_irqrestore(&rcp->lock, flags);
357} 459}
358 460
359 461
@@ -364,33 +466,38 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
364 * which is dead and hence not processing interrupts. 466 * which is dead and hence not processing interrupts.
365 */ 467 */
366static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, 468static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
367 struct rcu_head **tail) 469 struct rcu_head **tail, long batch)
368{ 470{
369 local_irq_disable(); 471 unsigned long flags;
370 *this_rdp->nxttail = list; 472
371 if (list) 473 if (list) {
372 this_rdp->nxttail = tail; 474 local_irq_save(flags);
373 local_irq_enable(); 475 this_rdp->batch = batch;
476 *this_rdp->nxttail[2] = list;
477 this_rdp->nxttail[2] = tail;
478 local_irq_restore(flags);
479 }
374} 480}
375 481
376static void __rcu_offline_cpu(struct rcu_data *this_rdp, 482static void __rcu_offline_cpu(struct rcu_data *this_rdp,
377 struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 483 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
378{ 484{
379 /* if the cpu going offline owns the grace period 485 unsigned long flags;
486
487 /*
488 * if the cpu going offline owns the grace period
380 * we can block indefinitely waiting for it, so flush 489 * we can block indefinitely waiting for it, so flush
381 * it here 490 * it here
382 */ 491 */
383 spin_lock_bh(&rcp->lock); 492 spin_lock_irqsave(&rcp->lock, flags);
384 if (rcp->cur != rcp->completed) 493 if (rcp->cur != rcp->completed)
385 cpu_quiet(rdp->cpu, rcp); 494 cpu_quiet(rdp->cpu, rcp);
386 spin_unlock_bh(&rcp->lock); 495 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
387 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); 496 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
388 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); 497 spin_unlock(&rcp->lock);
389 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
390 498
391 local_irq_disable();
392 this_rdp->qlen += rdp->qlen; 499 this_rdp->qlen += rdp->qlen;
393 local_irq_enable(); 500 local_irq_restore(flags);
394} 501}
395 502
396static void rcu_offline_cpu(int cpu) 503static void rcu_offline_cpu(int cpu)
@@ -420,38 +527,52 @@ static void rcu_offline_cpu(int cpu)
420static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, 527static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
421 struct rcu_data *rdp) 528 struct rcu_data *rdp)
422{ 529{
423 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { 530 unsigned long flags;
424 *rdp->donetail = rdp->curlist; 531 long completed_snap;
425 rdp->donetail = rdp->curtail;
426 rdp->curlist = NULL;
427 rdp->curtail = &rdp->curlist;
428 }
429 532
430 if (rdp->nxtlist && !rdp->curlist) { 533 if (rdp->nxtlist) {
431 local_irq_disable(); 534 local_irq_save(flags);
432 rdp->curlist = rdp->nxtlist; 535 completed_snap = ACCESS_ONCE(rcp->completed);
433 rdp->curtail = rdp->nxttail;
434 rdp->nxtlist = NULL;
435 rdp->nxttail = &rdp->nxtlist;
436 local_irq_enable();
437 536
438 /* 537 /*
439 * start the next batch of callbacks 538 * move the other grace-period-completed entries to
539 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
440 */ 540 */
541 if (!rcu_batch_before(completed_snap, rdp->batch))
542 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
543 else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
544 rdp->nxttail[0] = rdp->nxttail[1];
441 545
442 /* determine batch number */ 546 /*
443 rdp->batch = rcp->cur + 1; 547 * the grace period for entries in
444 /* see the comment and corresponding wmb() in 548 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
445 * the rcu_start_batch() 549 * move these entries to donelist
446 */ 550 */
447 smp_rmb(); 551 if (rdp->nxttail[0] != &rdp->nxtlist) {
552 *rdp->donetail = rdp->nxtlist;
553 rdp->donetail = rdp->nxttail[0];
554 rdp->nxtlist = *rdp->nxttail[0];
555 *rdp->donetail = NULL;
556
557 if (rdp->nxttail[1] == rdp->nxttail[0])
558 rdp->nxttail[1] = &rdp->nxtlist;
559 if (rdp->nxttail[2] == rdp->nxttail[0])
560 rdp->nxttail[2] = &rdp->nxtlist;
561 rdp->nxttail[0] = &rdp->nxtlist;
562 }
563
564 local_irq_restore(flags);
565
566 if (rcu_batch_after(rdp->batch, rcp->pending)) {
567 unsigned long flags2;
448 568
449 if (!rcp->next_pending) {
450 /* and start it/schedule start if it's a new batch */ 569 /* and start it/schedule start if it's a new batch */
451 spin_lock(&rcp->lock); 570 spin_lock_irqsave(&rcp->lock, flags2);
452 rcp->next_pending = 1; 571 if (rcu_batch_after(rdp->batch, rcp->pending)) {
453 rcu_start_batch(rcp); 572 rcp->pending = rdp->batch;
454 spin_unlock(&rcp->lock); 573 rcu_start_batch(rcp);
574 }
575 spin_unlock_irqrestore(&rcp->lock, flags2);
455 } 576 }
456 } 577 }
457 578
@@ -462,21 +583,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
462 583
463static void rcu_process_callbacks(struct softirq_action *unused) 584static void rcu_process_callbacks(struct softirq_action *unused)
464{ 585{
586 /*
587 * Memory references from any prior RCU read-side critical sections
588 * executed by the interrupted code must be see before any RCU
589 * grace-period manupulations below.
590 */
591
592 smp_mb(); /* See above block comment. */
593
465 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); 594 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
466 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); 595 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
596
597 /*
598 * Memory references from any later RCU read-side critical sections
 599 * executed by the interrupted code must be seen after any RCU
 600 * grace-period manipulations above.
601 */
602
603 smp_mb(); /* See above block comment. */
467} 604}
468 605
469static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 606static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
470{ 607{
471 /* This cpu has pending rcu entries and the grace period 608 /* Check for CPU stalls, if enabled. */
472 * for them has completed. 609 check_cpu_stall(rcp);
473 */
474 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
475 return 1;
476 610
477 /* This cpu has no pending entries, but there are new entries */ 611 if (rdp->nxtlist) {
478 if (!rdp->curlist && rdp->nxtlist) 612 long completed_snap = ACCESS_ONCE(rcp->completed);
479 return 1; 613
614 /*
615 * This cpu has pending rcu entries and the grace period
616 * for them has completed.
617 */
618 if (!rcu_batch_before(completed_snap, rdp->batch))
619 return 1;
620 if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
621 rdp->nxttail[0] != rdp->nxttail[1])
622 return 1;
623 if (rdp->nxttail[0] != &rdp->nxtlist)
624 return 1;
625
626 /*
627 * This cpu has pending rcu entries and the new batch
628 * for then hasn't been started nor scheduled start
629 */
630 if (rcu_batch_after(rdp->batch, rcp->pending))
631 return 1;
632 }
480 633
481 /* This cpu has finished callbacks to invoke */ 634 /* This cpu has finished callbacks to invoke */
482 if (rdp->donelist) 635 if (rdp->donelist)
@@ -512,9 +665,15 @@ int rcu_needs_cpu(int cpu)
512 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 665 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
513 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); 666 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
514 667
515 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); 668 return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
516} 669}
517 670
671/*
672 * Top-level function driving RCU grace-period detection, normally
673 * invoked from the scheduler-clock interrupt. This function simply
674 * increments counters that are read only from softirq by this same
675 * CPU, so there are no memory barriers required.
676 */
518void rcu_check_callbacks(int cpu, int user) 677void rcu_check_callbacks(int cpu, int user)
519{ 678{
520 if (user || 679 if (user ||
@@ -558,14 +717,17 @@ void rcu_check_callbacks(int cpu, int user)
558static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, 717static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
559 struct rcu_data *rdp) 718 struct rcu_data *rdp)
560{ 719{
720 unsigned long flags;
721
722 spin_lock_irqsave(&rcp->lock, flags);
561 memset(rdp, 0, sizeof(*rdp)); 723 memset(rdp, 0, sizeof(*rdp));
562 rdp->curtail = &rdp->curlist; 724 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
563 rdp->nxttail = &rdp->nxtlist;
564 rdp->donetail = &rdp->donelist; 725 rdp->donetail = &rdp->donelist;
565 rdp->quiescbatch = rcp->completed; 726 rdp->quiescbatch = rcp->completed;
566 rdp->qs_pending = 0; 727 rdp->qs_pending = 0;
567 rdp->cpu = cpu; 728 rdp->cpu = cpu;
568 rdp->blimit = blimit; 729 rdp->blimit = blimit;
730 spin_unlock_irqrestore(&rcp->lock, flags);
569} 731}
570 732
571static void __cpuinit rcu_online_cpu(int cpu) 733static void __cpuinit rcu_online_cpu(int cpu)
@@ -610,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
610 */ 772 */
611void __init __rcu_init(void) 773void __init __rcu_init(void)
612{ 774{
775#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
776 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
777#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
613 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 778 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
614 (void *)(long)smp_processor_id()); 779 (void *)(long)smp_processor_id());
615 /* Register notifier for non-boot CPUs */ 780 /* Register notifier for non-boot CPUs */
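The largest part of the rcuclassic.c rework replaces the old curlist/nxtlist pair with a single nxtlist carved into segments by the nxttail[0..2] pointers (entries whose grace period has ended, the current batch, and newly queued callbacks), so callbacks are never copied between lists; only the tail pointers move as batches complete. Below is a toy user-space model of the pointer manipulation only, with made-up names and a hand-forced "batch completed" step; it does not model the real grace-period machinery.

#include <stdio.h>

struct cb { struct cb *next; int batch; };

struct cblist {
    struct cb *head;
    struct cb **tail[3];   /* tail[2] is where new callbacks are queued */
};

static void cblist_init(struct cblist *l)
{
    l->head = NULL;
    l->tail[0] = l->tail[1] = l->tail[2] = &l->head;
}

static void cblist_enqueue(struct cblist *l, struct cb *cb, int batch)
{
    cb->next = NULL;
    cb->batch = batch;
    *l->tail[2] = cb;          /* append to the "next batch" segment */
    l->tail[2] = &cb->next;
}

/* Splice off everything up to tail[0], i.e. the callbacks whose batch ended. */
static struct cb *cblist_extract_done(struct cblist *l)
{
    struct cb *done = l->head;
    struct cb *rest;

    if (l->tail[0] == &l->head)
        return NULL;           /* nothing ready yet */
    rest = *l->tail[0];        /* remainder of the list stays queued */
    *l->tail[0] = NULL;        /* terminate the extracted sublist */
    l->head = rest;
    if (l->tail[1] == l->tail[0])
        l->tail[1] = &l->head;
    if (l->tail[2] == l->tail[0])
        l->tail[2] = &l->head;
    l->tail[0] = &l->head;
    return done;
}

int main(void)
{
    struct cblist l;
    struct cb a, b;
    struct cb *done;

    cblist_init(&l);
    cblist_enqueue(&l, &a, 1);
    cblist_enqueue(&l, &b, 2);
    /* pretend both batches completed: advance all segment boundaries */
    l.tail[0] = l.tail[1] = l.tail[2];
    for (done = cblist_extract_done(&l); done; done = done->next)
        printf("invoking callback for batch %d\n", done->batch);
    return 0;
}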
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 27827931ca0d..ca4bbbe04aa4 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -59,14 +59,6 @@
59#include <linux/rcupreempt_trace.h> 59#include <linux/rcupreempt_trace.h>
60 60
61/* 61/*
62 * Macro that prevents the compiler from reordering accesses, but does
63 * absolutely -nothing- to prevent CPUs from reordering. This is used
64 * only to mediate communication between mainline code and hardware
65 * interrupt and NMI handlers.
66 */
67#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
68
69/*
70 * PREEMPT_RCU data structures. 62 * PREEMPT_RCU data structures.
71 */ 63 */
72 64
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 5edf82c34bbc..35c2d3360ecf 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -308,11 +308,16 @@ out:
308 308
309static int __init rcupreempt_trace_init(void) 309static int __init rcupreempt_trace_init(void)
310{ 310{
311 int ret;
312
311 mutex_init(&rcupreempt_trace_mutex); 313 mutex_init(&rcupreempt_trace_mutex);
312 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); 314 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
313 if (!rcupreempt_trace_buf) 315 if (!rcupreempt_trace_buf)
314 return 1; 316 return 1;
315 return rcupreempt_debugfs_init(); 317 ret = rcupreempt_debugfs_init();
318 if (ret)
319 kfree(rcupreempt_trace_buf);
320 return ret;
316} 321}
317 322
318static void __exit rcupreempt_trace_cleanup(void) 323static void __exit rcupreempt_trace_cleanup(void)
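The rcupreempt_trace_init() change is a small leak fix: if rcupreempt_debugfs_init() fails, the freshly allocated trace buffer is now freed before the error is returned. The same pattern in miniature, with placeholder stub functions:

#include <stdlib.h>
#include <stdio.h>

static char *trace_buf;

static int debugfs_init_stub(void)   /* placeholder for rcupreempt_debugfs_init() */
{
    return -1;                       /* pretend the second init step fails */
}

static int trace_init(void)
{
    int ret;

    trace_buf = malloc(4096);
    if (!trace_buf)
        return -1;
    ret = debugfs_init_stub();
    if (ret) {
        free(trace_buf);             /* the old code leaked the buffer on this path */
        trace_buf = NULL;
    }
    return ret;
}

int main(void)
{
    printf("trace_init() = %d\n", trace_init());
    return 0;
}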
diff --git a/kernel/resource.c b/kernel/resource.c
index 03d796c1b2e9..4089d12af6e0 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -38,10 +38,6 @@ EXPORT_SYMBOL(iomem_resource);
38 38
39static DEFINE_RWLOCK(resource_lock); 39static DEFINE_RWLOCK(resource_lock);
40 40
41#ifdef CONFIG_PROC_FS
42
43enum { MAX_IORES_LEVEL = 5 };
44
45static void *r_next(struct seq_file *m, void *v, loff_t *pos) 41static void *r_next(struct seq_file *m, void *v, loff_t *pos)
46{ 42{
47 struct resource *p = v; 43 struct resource *p = v;
@@ -53,6 +49,10 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
53 return p->sibling; 49 return p->sibling;
54} 50}
55 51
52#ifdef CONFIG_PROC_FS
53
54enum { MAX_IORES_LEVEL = 5 };
55
56static void *r_start(struct seq_file *m, loff_t *pos) 56static void *r_start(struct seq_file *m, loff_t *pos)
57 __acquires(resource_lock) 57 __acquires(resource_lock)
58{ 58{
@@ -516,6 +516,70 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
516 return result; 516 return result;
517} 517}
518 518
519static void __init __reserve_region_with_split(struct resource *root,
520 resource_size_t start, resource_size_t end,
521 const char *name)
522{
523 struct resource *parent = root;
524 struct resource *conflict;
525 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
526
527 if (!res)
528 return;
529
530 res->name = name;
531 res->start = start;
532 res->end = end;
533 res->flags = IORESOURCE_BUSY;
534
535 for (;;) {
536 conflict = __request_resource(parent, res);
537 if (!conflict)
538 break;
539 if (conflict != parent) {
540 parent = conflict;
541 if (!(conflict->flags & IORESOURCE_BUSY))
542 continue;
543 }
544
545 /* Uhhuh, that didn't work out.. */
546 kfree(res);
547 res = NULL;
548 break;
549 }
550
551 if (!res) {
552 /* failed, split and try again */
553
554 /* conflict covered whole area */
555 if (conflict->start <= start && conflict->end >= end)
556 return;
557
558 if (conflict->start > start)
559 __reserve_region_with_split(root, start, conflict->start-1, name);
560 if (!(conflict->flags & IORESOURCE_BUSY)) {
561 resource_size_t common_start, common_end;
562
563 common_start = max(conflict->start, start);
564 common_end = min(conflict->end, end);
565 if (common_start < common_end)
566 __reserve_region_with_split(root, common_start, common_end, name);
567 }
568 if (conflict->end < end)
569 __reserve_region_with_split(root, conflict->end+1, end, name);
570 }
571
572}
573
574void reserve_region_with_split(struct resource *root,
575 resource_size_t start, resource_size_t end,
576 const char *name)
577{
578 write_lock(&resource_lock);
579 __reserve_region_with_split(root, start, end, name);
580 write_unlock(&resource_lock);
581}
582
519EXPORT_SYMBOL(adjust_resource); 583EXPORT_SYMBOL(adjust_resource);
520 584
521/** 585/**
@@ -562,33 +626,34 @@ struct resource * __request_region(struct resource *parent,
562{ 626{
563 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 627 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
564 628
565 if (res) { 629 if (!res)
566 res->name = name; 630 return NULL;
567 res->start = start;
568 res->end = start + n - 1;
569 res->flags = IORESOURCE_BUSY;
570 631
571 write_lock(&resource_lock); 632 res->name = name;
633 res->start = start;
634 res->end = start + n - 1;
635 res->flags = IORESOURCE_BUSY;
572 636
573 for (;;) { 637 write_lock(&resource_lock);
574 struct resource *conflict;
575 638
576 conflict = __request_resource(parent, res); 639 for (;;) {
577 if (!conflict) 640 struct resource *conflict;
578 break;
579 if (conflict != parent) {
580 parent = conflict;
581 if (!(conflict->flags & IORESOURCE_BUSY))
582 continue;
583 }
584 641
585 /* Uhhuh, that didn't work out.. */ 642 conflict = __request_resource(parent, res);
586 kfree(res); 643 if (!conflict)
587 res = NULL;
588 break; 644 break;
645 if (conflict != parent) {
646 parent = conflict;
647 if (!(conflict->flags & IORESOURCE_BUSY))
648 continue;
589 } 649 }
590 write_unlock(&resource_lock); 650
651 /* Uhhuh, that didn't work out.. */
652 kfree(res);
653 res = NULL;
654 break;
591 } 655 }
656 write_unlock(&resource_lock);
592 return res; 657 return res;
593} 658}
594EXPORT_SYMBOL(__request_region); 659EXPORT_SYMBOL(__request_region);
@@ -763,3 +828,40 @@ static int __init reserve_setup(char *str)
763} 828}
764 829
765__setup("reserve=", reserve_setup); 830__setup("reserve=", reserve_setup);
831
832/*
833 * Check if the requested addr and size spans more than any slot in the
834 * iomem resource tree.
835 */
836int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
837{
838 struct resource *p = &iomem_resource;
839 int err = 0;
840 loff_t l;
841
842 read_lock(&resource_lock);
843 for (p = p->child; p ; p = r_next(NULL, p, &l)) {
844 /*
845 * We can probably skip the resources without
846 * IORESOURCE_IO attribute?
847 */
848 if (p->start >= addr + size)
849 continue;
850 if (p->end < addr)
851 continue;
852 if (p->start <= addr && (p->end >= addr + size - 1))
853 continue;
854 printk(KERN_WARNING "resource map sanity check conflict: "
855 "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
856 (unsigned long long)addr,
857 (unsigned long long)(addr + size - 1),
858 (unsigned long long)p->start,
859 (unsigned long long)p->end,
860 p->name);
861 err = -1;
862 break;
863 }
864 read_unlock(&resource_lock);
865
866 return err;
867}
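The new iomem_map_sanity_check() walks the iomem resource tree and warns when a requested [addr, addr+size) range overlaps a slot without being fully contained in it. Its core containment test, extracted into a compilable user-space form over a flat array; the slot layout and sample values are made up.

#include <stdio.h>

struct slot { unsigned long start, end; };   /* end is inclusive, as in struct resource */

static int spans_multiple_slots(const struct slot *s, int n,
                                unsigned long addr, unsigned long size)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i].start >= addr + size || s[i].end < addr)
            continue;                          /* no overlap with this slot */
        if (s[i].start <= addr && s[i].end >= addr + size - 1)
            continue;                          /* fully contained: fine */
        return 1;                              /* partial overlap: conflict */
    }
    return 0;
}

int main(void)
{
    struct slot slots[] = { { 0x1000, 0x1fff }, { 0x2000, 0x2fff } };

    printf("%d\n", spans_multiple_slots(slots, 2, 0x1f00, 0x200)); /* 1: crosses a boundary */
    printf("%d\n", spans_multiple_slots(slots, 2, 0x1100, 0x100)); /* 0: inside one slot */
    return 0;
}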
diff --git a/kernel/sched.c b/kernel/sched.c
index e46b5afa200d..eb3c72953615 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -201,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
201 hrtimer_init(&rt_b->rt_period_timer, 201 hrtimer_init(&rt_b->rt_period_timer,
202 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 202 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
203 rt_b->rt_period_timer.function = sched_rt_period_timer; 203 rt_b->rt_period_timer.function = sched_rt_period_timer;
204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
205}
206
207static inline int rt_bandwidth_enabled(void)
208{
209 return sysctl_sched_rt_runtime >= 0;
205} 210}
206 211
207static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 212static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
208{ 213{
209 ktime_t now; 214 ktime_t now;
210 215
211 if (rt_b->rt_runtime == RUNTIME_INF) 216 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
212 return; 217 return;
213 218
214 if (hrtimer_active(&rt_b->rt_period_timer)) 219 if (hrtimer_active(&rt_b->rt_period_timer))
@@ -297,9 +302,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
297static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 302static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
298static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 303static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
299#endif /* CONFIG_RT_GROUP_SCHED */ 304#endif /* CONFIG_RT_GROUP_SCHED */
300#else /* !CONFIG_FAIR_GROUP_SCHED */ 305#else /* !CONFIG_USER_SCHED */
301#define root_task_group init_task_group 306#define root_task_group init_task_group
302#endif /* CONFIG_FAIR_GROUP_SCHED */ 307#endif /* CONFIG_USER_SCHED */
303 308
304/* task_group_lock serializes add/remove of task groups and also changes to 309/* task_group_lock serializes add/remove of task groups and also changes to
305 * a task group's cpu shares. 310 * a task group's cpu shares.
@@ -603,9 +608,9 @@ struct rq {
603 608
604static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 609static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
605 610
606static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 611static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
607{ 612{
608 rq->curr->sched_class->check_preempt_curr(rq, p); 613 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
609} 614}
610 615
611static inline int cpu_of(struct rq *rq) 616static inline int cpu_of(struct rq *rq)
@@ -1086,7 +1091,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1086 return NOTIFY_DONE; 1091 return NOTIFY_DONE;
1087} 1092}
1088 1093
1089static void init_hrtick(void) 1094static __init void init_hrtick(void)
1090{ 1095{
1091 hotcpu_notifier(hotplug_hrtick, 0); 1096 hotcpu_notifier(hotplug_hrtick, 0);
1092} 1097}
@@ -1101,7 +1106,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1101 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); 1106 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1102} 1107}
1103 1108
1104static void init_hrtick(void) 1109static inline void init_hrtick(void)
1105{ 1110{
1106} 1111}
1107#endif /* CONFIG_SMP */ 1112#endif /* CONFIG_SMP */
@@ -1118,9 +1123,9 @@ static void init_rq_hrtick(struct rq *rq)
1118 1123
1119 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1124 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1120 rq->hrtick_timer.function = hrtick; 1125 rq->hrtick_timer.function = hrtick;
1121 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1126 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1122} 1127}
1123#else 1128#else /* CONFIG_SCHED_HRTICK */
1124static inline void hrtick_clear(struct rq *rq) 1129static inline void hrtick_clear(struct rq *rq)
1125{ 1130{
1126} 1131}
@@ -1132,7 +1137,7 @@ static inline void init_rq_hrtick(struct rq *rq)
1132static inline void init_hrtick(void) 1137static inline void init_hrtick(void)
1133{ 1138{
1134} 1139}
1135#endif 1140#endif /* CONFIG_SCHED_HRTICK */
1136 1141
1137/* 1142/*
1138 * resched_task - mark a task 'to be rescheduled now'. 1143 * resched_task - mark a task 'to be rescheduled now'.
@@ -1379,38 +1384,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1379 update_load_sub(&rq->load, load); 1384 update_load_sub(&rq->load, load);
1380} 1385}
1381 1386
1382#ifdef CONFIG_SMP 1387#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1383static unsigned long source_load(int cpu, int type); 1388typedef int (*tg_visitor)(struct task_group *, void *);
1384static unsigned long target_load(int cpu, int type);
1385static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1386
1387static unsigned long cpu_avg_load_per_task(int cpu)
1388{
1389 struct rq *rq = cpu_rq(cpu);
1390
1391 if (rq->nr_running)
1392 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1393
1394 return rq->avg_load_per_task;
1395}
1396
1397#ifdef CONFIG_FAIR_GROUP_SCHED
1398
1399typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1400 1389
1401/* 1390/*
1402 * Iterate the full tree, calling @down when first entering a node and @up when 1391 * Iterate the full tree, calling @down when first entering a node and @up when
1403 * leaving it for the final time. 1392 * leaving it for the final time.
1404 */ 1393 */
1405static void 1394static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1406walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1407{ 1395{
1408 struct task_group *parent, *child; 1396 struct task_group *parent, *child;
1397 int ret;
1409 1398
1410 rcu_read_lock(); 1399 rcu_read_lock();
1411 parent = &root_task_group; 1400 parent = &root_task_group;
1412down: 1401down:
1413 (*down)(parent, cpu, sd); 1402 ret = (*down)(parent, data);
1403 if (ret)
1404 goto out_unlock;
1414 list_for_each_entry_rcu(child, &parent->children, siblings) { 1405 list_for_each_entry_rcu(child, &parent->children, siblings) {
1415 parent = child; 1406 parent = child;
1416 goto down; 1407 goto down;
@@ -1418,15 +1409,43 @@ down:
1418up: 1409up:
1419 continue; 1410 continue;
1420 } 1411 }
1421 (*up)(parent, cpu, sd); 1412 ret = (*up)(parent, data);
1413 if (ret)
1414 goto out_unlock;
1422 1415
1423 child = parent; 1416 child = parent;
1424 parent = parent->parent; 1417 parent = parent->parent;
1425 if (parent) 1418 if (parent)
1426 goto up; 1419 goto up;
1420out_unlock:
1427 rcu_read_unlock(); 1421 rcu_read_unlock();
1422
1423 return ret;
1424}
1425
1426static int tg_nop(struct task_group *tg, void *data)
1427{
1428 return 0;
1429}
1430#endif
1431
1432#ifdef CONFIG_SMP
1433static unsigned long source_load(int cpu, int type);
1434static unsigned long target_load(int cpu, int type);
1435static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1436
1437static unsigned long cpu_avg_load_per_task(int cpu)
1438{
1439 struct rq *rq = cpu_rq(cpu);
1440
1441 if (rq->nr_running)
1442 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1443
1444 return rq->avg_load_per_task;
1428} 1445}
1429 1446
1447#ifdef CONFIG_FAIR_GROUP_SCHED
1448
1430static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1449static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1431 1450
1432/* 1451/*
@@ -1485,11 +1504,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
1485 * This needs to be done in a bottom-up fashion because the rq weight of a 1504 * This needs to be done in a bottom-up fashion because the rq weight of a
1486 * parent group depends on the shares of its child groups. 1505 * parent group depends on the shares of its child groups.
1487 */ 1506 */
1488static void 1507static int tg_shares_up(struct task_group *tg, void *data)
1489tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1490{ 1508{
1491 unsigned long rq_weight = 0; 1509 unsigned long rq_weight = 0;
1492 unsigned long shares = 0; 1510 unsigned long shares = 0;
1511 struct sched_domain *sd = data;
1493 int i; 1512 int i;
1494 1513
1495 for_each_cpu_mask(i, sd->span) { 1514 for_each_cpu_mask(i, sd->span) {
@@ -1514,6 +1533,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1514 __update_group_shares_cpu(tg, i, shares, rq_weight); 1533 __update_group_shares_cpu(tg, i, shares, rq_weight);
1515 spin_unlock_irqrestore(&rq->lock, flags); 1534 spin_unlock_irqrestore(&rq->lock, flags);
1516 } 1535 }
1536
1537 return 0;
1517} 1538}
1518 1539
1519/* 1540/*
@@ -1521,10 +1542,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1521 * This needs to be done in a top-down fashion because the load of a child 1542 * This needs to be done in a top-down fashion because the load of a child
1522 * group is a fraction of its parents load. 1543 * group is a fraction of its parents load.
1523 */ 1544 */
1524static void 1545static int tg_load_down(struct task_group *tg, void *data)
1525tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1526{ 1546{
1527 unsigned long load; 1547 unsigned long load;
1548 long cpu = (long)data;
1528 1549
1529 if (!tg->parent) { 1550 if (!tg->parent) {
1530 load = cpu_rq(cpu)->load.weight; 1551 load = cpu_rq(cpu)->load.weight;
@@ -1535,11 +1556,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1535 } 1556 }
1536 1557
1537 tg->cfs_rq[cpu]->h_load = load; 1558 tg->cfs_rq[cpu]->h_load = load;
1538}
1539 1559
1540static void 1560 return 0;
1541tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1542{
1543} 1561}
1544 1562
1545static void update_shares(struct sched_domain *sd) 1563static void update_shares(struct sched_domain *sd)
@@ -1549,7 +1567,7 @@ static void update_shares(struct sched_domain *sd)
1549 1567
1550 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1568 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1551 sd->last_update = now; 1569 sd->last_update = now;
1552 walk_tg_tree(tg_nop, tg_shares_up, 0, sd); 1570 walk_tg_tree(tg_nop, tg_shares_up, sd);
1553 } 1571 }
1554} 1572}
1555 1573
@@ -1560,9 +1578,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1560 spin_lock(&rq->lock); 1578 spin_lock(&rq->lock);
1561} 1579}
1562 1580
1563static void update_h_load(int cpu) 1581static void update_h_load(long cpu)
1564{ 1582{
1565 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); 1583 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1566} 1584}
1567 1585
1568#else 1586#else
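
A hypothetical sketch of the reworked walk_tg_tree() interface used above: visitors now take an opaque void *data cookie and return an int, and a non-zero return aborts the walk (tg_count and nr_task_groups below are invented names, tg_nop is the no-op visitor from this diff):

/* Count every task group in the hierarchy; returning 0 keeps the walk going. */
static int tg_count(struct task_group *tg, void *data)
{
        (*(int *)data)++;
        return 0;               /* a non-zero return would stop walk_tg_tree() early */
}

static int nr_task_groups(void)
{
        int count = 0;

        /* use tg_nop for the direction we do not care about */
        walk_tg_tree(tg_count, tg_nop, &count);
        return count;
}
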
@@ -1920,11 +1938,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1920 running = task_running(rq, p); 1938 running = task_running(rq, p);
1921 on_rq = p->se.on_rq; 1939 on_rq = p->se.on_rq;
1922 ncsw = 0; 1940 ncsw = 0;
1923 if (!match_state || p->state == match_state) { 1941 if (!match_state || p->state == match_state)
1924 ncsw = p->nivcsw + p->nvcsw; 1942 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1925 if (unlikely(!ncsw))
1926 ncsw = 1;
1927 }
1928 task_rq_unlock(rq, &flags); 1943 task_rq_unlock(rq, &flags);
1929 1944
1930 /* 1945 /*
@@ -2284,7 +2299,7 @@ out_running:
2284 trace_mark(kernel_sched_wakeup, 2299 trace_mark(kernel_sched_wakeup,
2285 "pid %d state %ld ## rq %p task %p rq->curr %p", 2300 "pid %d state %ld ## rq %p task %p rq->curr %p",
2286 p->pid, p->state, rq, p, rq->curr); 2301 p->pid, p->state, rq, p, rq->curr);
2287 check_preempt_curr(rq, p); 2302 check_preempt_curr(rq, p, sync);
2288 2303
2289 p->state = TASK_RUNNING; 2304 p->state = TASK_RUNNING;
2290#ifdef CONFIG_SMP 2305#ifdef CONFIG_SMP
@@ -2419,7 +2434,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2419 trace_mark(kernel_sched_wakeup_new, 2434 trace_mark(kernel_sched_wakeup_new,
2420 "pid %d state %ld ## rq %p task %p rq->curr %p", 2435 "pid %d state %ld ## rq %p task %p rq->curr %p",
2421 p->pid, p->state, rq, p, rq->curr); 2436 p->pid, p->state, rq, p, rq->curr);
2422 check_preempt_curr(rq, p); 2437 check_preempt_curr(rq, p, 0);
2423#ifdef CONFIG_SMP 2438#ifdef CONFIG_SMP
2424 if (p->sched_class->task_wake_up) 2439 if (p->sched_class->task_wake_up)
2425 p->sched_class->task_wake_up(rq, p); 2440 p->sched_class->task_wake_up(rq, p);
@@ -2879,7 +2894,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2879 * Note that idle threads have a prio of MAX_PRIO, for this test 2894 * Note that idle threads have a prio of MAX_PRIO, for this test
2880 * to be always true for them. 2895 * to be always true for them.
2881 */ 2896 */
2882 check_preempt_curr(this_rq, p); 2897 check_preempt_curr(this_rq, p, 0);
2883} 2898}
2884 2899
2885/* 2900/*
@@ -4626,6 +4641,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4626} 4641}
4627EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4642EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4628 4643
4644/**
4645 * complete: - signals a single thread waiting on this completion
4646 * @x: holds the state of this particular completion
4647 *
4648 * This will wake up a single thread waiting on this completion. Threads will be
4649 * awakened in the same order in which they were queued.
4650 *
4651 * See also complete_all(), wait_for_completion() and related routines.
4652 */
4629void complete(struct completion *x) 4653void complete(struct completion *x)
4630{ 4654{
4631 unsigned long flags; 4655 unsigned long flags;
@@ -4637,6 +4661,12 @@ void complete(struct completion *x)
4637} 4661}
4638EXPORT_SYMBOL(complete); 4662EXPORT_SYMBOL(complete);
4639 4663
4664/**
4665 * complete_all: - signals all threads waiting on this completion
4666 * @x: holds the state of this particular completion
4667 *
4668 * This will wake up all threads waiting on this particular completion event.
4669 */
4640void complete_all(struct completion *x) 4670void complete_all(struct completion *x)
4641{ 4671{
4642 unsigned long flags; 4672 unsigned long flags;
@@ -4657,10 +4687,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4657 wait.flags |= WQ_FLAG_EXCLUSIVE; 4687 wait.flags |= WQ_FLAG_EXCLUSIVE;
4658 __add_wait_queue_tail(&x->wait, &wait); 4688 __add_wait_queue_tail(&x->wait, &wait);
4659 do { 4689 do {
4660 if ((state == TASK_INTERRUPTIBLE && 4690 if (signal_pending_state(state, current)) {
4661 signal_pending(current)) ||
4662 (state == TASK_KILLABLE &&
4663 fatal_signal_pending(current))) {
4664 timeout = -ERESTARTSYS; 4691 timeout = -ERESTARTSYS;
4665 break; 4692 break;
4666 } 4693 }
@@ -4688,12 +4715,31 @@ wait_for_common(struct completion *x, long timeout, int state)
4688 return timeout; 4715 return timeout;
4689} 4716}
4690 4717
4718/**
4719 * wait_for_completion: - waits for completion of a task
4720 * @x: holds the state of this particular completion
4721 *
4722 * This waits to be signaled for completion of a specific task. It is NOT
4723 * interruptible and there is no timeout.
4724 *
4725 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
4726 * and interrupt capability. Also see complete().
4727 */
4691void __sched wait_for_completion(struct completion *x) 4728void __sched wait_for_completion(struct completion *x)
4692{ 4729{
4693 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4730 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4694} 4731}
4695EXPORT_SYMBOL(wait_for_completion); 4732EXPORT_SYMBOL(wait_for_completion);
4696 4733
4734/**
4735 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4736 * @x: holds the state of this particular completion
4737 * @timeout: timeout value in jiffies
4738 *
4739 * This waits for either a completion of a specific task to be signaled or for a
4740 * specified timeout to expire. The timeout is in jiffies. It is not
4741 * interruptible.
4742 */
4697unsigned long __sched 4743unsigned long __sched
4698wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4744wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4699{ 4745{
@@ -4701,6 +4747,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4701} 4747}
4702EXPORT_SYMBOL(wait_for_completion_timeout); 4748EXPORT_SYMBOL(wait_for_completion_timeout);
4703 4749
4750/**
4751 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4752 * @x: holds the state of this particular completion
4753 *
4754 * This waits for completion of a specific task to be signaled. It is
4755 * interruptible.
4756 */
4704int __sched wait_for_completion_interruptible(struct completion *x) 4757int __sched wait_for_completion_interruptible(struct completion *x)
4705{ 4758{
4706 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4759 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4710,6 +4763,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
4710} 4763}
4711EXPORT_SYMBOL(wait_for_completion_interruptible); 4764EXPORT_SYMBOL(wait_for_completion_interruptible);
4712 4765
4766/**
4767 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4768 * @x: holds the state of this particular completion
4769 * @timeout: timeout value in jiffies
4770 *
4771 * This waits for either a completion of a specific task to be signaled or for a
4772 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4773 */
4713unsigned long __sched 4774unsigned long __sched
4714wait_for_completion_interruptible_timeout(struct completion *x, 4775wait_for_completion_interruptible_timeout(struct completion *x,
4715 unsigned long timeout) 4776 unsigned long timeout)
@@ -4718,6 +4779,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
4718} 4779}
4719EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4780EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4720 4781
4782/**
4783 * wait_for_completion_killable: - waits for completion of a task (killable)
4784 * @x: holds the state of this particular completion
4785 *
4786 * This waits to be signaled for completion of a specific task. It can be
4787 * interrupted by a kill signal.
4788 */
4721int __sched wait_for_completion_killable(struct completion *x) 4789int __sched wait_for_completion_killable(struct completion *x)
4722{ 4790{
4723 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4791 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
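
As a usage illustration of the completion API documented above (not taken from this diff; foo_dev and foo_irq are invented names), one context signals while another waits, optionally with a timeout:

/* Assumes <linux/completion.h>, <linux/interrupt.h> and <linux/jiffies.h>. */
struct foo_dev {
        struct completion done;
};

static irqreturn_t foo_irq(int irq, void *dev_id)
{
        struct foo_dev *foo = dev_id;

        complete(&foo->done);                   /* wakes exactly one waiter */
        return IRQ_HANDLED;
}

static int foo_do_io(struct foo_dev *foo)
{
        init_completion(&foo->done);
        /* ... kick off the hardware here ... */

        if (!wait_for_completion_timeout(&foo->done, msecs_to_jiffies(100)))
                return -ETIMEDOUT;              /* 0 means the timeout expired */

        return 0;
}

Using complete_all() instead would release every waiter at once, matching the new kernel-doc above.
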
@@ -5120,7 +5188,8 @@ recheck:
5120 * Do not allow realtime tasks into groups that have no runtime 5188 * Do not allow realtime tasks into groups that have no runtime
5121 * assigned. 5189 * assigned.
5122 */ 5190 */
5123 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5191 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5192 task_group(p)->rt_bandwidth.rt_runtime == 0)
5124 return -EPERM; 5193 return -EPERM;
5125#endif 5194#endif
5126 5195
@@ -5956,7 +6025,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5956 set_task_cpu(p, dest_cpu); 6025 set_task_cpu(p, dest_cpu);
5957 if (on_rq) { 6026 if (on_rq) {
5958 activate_task(rq_dest, p, 0); 6027 activate_task(rq_dest, p, 0);
5959 check_preempt_curr(rq_dest, p); 6028 check_preempt_curr(rq_dest, p, 0);
5960 } 6029 }
5961done: 6030done:
5962 ret = 1; 6031 ret = 1;
@@ -6281,7 +6350,7 @@ set_table_entry(struct ctl_table *entry,
6281static struct ctl_table * 6350static struct ctl_table *
6282sd_alloc_ctl_domain_table(struct sched_domain *sd) 6351sd_alloc_ctl_domain_table(struct sched_domain *sd)
6283{ 6352{
6284 struct ctl_table *table = sd_alloc_ctl_entry(12); 6353 struct ctl_table *table = sd_alloc_ctl_entry(13);
6285 6354
6286 if (table == NULL) 6355 if (table == NULL)
6287 return NULL; 6356 return NULL;
@@ -6309,7 +6378,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
6309 sizeof(int), 0644, proc_dointvec_minmax); 6378 sizeof(int), 0644, proc_dointvec_minmax);
6310 set_table_entry(&table[10], "flags", &sd->flags, 6379 set_table_entry(&table[10], "flags", &sd->flags,
6311 sizeof(int), 0644, proc_dointvec_minmax); 6380 sizeof(int), 0644, proc_dointvec_minmax);
6312 /* &table[11] is terminator */ 6381 set_table_entry(&table[11], "name", sd->name,
6382 CORENAME_MAX_SIZE, 0444, proc_dostring);
6383 /* &table[12] is terminator */
6313 6384
6314 return table; 6385 return table;
6315} 6386}
@@ -7193,13 +7264,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7193 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 7264 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7194 */ 7265 */
7195 7266
7267#ifdef CONFIG_SCHED_DEBUG
7268# define SD_INIT_NAME(sd, type) sd->name = #type
7269#else
7270# define SD_INIT_NAME(sd, type) do { } while (0)
7271#endif
7272
7196#define SD_INIT(sd, type) sd_init_##type(sd) 7273#define SD_INIT(sd, type) sd_init_##type(sd)
7274
7197#define SD_INIT_FUNC(type) \ 7275#define SD_INIT_FUNC(type) \
7198static noinline void sd_init_##type(struct sched_domain *sd) \ 7276static noinline void sd_init_##type(struct sched_domain *sd) \
7199{ \ 7277{ \
7200 memset(sd, 0, sizeof(*sd)); \ 7278 memset(sd, 0, sizeof(*sd)); \
7201 *sd = SD_##type##_INIT; \ 7279 *sd = SD_##type##_INIT; \
7202 sd->level = SD_LV_##type; \ 7280 sd->level = SD_LV_##type; \
7281 SD_INIT_NAME(sd, type); \
7203} 7282}
7204 7283
7205SD_INIT_FUNC(CPU) 7284SD_INIT_FUNC(CPU)
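
SD_INIT_NAME() simply stringifies the domain type with the preprocessor '#' operator when CONFIG_SCHED_DEBUG is set, and the new read-only "name" entry added to sd_alloc_ctl_domain_table() above exposes the result next to the other per-domain tunables (presumably as a "name" file under the sched_domain sysctl tree). A small user-space illustration of the stringify pattern, not kernel code:

#include <stdio.h>

struct sd_demo { const char *name; };

#define SD_INIT_NAME(sd, type)  ((sd)->name = #type)    /* '#' stringifies the token */

int main(void)
{
        struct sd_demo sd;

        SD_INIT_NAME(&sd, CPU);
        printf("%s\n", sd.name);        /* prints "CPU" */
        return 0;
}
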
@@ -7695,24 +7774,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7695 * and partition_sched_domains() will fallback to the single partition 7774 * and partition_sched_domains() will fallback to the single partition
7696 * 'fallback_doms', it also forces the domains to be rebuilt. 7775 * 'fallback_doms', it also forces the domains to be rebuilt.
7697 * 7776 *
7777 * If doms_new==NULL it will be replaced with cpu_online_map.
7778 * ndoms_new==0 is a special case for destroying existing domains.
7779 * It will not create the default domain.
7780 *
7698 * Call with hotplug lock held 7781 * Call with hotplug lock held
7699 */ 7782 */
7700void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7783void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7701 struct sched_domain_attr *dattr_new) 7784 struct sched_domain_attr *dattr_new)
7702{ 7785{
7703 int i, j; 7786 int i, j, n;
7704 7787
7705 mutex_lock(&sched_domains_mutex); 7788 mutex_lock(&sched_domains_mutex);
7706 7789
7707 /* always unregister in case we don't destroy any domains */ 7790 /* always unregister in case we don't destroy any domains */
7708 unregister_sched_domain_sysctl(); 7791 unregister_sched_domain_sysctl();
7709 7792
7710 if (doms_new == NULL) 7793 n = doms_new ? ndoms_new : 0;
7711 ndoms_new = 0;
7712 7794
7713 /* Destroy deleted domains */ 7795 /* Destroy deleted domains */
7714 for (i = 0; i < ndoms_cur; i++) { 7796 for (i = 0; i < ndoms_cur; i++) {
7715 for (j = 0; j < ndoms_new; j++) { 7797 for (j = 0; j < n; j++) {
7716 if (cpus_equal(doms_cur[i], doms_new[j]) 7798 if (cpus_equal(doms_cur[i], doms_new[j])
7717 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7799 && dattrs_equal(dattr_cur, i, dattr_new, j))
7718 goto match1; 7800 goto match1;
@@ -7725,7 +7807,6 @@ match1:
7725 7807
7726 if (doms_new == NULL) { 7808 if (doms_new == NULL) {
7727 ndoms_cur = 0; 7809 ndoms_cur = 0;
7728 ndoms_new = 1;
7729 doms_new = &fallback_doms; 7810 doms_new = &fallback_doms;
7730 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7811 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7731 dattr_new = NULL; 7812 dattr_new = NULL;
@@ -7762,8 +7843,13 @@ match2:
7762int arch_reinit_sched_domains(void) 7843int arch_reinit_sched_domains(void)
7763{ 7844{
7764 get_online_cpus(); 7845 get_online_cpus();
7846
7847 /* Destroy domains first to force the rebuild */
7848 partition_sched_domains(0, NULL, NULL);
7849
7765 rebuild_sched_domains(); 7850 rebuild_sched_domains();
7766 put_online_cpus(); 7851 put_online_cpus();
7852
7767 return 0; 7853 return 0;
7768} 7854}
7769 7855
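
Given the clarified semantics above, the two special call patterns now differ: ndoms_new==0 tears every domain down without building the fallback, while doms_new==NULL with ndoms_new==1 rebuilds the single default partition from cpu_online_map. A hedged sketch in the spirit of arch_reinit_sched_domains(); force_sched_domain_rebuild() is not a real kernel symbol:

static void force_sched_domain_rebuild(void)
{
        get_online_cpus();
        partition_sched_domains(0, NULL, NULL); /* destroy all, build no fallback   */
        partition_sched_domains(1, NULL, NULL); /* rebuild default from cpu_online_map */
        put_online_cpus();
}
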
@@ -7847,7 +7933,7 @@ static int update_sched_domains(struct notifier_block *nfb,
7847 case CPU_ONLINE_FROZEN: 7933 case CPU_ONLINE_FROZEN:
7848 case CPU_DEAD: 7934 case CPU_DEAD:
7849 case CPU_DEAD_FROZEN: 7935 case CPU_DEAD_FROZEN:
7850 partition_sched_domains(0, NULL, NULL); 7936 partition_sched_domains(1, NULL, NULL);
7851 return NOTIFY_OK; 7937 return NOTIFY_OK;
7852 7938
7853 default: 7939 default:
@@ -8234,20 +8320,25 @@ void __might_sleep(char *file, int line)
8234#ifdef in_atomic 8320#ifdef in_atomic
8235 static unsigned long prev_jiffy; /* ratelimiting */ 8321 static unsigned long prev_jiffy; /* ratelimiting */
8236 8322
8237 if ((in_atomic() || irqs_disabled()) && 8323 if ((!in_atomic() && !irqs_disabled()) ||
8238 system_state == SYSTEM_RUNNING && !oops_in_progress) { 8324 system_state != SYSTEM_RUNNING || oops_in_progress)
8239 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8325 return;
8240 return; 8326 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8241 prev_jiffy = jiffies; 8327 return;
8242 printk(KERN_ERR "BUG: sleeping function called from invalid" 8328 prev_jiffy = jiffies;
8243 " context at %s:%d\n", file, line); 8329
8244 printk("in_atomic():%d, irqs_disabled():%d\n", 8330 printk(KERN_ERR
8245 in_atomic(), irqs_disabled()); 8331 "BUG: sleeping function called from invalid context at %s:%d\n",
8246 debug_show_held_locks(current); 8332 file, line);
8247 if (irqs_disabled()) 8333 printk(KERN_ERR
8248 print_irqtrace_events(current); 8334 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8249 dump_stack(); 8335 in_atomic(), irqs_disabled(),
8250 } 8336 current->pid, current->comm);
8337
8338 debug_show_held_locks(current);
8339 if (irqs_disabled())
8340 print_irqtrace_events(current);
8341 dump_stack();
8251#endif 8342#endif
8252} 8343}
8253EXPORT_SYMBOL(__might_sleep); 8344EXPORT_SYMBOL(__might_sleep);
@@ -8745,73 +8836,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
8745static unsigned long to_ratio(u64 period, u64 runtime) 8836static unsigned long to_ratio(u64 period, u64 runtime)
8746{ 8837{
8747 if (runtime == RUNTIME_INF) 8838 if (runtime == RUNTIME_INF)
8748 return 1ULL << 16; 8839 return 1ULL << 20;
8749 8840
8750 return div64_u64(runtime << 16, period); 8841 return div64_u64(runtime << 20, period);
8751} 8842}
8752 8843
8753#ifdef CONFIG_CGROUP_SCHED 8844/* Must be called with tasklist_lock held */
8754static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8845static inline int tg_has_rt_tasks(struct task_group *tg)
8755{ 8846{
8756 struct task_group *tgi, *parent = tg->parent; 8847 struct task_struct *g, *p;
8757 unsigned long total = 0;
8758 8848
8759 if (!parent) { 8849 do_each_thread(g, p) {
8760 if (global_rt_period() < period) 8850 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8761 return 0; 8851 return 1;
8852 } while_each_thread(g, p);
8762 8853
8763 return to_ratio(period, runtime) < 8854 return 0;
8764 to_ratio(global_rt_period(), global_rt_runtime()); 8855}
8765 }
8766 8856
8767 if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) 8857struct rt_schedulable_data {
8768 return 0; 8858 struct task_group *tg;
8859 u64 rt_period;
8860 u64 rt_runtime;
8861};
8769 8862
8770 rcu_read_lock(); 8863static int tg_schedulable(struct task_group *tg, void *data)
8771 list_for_each_entry_rcu(tgi, &parent->children, siblings) { 8864{
8772 if (tgi == tg) 8865 struct rt_schedulable_data *d = data;
8773 continue; 8866 struct task_group *child;
8867 unsigned long total, sum = 0;
8868 u64 period, runtime;
8869
8870 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8871 runtime = tg->rt_bandwidth.rt_runtime;
8774 8872
8775 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8873 if (tg == d->tg) {
8776 tgi->rt_bandwidth.rt_runtime); 8874 period = d->rt_period;
8875 runtime = d->rt_runtime;
8777 } 8876 }
8778 rcu_read_unlock();
8779 8877
8780 return total + to_ratio(period, runtime) <= 8878 /*
8781 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8879 * Cannot have more runtime than the period.
8782 parent->rt_bandwidth.rt_runtime); 8880 */
8783} 8881 if (runtime > period && runtime != RUNTIME_INF)
8784#elif defined CONFIG_USER_SCHED 8882 return -EINVAL;
8785static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8786{
8787 struct task_group *tgi;
8788 unsigned long total = 0;
8789 unsigned long global_ratio =
8790 to_ratio(global_rt_period(), global_rt_runtime());
8791 8883
8792 rcu_read_lock(); 8884 /*
8793 list_for_each_entry_rcu(tgi, &task_groups, list) { 8885 * Ensure we don't starve existing RT tasks.
8794 if (tgi == tg) 8886 */
8795 continue; 8887 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8888 return -EBUSY;
8796 8889
8797 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8890 total = to_ratio(period, runtime);
8798 tgi->rt_bandwidth.rt_runtime); 8891
8892 /*
8893 * Nobody can have more than the global setting allows.
8894 */
8895 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8896 return -EINVAL;
8897
8898 /*
8899 * The sum of our children's runtime should not exceed our own.
8900 */
8901 list_for_each_entry_rcu(child, &tg->children, siblings) {
8902 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8903 runtime = child->rt_bandwidth.rt_runtime;
8904
8905 if (child == d->tg) {
8906 period = d->rt_period;
8907 runtime = d->rt_runtime;
8908 }
8909
8910 sum += to_ratio(period, runtime);
8799 } 8911 }
8800 rcu_read_unlock();
8801 8912
8802 return total + to_ratio(period, runtime) < global_ratio; 8913 if (sum > total)
8914 return -EINVAL;
8915
8916 return 0;
8803} 8917}
8804#endif
8805 8918
8806/* Must be called with tasklist_lock held */ 8919static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8807static inline int tg_has_rt_tasks(struct task_group *tg)
8808{ 8920{
8809 struct task_struct *g, *p; 8921 struct rt_schedulable_data data = {
8810 do_each_thread(g, p) { 8922 .tg = tg,
8811 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8923 .rt_period = period,
8812 return 1; 8924 .rt_runtime = runtime,
8813 } while_each_thread(g, p); 8925 };
8814 return 0; 8926
8927 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8815} 8928}
8816 8929
8817static int tg_set_bandwidth(struct task_group *tg, 8930static int tg_set_bandwidth(struct task_group *tg,
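
To make the new 20-bit fixed point concrete, a worked example using the default sysctls (sched_rt_period_us = 1000000, sched_rt_runtime_us = 950000, i.e. 95% of each second reserved for RT):

/*
 * to_ratio(1000000000, 950000000)
 *      = (950000000 << 20) / 1000000000
 *      = 0.95 * 1048576
 *      = 996147                        (95% expressed in 20-bit fixed point)
 *
 * tg_schedulable() then rejects any group whose own ratio exceeds this
 * global value, and any parent whose children's ratios sum past its own.
 */
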
@@ -8821,14 +8934,9 @@ static int tg_set_bandwidth(struct task_group *tg,
8821 8934
8822 mutex_lock(&rt_constraints_mutex); 8935 mutex_lock(&rt_constraints_mutex);
8823 read_lock(&tasklist_lock); 8936 read_lock(&tasklist_lock);
8824 if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { 8937 err = __rt_schedulable(tg, rt_period, rt_runtime);
8825 err = -EBUSY; 8938 if (err)
8826 goto unlock;
8827 }
8828 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
8829 err = -EINVAL;
8830 goto unlock; 8939 goto unlock;
8831 }
8832 8940
8833 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8941 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8834 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8942 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8897,16 +9005,25 @@ long sched_group_rt_period(struct task_group *tg)
8897 9005
8898static int sched_rt_global_constraints(void) 9006static int sched_rt_global_constraints(void)
8899{ 9007{
8900 struct task_group *tg = &root_task_group; 9008 u64 runtime, period;
8901 u64 rt_runtime, rt_period;
8902 int ret = 0; 9009 int ret = 0;
8903 9010
8904 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 9011 if (sysctl_sched_rt_period <= 0)
8905 rt_runtime = tg->rt_bandwidth.rt_runtime; 9012 return -EINVAL;
9013
9014 runtime = global_rt_runtime();
9015 period = global_rt_period();
9016
9017 /*
9018 * Sanity check on the sysctl variables.
9019 */
9020 if (runtime > period && runtime != RUNTIME_INF)
9021 return -EINVAL;
8906 9022
8907 mutex_lock(&rt_constraints_mutex); 9023 mutex_lock(&rt_constraints_mutex);
8908 if (!__rt_schedulable(tg, rt_period, rt_runtime)) 9024 read_lock(&tasklist_lock);
8909 ret = -EINVAL; 9025 ret = __rt_schedulable(NULL, 0, 0);
9026 read_unlock(&tasklist_lock);
8910 mutex_unlock(&rt_constraints_mutex); 9027 mutex_unlock(&rt_constraints_mutex);
8911 9028
8912 return ret; 9029 return ret;
@@ -8917,6 +9034,9 @@ static int sched_rt_global_constraints(void)
8917 unsigned long flags; 9034 unsigned long flags;
8918 int i; 9035 int i;
8919 9036
9037 if (sysctl_sched_rt_period <= 0)
9038 return -EINVAL;
9039
8920 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 9040 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8921 for_each_possible_cpu(i) { 9041 for_each_possible_cpu(i) {
8922 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 9042 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -8977,7 +9097,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8977 9097
8978 if (!cgrp->parent) { 9098 if (!cgrp->parent) {
8979 /* This is early initialization for the top cgroup */ 9099 /* This is early initialization for the top cgroup */
8980 init_task_group.css.cgroup = cgrp;
8981 return &init_task_group.css; 9100 return &init_task_group.css;
8982 } 9101 }
8983 9102
@@ -8986,9 +9105,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8986 if (IS_ERR(tg)) 9105 if (IS_ERR(tg))
8987 return ERR_PTR(-ENOMEM); 9106 return ERR_PTR(-ENOMEM);
8988 9107
8989 /* Bind the cgroup to task_group object we just created */
8990 tg->css.cgroup = cgrp;
8991
8992 return &tg->css; 9108 return &tg->css;
8993} 9109}
8994 9110
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e8ab096ddfe3..81787248b60f 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -118,13 +118,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
118 118
119 /* 119 /*
120 * scd->clock = clamp(scd->tick_gtod + delta, 120 * scd->clock = clamp(scd->tick_gtod + delta,
121 * max(scd->tick_gtod, scd->clock), 121 * max(scd->tick_gtod, scd->clock),
122 * scd->tick_gtod + TICK_NSEC); 122 * max(scd->clock, scd->tick_gtod + TICK_NSEC));
123 */ 123 */
124 124
125 clock = scd->tick_gtod + delta; 125 clock = scd->tick_gtod + delta;
126 min_clock = wrap_max(scd->tick_gtod, scd->clock); 126 min_clock = wrap_max(scd->tick_gtod, scd->clock);
127 max_clock = scd->tick_gtod + TICK_NSEC; 127 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
128 128
129 clock = wrap_max(clock, min_clock); 129 clock = wrap_max(clock, min_clock);
130 clock = wrap_min(clock, max_clock); 130 clock = wrap_min(clock, max_clock);
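
A worked example (made-up numbers, with TICK_NSEC taken as roughly 1,000,000 ns at HZ=1000) of why the upper bound now has to include scd->clock:

/*
 * scd->tick_gtod = 10,000,000   scd->clock = 12,000,000   raw clock = 10,500,000
 *
 * old clamp: min = max(10,000,000, 12,000,000) = 12,000,000
 *            max = 10,000,000 + 1,000,000      = 11,000,000
 *            result = 11,000,000  ->  sched_clock() jumped backwards
 *
 * new clamp: max = max(12,000,000, 11,000,000) = 12,000,000
 *            result = 12,000,000  ->  the clock holds still instead of reversing
 */
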
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index bbe6b31c3c56..ad958c1ec708 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -333,12 +333,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
333 unsigned long flags; 333 unsigned long flags;
334 int num_threads = 1; 334 int num_threads = 1;
335 335
336 rcu_read_lock();
337 if (lock_task_sighand(p, &flags)) { 336 if (lock_task_sighand(p, &flags)) {
338 num_threads = atomic_read(&p->signal->count); 337 num_threads = atomic_read(&p->signal->count);
339 unlock_task_sighand(p, &flags); 338 unlock_task_sighand(p, &flags);
340 } 339 }
341 rcu_read_unlock();
342 340
343 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); 341 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
344 SEQ_printf(m, 342 SEQ_printf(m,
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fb8994c6d4bb..18fd17172eb6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -409,64 +409,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
409} 409}
410 410
411/* 411/*
412 * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
413 * that it favours >=0 over <0.
414 *
415 * -20 |
416 * |
417 * 0 --------+-------
418 * .'
419 * 19 .'
420 *
421 */
422static unsigned long
423calc_delta_asym(unsigned long delta, struct sched_entity *se)
424{
425 struct load_weight lw = {
426 .weight = NICE_0_LOAD,
427 .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
428 };
429
430 for_each_sched_entity(se) {
431 struct load_weight *se_lw = &se->load;
432 unsigned long rw = cfs_rq_of(se)->load.weight;
433
434#ifdef CONFIG_FAIR_SCHED_GROUP
435 struct cfs_rq *cfs_rq = se->my_q;
436 struct task_group *tg = NULL
437
438 if (cfs_rq)
439 tg = cfs_rq->tg;
440
441 if (tg && tg->shares < NICE_0_LOAD) {
442 /*
443 * scale shares to what it would have been had
444 * tg->weight been NICE_0_LOAD:
445 *
446 * weight = 1024 * shares / tg->weight
447 */
448 lw.weight *= se->load.weight;
449 lw.weight /= tg->shares;
450
451 lw.inv_weight = 0;
452
453 se_lw = &lw;
454 rw += lw.weight - se->load.weight;
455 } else
456#endif
457
458 if (se->load.weight < NICE_0_LOAD) {
459 se_lw = &lw;
460 rw += NICE_0_LOAD - se->load.weight;
461 }
462
463 delta = calc_delta_mine(delta, rw, se_lw);
464 }
465
466 return delta;
467}
468
469/*
470 * Update the current task's runtime statistics. Skip current tasks that 412 * Update the current task's runtime statistics. Skip current tasks that
471 * are not in our scheduling class. 413 * are not in our scheduling class.
472 */ 414 */
@@ -586,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
586 update_load_add(&cfs_rq->load, se->load.weight); 528 update_load_add(&cfs_rq->load, se->load.weight);
587 if (!parent_entity(se)) 529 if (!parent_entity(se))
588 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 530 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
589 if (entity_is_task(se)) 531 if (entity_is_task(se)) {
590 add_cfs_task_weight(cfs_rq, se->load.weight); 532 add_cfs_task_weight(cfs_rq, se->load.weight);
533 list_add(&se->group_node, &cfs_rq->tasks);
534 }
591 cfs_rq->nr_running++; 535 cfs_rq->nr_running++;
592 se->on_rq = 1; 536 se->on_rq = 1;
593 list_add(&se->group_node, &cfs_rq->tasks);
594} 537}
595 538
596static void 539static void
@@ -599,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
599 update_load_sub(&cfs_rq->load, se->load.weight); 542 update_load_sub(&cfs_rq->load, se->load.weight);
600 if (!parent_entity(se)) 543 if (!parent_entity(se))
601 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 544 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
602 if (entity_is_task(se)) 545 if (entity_is_task(se)) {
603 add_cfs_task_weight(cfs_rq, -se->load.weight); 546 add_cfs_task_weight(cfs_rq, -se->load.weight);
547 list_del_init(&se->group_node);
548 }
604 cfs_rq->nr_running--; 549 cfs_rq->nr_running--;
605 se->on_rq = 0; 550 se->on_rq = 0;
606 list_del_init(&se->group_node);
607} 551}
608 552
609static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 553static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1085,7 +1029,6 @@ static long effective_load(struct task_group *tg, int cpu,
1085 long wl, long wg) 1029 long wl, long wg)
1086{ 1030{
1087 struct sched_entity *se = tg->se[cpu]; 1031 struct sched_entity *se = tg->se[cpu];
1088 long more_w;
1089 1032
1090 if (!tg->parent) 1033 if (!tg->parent)
1091 return wl; 1034 return wl;
@@ -1097,18 +1040,17 @@ static long effective_load(struct task_group *tg, int cpu,
1097 if (!wl && sched_feat(ASYM_EFF_LOAD)) 1040 if (!wl && sched_feat(ASYM_EFF_LOAD))
1098 return wl; 1041 return wl;
1099 1042
1100 /*
1101 * Instead of using this increment, also add the difference
1102 * between when the shares were last updated and now.
1103 */
1104 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1105 wl += more_w;
1106 wg += more_w;
1107
1108 for_each_sched_entity(se) { 1043 for_each_sched_entity(se) {
1109#define D(n) (likely(n) ? (n) : 1)
1110
1111 long S, rw, s, a, b; 1044 long S, rw, s, a, b;
1045 long more_w;
1046
1047 /*
1048 * Instead of using this increment, also add the difference
1049 * between when the shares were last updated and now.
1050 */
1051 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1052 wl += more_w;
1053 wg += more_w;
1112 1054
1113 S = se->my_q->tg->shares; 1055 S = se->my_q->tg->shares;
1114 s = se->my_q->shares; 1056 s = se->my_q->shares;
@@ -1117,7 +1059,11 @@ static long effective_load(struct task_group *tg, int cpu,
1117 a = S*(rw + wl); 1059 a = S*(rw + wl);
1118 b = S*rw + s*wg; 1060 b = S*rw + s*wg;
1119 1061
1120 wl = s*(a-b)/D(b); 1062 wl = s*(a-b);
1063
1064 if (likely(b))
1065 wl /= b;
1066
1121 /* 1067 /*
1122 * Assume the group is already running and will 1068 * Assume the group is already running and will
1123 * thus already be accounted for in the weight. 1069 * thus already be accounted for in the weight.
@@ -1126,7 +1072,6 @@ static long effective_load(struct task_group *tg, int cpu,
1126 * alter the group weight. 1072 * alter the group weight.
1127 */ 1073 */
1128 wg = 0; 1074 wg = 0;
1129#undef D
1130 } 1075 }
1131 1076
1132 return wl; 1077 return wl;
@@ -1143,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1143#endif 1088#endif
1144 1089
1145static int 1090static int
1146wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, 1091wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1147 struct task_struct *p, int prev_cpu, int this_cpu, int sync, 1092 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1148 int idx, unsigned long load, unsigned long this_load, 1093 int idx, unsigned long load, unsigned long this_load,
1149 unsigned int imbalance) 1094 unsigned int imbalance)
@@ -1158,6 +1103,11 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1158 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1103 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
1159 return 0; 1104 return 0;
1160 1105
1106 if (!sync && sched_feat(SYNC_WAKEUPS) &&
1107 curr->se.avg_overlap < sysctl_sched_migration_cost &&
1108 p->se.avg_overlap < sysctl_sched_migration_cost)
1109 sync = 1;
1110
1161 /* 1111 /*
1162 * If sync wakeup then subtract the (maximum possible) 1112 * If sync wakeup then subtract the (maximum possible)
1163 * effect of the currently running task from the load 1113 * effect of the currently running task from the load
@@ -1182,17 +1132,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1182 * a reasonable amount of time then attract this newly 1132 * a reasonable amount of time then attract this newly
1183 * woken task: 1133 * woken task:
1184 */ 1134 */
1185 if (sync && balanced) { 1135 if (sync && balanced)
1186 if (curr->se.avg_overlap < sysctl_sched_migration_cost && 1136 return 1;
1187 p->se.avg_overlap < sysctl_sched_migration_cost)
1188 return 1;
1189 }
1190 1137
1191 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1138 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1192 tl_per_task = cpu_avg_load_per_task(this_cpu); 1139 tl_per_task = cpu_avg_load_per_task(this_cpu);
1193 1140
1194 if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || 1141 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
1195 balanced) { 1142 tl_per_task)) {
1196 /* 1143 /*
1197 * This domain has SD_WAKE_AFFINE and 1144 * This domain has SD_WAKE_AFFINE and
1198 * p is cache cold in this domain, and 1145 * p is cache cold in this domain, and
@@ -1211,16 +1158,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1211 struct sched_domain *sd, *this_sd = NULL; 1158 struct sched_domain *sd, *this_sd = NULL;
1212 int prev_cpu, this_cpu, new_cpu; 1159 int prev_cpu, this_cpu, new_cpu;
1213 unsigned long load, this_load; 1160 unsigned long load, this_load;
1214 struct rq *rq, *this_rq; 1161 struct rq *this_rq;
1215 unsigned int imbalance; 1162 unsigned int imbalance;
1216 int idx; 1163 int idx;
1217 1164
1218 prev_cpu = task_cpu(p); 1165 prev_cpu = task_cpu(p);
1219 rq = task_rq(p);
1220 this_cpu = smp_processor_id(); 1166 this_cpu = smp_processor_id();
1221 this_rq = cpu_rq(this_cpu); 1167 this_rq = cpu_rq(this_cpu);
1222 new_cpu = prev_cpu; 1168 new_cpu = prev_cpu;
1223 1169
1170 if (prev_cpu == this_cpu)
1171 goto out;
1224 /* 1172 /*
1225 * 'this_sd' is the first domain that both 1173 * 'this_sd' is the first domain that both
1226 * this_cpu and prev_cpu are present in: 1174 * this_cpu and prev_cpu are present in:
@@ -1248,13 +1196,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1248 load = source_load(prev_cpu, idx); 1196 load = source_load(prev_cpu, idx);
1249 this_load = target_load(this_cpu, idx); 1197 this_load = target_load(this_cpu, idx);
1250 1198
1251 if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1199 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
1252 load, this_load, imbalance)) 1200 load, this_load, imbalance))
1253 return this_cpu; 1201 return this_cpu;
1254 1202
1255 if (prev_cpu == this_cpu)
1256 goto out;
1257
1258 /* 1203 /*
1259 * Start passive balancing when half the imbalance_pct 1204 * Start passive balancing when half the imbalance_pct
1260 * limit is reached. 1205 * limit is reached.
@@ -1281,62 +1226,20 @@ static unsigned long wakeup_gran(struct sched_entity *se)
1281 * + nice tasks. 1226 * + nice tasks.
1282 */ 1227 */
1283 if (sched_feat(ASYM_GRAN)) 1228 if (sched_feat(ASYM_GRAN))
1284 gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); 1229 gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
1285 else
1286 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
1287 1230
1288 return gran; 1231 return gran;
1289} 1232}
1290 1233
1291/* 1234/*
1292 * Should 'se' preempt 'curr'.
1293 *
1294 * |s1
1295 * |s2
1296 * |s3
1297 * g
1298 * |<--->|c
1299 *
1300 * w(c, s1) = -1
1301 * w(c, s2) = 0
1302 * w(c, s3) = 1
1303 *
1304 */
1305static int
1306wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1307{
1308 s64 gran, vdiff = curr->vruntime - se->vruntime;
1309
1310 if (vdiff < 0)
1311 return -1;
1312
1313 gran = wakeup_gran(curr);
1314 if (vdiff > gran)
1315 return 1;
1316
1317 return 0;
1318}
1319
1320/* return depth at which a sched entity is present in the hierarchy */
1321static inline int depth_se(struct sched_entity *se)
1322{
1323 int depth = 0;
1324
1325 for_each_sched_entity(se)
1326 depth++;
1327
1328 return depth;
1329}
1330
1331/*
1332 * Preempt the current task with a newly woken task if needed: 1235 * Preempt the current task with a newly woken task if needed:
1333 */ 1236 */
1334static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) 1237static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1335{ 1238{
1336 struct task_struct *curr = rq->curr; 1239 struct task_struct *curr = rq->curr;
1337 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1240 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1338 struct sched_entity *se = &curr->se, *pse = &p->se; 1241 struct sched_entity *se = &curr->se, *pse = &p->se;
1339 int se_depth, pse_depth; 1242 s64 delta_exec;
1340 1243
1341 if (unlikely(rt_prio(p->prio))) { 1244 if (unlikely(rt_prio(p->prio))) {
1342 update_rq_clock(rq); 1245 update_rq_clock(rq);
@@ -1351,6 +1254,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1351 cfs_rq_of(pse)->next = pse; 1254 cfs_rq_of(pse)->next = pse;
1352 1255
1353 /* 1256 /*
 1257 * We can come here with TIF_NEED_RESCHED already set from the new task
 1258 * wakeup path.
1259 */
1260 if (test_tsk_need_resched(curr))
1261 return;
1262
1263 /*
1354 * Batch tasks do not preempt (their preemption is driven by 1264 * Batch tasks do not preempt (their preemption is driven by
1355 * the tick): 1265 * the tick):
1356 */ 1266 */
@@ -1360,33 +1270,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1360 if (!sched_feat(WAKEUP_PREEMPT)) 1270 if (!sched_feat(WAKEUP_PREEMPT))
1361 return; 1271 return;
1362 1272
1363 /* 1273 if (sched_feat(WAKEUP_OVERLAP) && (sync ||
1364 * preemption test can be made between sibling entities who are in the 1274 (se->avg_overlap < sysctl_sched_migration_cost &&
1365 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of 1275 pse->avg_overlap < sysctl_sched_migration_cost))) {
1366 * both tasks until we find their ancestors who are siblings of common 1276 resched_task(curr);
1367 * parent. 1277 return;
1368 */
1369
1370 /* First walk up until both entities are at same depth */
1371 se_depth = depth_se(se);
1372 pse_depth = depth_se(pse);
1373
1374 while (se_depth > pse_depth) {
1375 se_depth--;
1376 se = parent_entity(se);
1377 }
1378
1379 while (pse_depth > se_depth) {
1380 pse_depth--;
1381 pse = parent_entity(pse);
1382 }
1383
1384 while (!is_same_group(se, pse)) {
1385 se = parent_entity(se);
1386 pse = parent_entity(pse);
1387 } 1278 }
1388 1279
1389 if (wakeup_preempt_entity(se, pse) == 1) 1280 delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1281 if (delta_exec > wakeup_gran(pse))
1390 resched_task(curr); 1282 resched_task(curr);
1391} 1283}
1392 1284
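
With the ASYM_GRAN change above, wakeup_gran() now scales the granularity by the waker's weight via calc_delta_mine(), and preemption is decided by how long the current task has run since it last preempted, rather than by walking the hierarchy and comparing vruntimes. Rough numbers, assuming a wakeup granularity of about 5 ms:

/*
 * gran(pse) ~= sysctl_sched_wakeup_granularity * NICE_0_LOAD / pse->load.weight
 *
 *   nice  0 waker (weight 1024):  5 ms * 1024/1024 ~=  5.0 ms
 *   nice +5 waker (weight  335):  5 ms * 1024/335  ~= 15.3 ms
 *
 * curr is preempted once (sum_exec_runtime - prev_sum_exec_runtime) exceeds
 * the waker's gran, so a low-weight waker must let curr run roughly three
 * times longer than a nice-0 waker would before it can preempt.
 */
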
@@ -1445,19 +1337,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1445 if (next == &cfs_rq->tasks) 1337 if (next == &cfs_rq->tasks)
1446 return NULL; 1338 return NULL;
1447 1339
1448 /* Skip over entities that are not tasks */ 1340 se = list_entry(next, struct sched_entity, group_node);
1449 do { 1341 p = task_of(se);
1450 se = list_entry(next, struct sched_entity, group_node); 1342 cfs_rq->balance_iterator = next->next;
1451 next = next->next;
1452 } while (next != &cfs_rq->tasks && !entity_is_task(se));
1453
1454 if (next == &cfs_rq->tasks)
1455 return NULL;
1456
1457 cfs_rq->balance_iterator = next;
1458
1459 if (entity_is_task(se))
1460 p = task_of(se);
1461 1343
1462 return p; 1344 return p;
1463} 1345}
@@ -1507,7 +1389,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1507 rcu_read_lock(); 1389 rcu_read_lock();
1508 update_h_load(busiest_cpu); 1390 update_h_load(busiest_cpu);
1509 1391
1510 list_for_each_entry(tg, &task_groups, list) { 1392 list_for_each_entry_rcu(tg, &task_groups, list) {
1511 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; 1393 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
1512 unsigned long busiest_h_load = busiest_cfs_rq->h_load; 1394 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
1513 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 1395 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
@@ -1620,10 +1502,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1620 * 'current' within the tree based on its new key value. 1502 * 'current' within the tree based on its new key value.
1621 */ 1503 */
1622 swap(curr->vruntime, se->vruntime); 1504 swap(curr->vruntime, se->vruntime);
1505 resched_task(rq->curr);
1623 } 1506 }
1624 1507
1625 enqueue_task_fair(rq, p, 0); 1508 enqueue_task_fair(rq, p, 0);
1626 resched_task(rq->curr);
1627} 1509}
1628 1510
1629/* 1511/*
@@ -1642,7 +1524,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
1642 if (p->prio > oldprio) 1524 if (p->prio > oldprio)
1643 resched_task(rq->curr); 1525 resched_task(rq->curr);
1644 } else 1526 } else
1645 check_preempt_curr(rq, p); 1527 check_preempt_curr(rq, p, 0);
1646} 1528}
1647 1529
1648/* 1530/*
@@ -1659,7 +1541,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
1659 if (running) 1541 if (running)
1660 resched_task(rq->curr); 1542 resched_task(rq->curr);
1661 else 1543 else
1662 check_preempt_curr(rq, p); 1544 check_preempt_curr(rq, p, 0);
1663} 1545}
1664 1546
1665/* Account for a task changing its policy or group. 1547/* Account for a task changing its policy or group.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 9353ca78154e..7c9e8f4a049f 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1)
11SCHED_FEAT(LB_BIAS, 1) 11SCHED_FEAT(LB_BIAS, 1)
12SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 12SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1) 13SCHED_FEAT(ASYM_EFF_LOAD, 1)
14SCHED_FEAT(WAKEUP_OVERLAP, 0)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3a4f92dbbe66..dec4ccabe2f5 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
76 if (running) 76 if (running)
77 resched_task(rq->curr); 77 resched_task(rq->curr);
78 else 78 else
79 check_preempt_curr(rq, p); 79 check_preempt_curr(rq, p, 0);
80} 80}
81 81
82static void prio_changed_idle(struct rq *rq, struct task_struct *p, 82static void prio_changed_idle(struct rq *rq, struct task_struct *p,
@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
93 if (p->prio > oldprio) 93 if (p->prio > oldprio)
94 resched_task(rq->curr); 94 resched_task(rq->curr);
95 } else 95 } else
96 check_preempt_curr(rq, p); 96 check_preempt_curr(rq, p, 0);
97} 97}
98 98
99/* 99/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 552310798dad..cdf5740ab03e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
102 102
103static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 103static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
104{ 104{
105 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
105 struct sched_rt_entity *rt_se = rt_rq->rt_se; 106 struct sched_rt_entity *rt_se = rt_rq->rt_se;
106 107
107 if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { 108 if (rt_rq->rt_nr_running) {
108 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 109 if (rt_se && !on_rt_rq(rt_se))
109 110 enqueue_rt_entity(rt_se);
110 enqueue_rt_entity(rt_se);
111 if (rt_rq->highest_prio < curr->prio) 111 if (rt_rq->highest_prio < curr->prio)
112 resched_task(curr); 112 resched_task(curr);
113 } 113 }
@@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
231#endif /* CONFIG_RT_GROUP_SCHED */ 231#endif /* CONFIG_RT_GROUP_SCHED */
232 232
233#ifdef CONFIG_SMP 233#ifdef CONFIG_SMP
234/*
235 * We ran out of runtime, see if we can borrow some from our neighbours.
236 */
234static int do_balance_runtime(struct rt_rq *rt_rq) 237static int do_balance_runtime(struct rt_rq *rt_rq)
235{ 238{
236 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 239 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
250 continue; 253 continue;
251 254
252 spin_lock(&iter->rt_runtime_lock); 255 spin_lock(&iter->rt_runtime_lock);
256 /*
257 * Either all rqs have inf runtime and there's nothing to steal
258 * or __disable_runtime() below sets a specific rq to inf to
 259 * indicate it's been disabled and disallow stealing.
260 */
253 if (iter->rt_runtime == RUNTIME_INF) 261 if (iter->rt_runtime == RUNTIME_INF)
254 goto next; 262 goto next;
255 263
264 /*
265 * From runqueues with spare time, take 1/n part of their
266 * spare time, but no more than our period.
267 */
256 diff = iter->rt_runtime - iter->rt_time; 268 diff = iter->rt_runtime - iter->rt_time;
257 if (diff > 0) { 269 if (diff > 0) {
258 diff = div_u64((u64)diff, weight); 270 diff = div_u64((u64)diff, weight);
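
Concretely, `weight` here is the number of CPUs in the root domain span, so each borrower takes at most a 1/n slice of a neighbour's unused budget. A made-up example for a 4-CPU domain:

/*
 * iter->rt_runtime = 100 ms, iter->rt_time = 60 ms  ->  diff = 40 ms spare
 * weight = 4 CPUs                                   ->  diff = 40 / 4 = 10 ms
 *
 * The borrower's rt_runtime grows by those 10 ms, clipped so it never
 * exceeds its own rt_period, and the lender's rt_runtime shrinks by the
 * same amount.
 */
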
@@ -274,6 +286,9 @@ next:
274 return more; 286 return more;
275} 287}
276 288
289/*
 290 * Ensure this RQ takes back all the runtime it lent to its neighbours.
291 */
277static void __disable_runtime(struct rq *rq) 292static void __disable_runtime(struct rq *rq)
278{ 293{
279 struct root_domain *rd = rq->rd; 294 struct root_domain *rd = rq->rd;
@@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq)
289 304
290 spin_lock(&rt_b->rt_runtime_lock); 305 spin_lock(&rt_b->rt_runtime_lock);
291 spin_lock(&rt_rq->rt_runtime_lock); 306 spin_lock(&rt_rq->rt_runtime_lock);
307 /*
308 * Either we're all inf and nobody needs to borrow, or we're
309 * already disabled and thus have nothing to do, or we have
310 * exactly the right amount of runtime to take out.
311 */
292 if (rt_rq->rt_runtime == RUNTIME_INF || 312 if (rt_rq->rt_runtime == RUNTIME_INF ||
293 rt_rq->rt_runtime == rt_b->rt_runtime) 313 rt_rq->rt_runtime == rt_b->rt_runtime)
294 goto balanced; 314 goto balanced;
295 spin_unlock(&rt_rq->rt_runtime_lock); 315 spin_unlock(&rt_rq->rt_runtime_lock);
296 316
317 /*
318 * Calculate the difference between what we started out with
 319 * and what we currently have; that's the amount of runtime
 320 * we lent out and now have to reclaim.
321 */
297 want = rt_b->rt_runtime - rt_rq->rt_runtime; 322 want = rt_b->rt_runtime - rt_rq->rt_runtime;
298 323
324 /*
325 * Greedy reclaim, take back as much as we can.
326 */
299 for_each_cpu_mask(i, rd->span) { 327 for_each_cpu_mask(i, rd->span) {
300 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
301 s64 diff; 329 s64 diff;
302 330
331 /*
332 * Can't reclaim from ourselves or disabled runqueues.
333 */
303 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) 334 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
304 continue; 335 continue;
305 336
@@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq)
319 } 350 }
320 351
321 spin_lock(&rt_rq->rt_runtime_lock); 352 spin_lock(&rt_rq->rt_runtime_lock);
353 /*
354 * We cannot be left wanting - that would mean some runtime
355 * leaked out of the system.
356 */
322 BUG_ON(want); 357 BUG_ON(want);
323balanced: 358balanced:
359 /*
360 * Disable all the borrow logic by pretending we have inf
361 * runtime - in which case borrowing doesn't make sense.
362 */
324 rt_rq->rt_runtime = RUNTIME_INF; 363 rt_rq->rt_runtime = RUNTIME_INF;
325 spin_unlock(&rt_rq->rt_runtime_lock); 364 spin_unlock(&rt_rq->rt_runtime_lock);
326 spin_unlock(&rt_b->rt_runtime_lock); 365 spin_unlock(&rt_b->rt_runtime_lock);
@@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq)
343 if (unlikely(!scheduler_running)) 382 if (unlikely(!scheduler_running))
344 return; 383 return;
345 384
385 /*
386 * Reset each runqueue's bandwidth settings
387 */
346 for_each_leaf_rt_rq(rt_rq, rq) { 388 for_each_leaf_rt_rq(rt_rq, rq) {
347 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 389 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
348 390
@@ -350,6 +392,7 @@ static void __enable_runtime(struct rq *rq)
350 spin_lock(&rt_rq->rt_runtime_lock); 392 spin_lock(&rt_rq->rt_runtime_lock);
351 rt_rq->rt_runtime = rt_b->rt_runtime; 393 rt_rq->rt_runtime = rt_b->rt_runtime;
352 rt_rq->rt_time = 0; 394 rt_rq->rt_time = 0;
395 rt_rq->rt_throttled = 0;
353 spin_unlock(&rt_rq->rt_runtime_lock); 396 spin_unlock(&rt_rq->rt_runtime_lock);
354 spin_unlock(&rt_b->rt_runtime_lock); 397 spin_unlock(&rt_b->rt_runtime_lock);
355 } 398 }
@@ -388,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
388 int i, idle = 1; 431 int i, idle = 1;
389 cpumask_t span; 432 cpumask_t span;
390 433
391 if (rt_b->rt_runtime == RUNTIME_INF) 434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
392 return 1; 435 return 1;
393 436
394 span = sched_rt_period_mask(); 437 span = sched_rt_period_mask();
@@ -486,6 +529,9 @@ static void update_curr_rt(struct rq *rq)
486 curr->se.exec_start = rq->clock; 529 curr->se.exec_start = rq->clock;
487 cpuacct_charge(curr, delta_exec); 530 cpuacct_charge(curr, delta_exec);
488 531
532 if (!rt_bandwidth_enabled())
533 return;
534
489 for_each_sched_rt_entity(rt_se) { 535 for_each_sched_rt_entity(rt_se) {
490 rt_rq = rt_rq_of_se(rt_se); 536 rt_rq = rt_rq_of_se(rt_se);
491 537
@@ -783,7 +829,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
783/* 829/*
784 * Preempt the current task with a newly woken task if needed: 830 * Preempt the current task with a newly woken task if needed:
785 */ 831 */
786static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) 832static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
787{ 833{
788 if (p->prio < rq->curr->prio) { 834 if (p->prio < rq->curr->prio) {
789 resched_task(rq->curr); 835 resched_task(rq->curr);
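
The __disable_runtime() hunk above is a greedy reclaim: the runqueue computes how much bandwidth it lent out ("want") and walks the other runqueues in its root domain, pulling runtime back until want reaches zero. The following is a minimal user-space sketch of that reclaim arithmetic only; the per-CPU values, NR_CPUS and base_runtime are hypothetical stand-ins, and the kernel's locking and RUNTIME_INF handling are deliberately left out.

#include <stdio.h>

#define NR_CPUS 4

/* Hypothetical stand-ins: runtime currently held per CPU (ns). */
static long long rt_runtime[NR_CPUS] = {
	 700000000, 1100000000, 1150000000, 1050000000
};
static const long long base_runtime = 950000000;	/* rt_b->rt_runtime */

/* Reclaim, for CPU 'self', the runtime it lent to the other CPUs. */
static void disable_runtime(int self)
{
	long long want = base_runtime - rt_runtime[self];	/* amount lent out */
	int i;

	for (i = 0; i < NR_CPUS && want > 0; i++) {
		long long diff;

		if (i == self)
			continue;			/* can't reclaim from ourselves */

		diff = rt_runtime[i] - base_runtime;	/* what this CPU borrowed */
		if (diff <= 0)
			continue;

		if (diff > want)
			diff = want;			/* only take back what we are owed */

		rt_runtime[i] -= diff;
		want -= diff;
		printf("reclaimed %lld ns from cpu %d\n", diff, i);
	}

	if (want)
		printf("BUG: %lld ns of runtime leaked\n", want);
}

int main(void)
{
	disable_runtime(0);
	return 0;
}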
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c506f266a6b9..37d67aa2d56f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -46,7 +46,7 @@ irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
46EXPORT_SYMBOL(irq_stat); 46EXPORT_SYMBOL(irq_stat);
47#endif 47#endif
48 48
49static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; 49static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
50 50
51static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 51static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
52 52
@@ -205,7 +205,18 @@ restart:
205 205
206 do { 206 do {
207 if (pending & 1) { 207 if (pending & 1) {
208 int prev_count = preempt_count();
209
208 h->action(h); 210 h->action(h);
211
212 if (unlikely(prev_count != preempt_count())) {
 213 printk(KERN_ERR "huh, entered softirq %td %p "
 214 "with preempt_count %08x,"
215 " exited with %08x?\n", h - softirq_vec,
216 h->action, prev_count, preempt_count());
217 preempt_count() = prev_count;
218 }
219
209 rcu_bh_qsctr_inc(cpu); 220 rcu_bh_qsctr_inc(cpu);
210 } 221 }
211 h++; 222 h++;
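
The softirq.c hunk above snapshots preempt_count() before each handler, warns if the handler returned with a different count, and then restores the old value so the imbalance does not propagate. A user-space analogue of that snapshot-compare-restore check, with a plain counter standing in for the per-task preempt count (the handler and counter here are hypothetical, not kernel code):

#include <stdio.h>

static int preempt_count;	/* stand-in for the per-task preempt counter */

/* A buggy handler that "forgets" to release what it took. */
static void broken_handler(void)
{
	preempt_count++;	/* e.g. a lock taken without the matching unlock */
}

static void run_handler(void (*action)(void), const char *name)
{
	int prev_count = preempt_count;

	action();

	if (prev_count != preempt_count) {
		fprintf(stderr,
			"huh, handler %s entered with preempt_count %08x, exited with %08x?\n",
			name, prev_count, preempt_count);
		preempt_count = prev_count;	/* repair the damage and carry on */
	}
}

int main(void)
{
	run_handler(broken_handler, "broken_handler");
	return 0;
}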
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index cb838ee93a82..3953e4aed733 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -226,7 +226,7 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
226 * If the system crashed already then all bets are off, 226 * If the system crashed already then all bets are off,
227 * do not report extra hung tasks: 227 * do not report extra hung tasks:
228 */ 228 */
229 if ((tainted & TAINT_DIE) || did_panic) 229 if (test_taint(TAINT_DIE) || did_panic)
230 return; 230 return;
231 231
232 read_lock(&tasklist_lock); 232 read_lock(&tasklist_lock);
diff --git a/kernel/sys.c b/kernel/sys.c
index 1b96401a0576..fc71f99fb469 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1060,9 +1060,7 @@ asmlinkage long sys_setsid(void)
1060 group_leader->signal->leader = 1; 1060 group_leader->signal->leader = 1;
1061 __set_special_pids(sid); 1061 __set_special_pids(sid);
1062 1062
1063 spin_lock(&group_leader->sighand->siglock); 1063 proc_clear_tty(group_leader);
1064 group_leader->signal->tty = NULL;
1065 spin_unlock(&group_leader->sighand->siglock);
1066 1064
1067 err = session; 1065 err = session;
1068out: 1066out:
@@ -1351,8 +1349,10 @@ asmlinkage long sys_sethostname(char __user *name, int len)
1351 down_write(&uts_sem); 1349 down_write(&uts_sem);
1352 errno = -EFAULT; 1350 errno = -EFAULT;
1353 if (!copy_from_user(tmp, name, len)) { 1351 if (!copy_from_user(tmp, name, len)) {
1354 memcpy(utsname()->nodename, tmp, len); 1352 struct new_utsname *u = utsname();
1355 utsname()->nodename[len] = 0; 1353
1354 memcpy(u->nodename, tmp, len);
1355 memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1356 errno = 0; 1356 errno = 0;
1357 } 1357 }
1358 up_write(&uts_sem); 1358 up_write(&uts_sem);
@@ -1364,15 +1364,17 @@ asmlinkage long sys_sethostname(char __user *name, int len)
1364asmlinkage long sys_gethostname(char __user *name, int len) 1364asmlinkage long sys_gethostname(char __user *name, int len)
1365{ 1365{
1366 int i, errno; 1366 int i, errno;
1367 struct new_utsname *u;
1367 1368
1368 if (len < 0) 1369 if (len < 0)
1369 return -EINVAL; 1370 return -EINVAL;
1370 down_read(&uts_sem); 1371 down_read(&uts_sem);
1371 i = 1 + strlen(utsname()->nodename); 1372 u = utsname();
1373 i = 1 + strlen(u->nodename);
1372 if (i > len) 1374 if (i > len)
1373 i = len; 1375 i = len;
1374 errno = 0; 1376 errno = 0;
1375 if (copy_to_user(name, utsname()->nodename, i)) 1377 if (copy_to_user(name, u->nodename, i))
1376 errno = -EFAULT; 1378 errno = -EFAULT;
1377 up_read(&uts_sem); 1379 up_read(&uts_sem);
1378 return errno; 1380 return errno;
@@ -1397,8 +1399,10 @@ asmlinkage long sys_setdomainname(char __user *name, int len)
1397 down_write(&uts_sem); 1399 down_write(&uts_sem);
1398 errno = -EFAULT; 1400 errno = -EFAULT;
1399 if (!copy_from_user(tmp, name, len)) { 1401 if (!copy_from_user(tmp, name, len)) {
1400 memcpy(utsname()->domainname, tmp, len); 1402 struct new_utsname *u = utsname();
1401 utsname()->domainname[len] = 0; 1403
1404 memcpy(u->domainname, tmp, len);
1405 memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1402 errno = 0; 1406 errno = 0;
1403 } 1407 }
1404 up_write(&uts_sem); 1408 up_write(&uts_sem);
@@ -1452,14 +1456,22 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1452 return -EINVAL; 1456 return -EINVAL;
1453 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1457 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1454 return -EFAULT; 1458 return -EFAULT;
1455 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1456 return -EINVAL;
1457 old_rlim = current->signal->rlim + resource; 1459 old_rlim = current->signal->rlim + resource;
1458 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1460 if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
1459 !capable(CAP_SYS_RESOURCE)) 1461 !capable(CAP_SYS_RESOURCE))
1460 return -EPERM; 1462 return -EPERM;
1461 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) 1463
1462 return -EPERM; 1464 if (resource == RLIMIT_NOFILE) {
1465 if (new_rlim.rlim_max == RLIM_INFINITY)
1466 new_rlim.rlim_max = sysctl_nr_open;
1467 if (new_rlim.rlim_cur == RLIM_INFINITY)
1468 new_rlim.rlim_cur = sysctl_nr_open;
1469 if (new_rlim.rlim_max > sysctl_nr_open)
1470 return -EPERM;
1471 }
1472
1473 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1474 return -EINVAL;
1463 1475
1464 retval = security_task_setrlimit(resource, &new_rlim); 1476 retval = security_task_setrlimit(resource, &new_rlim);
1465 if (retval) 1477 if (retval)
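
The sys_setrlimit() hunk above moves the rlim_cur > rlim_max check after the RLIMIT_NOFILE handling, so an RLIM_INFINITY request is first clamped to sysctl_nr_open and only then validated. A small sketch of that ordering, using hypothetical local values and return codes rather than the kernel's structures and error paths:

#include <stdio.h>

#define RLIM_INFINITY	(~0UL)

struct rlimit_s { unsigned long rlim_cur, rlim_max; };

static unsigned long sysctl_nr_open = 1024 * 1024;	/* assumed ceiling */

static int set_nofile_limit(struct rlimit_s new_rlim)
{
	/* Clamp "unlimited" requests to the nr_open ceiling first ... */
	if (new_rlim.rlim_max == RLIM_INFINITY)
		new_rlim.rlim_max = sysctl_nr_open;
	if (new_rlim.rlim_cur == RLIM_INFINITY)
		new_rlim.rlim_cur = sysctl_nr_open;
	if (new_rlim.rlim_max > sysctl_nr_open)
		return -1;			/* -EPERM in the kernel */

	/* ... and only then check cur against max, so INFINITY/INFINITY works. */
	if (new_rlim.rlim_cur > new_rlim.rlim_max)
		return -2;			/* -EINVAL in the kernel */

	printf("accepted: cur=%lu max=%lu\n", new_rlim.rlim_cur, new_rlim.rlim_max);
	return 0;
}

int main(void)
{
	struct rlimit_s unlimited = { RLIM_INFINITY, RLIM_INFINITY };

	return set_nofile_limit(unlimited) ? 1 : 0;
}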
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 08d6e1bb99ac..a77b27b11b04 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -125,6 +125,12 @@ cond_syscall(sys_vm86old);
125cond_syscall(sys_vm86); 125cond_syscall(sys_vm86);
126cond_syscall(compat_sys_ipc); 126cond_syscall(compat_sys_ipc);
127cond_syscall(compat_sys_sysctl); 127cond_syscall(compat_sys_sysctl);
128cond_syscall(sys_flock);
129cond_syscall(sys_io_setup);
130cond_syscall(sys_io_destroy);
131cond_syscall(sys_io_submit);
132cond_syscall(sys_io_cancel);
133cond_syscall(sys_io_getevents);
128 134
129/* arch-specific weak syscall entries */ 135/* arch-specific weak syscall entries */
130cond_syscall(sys_pciconfig_read); 136cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 50ec0886fa3d..617d41e4d6a0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -80,7 +80,6 @@ extern int pid_max_min, pid_max_max;
80extern int sysctl_drop_caches; 80extern int sysctl_drop_caches;
81extern int percpu_pagelist_fraction; 81extern int percpu_pagelist_fraction;
82extern int compat_log; 82extern int compat_log;
83extern int maps_protect;
84extern int latencytop_enabled; 83extern int latencytop_enabled;
85extern int sysctl_nr_open_min, sysctl_nr_open_max; 84extern int sysctl_nr_open_min, sysctl_nr_open_max;
86#ifdef CONFIG_RCU_TORTURE_TEST 85#ifdef CONFIG_RCU_TORTURE_TEST
@@ -97,7 +96,7 @@ static int sixty = 60;
97static int neg_one = -1; 96static int neg_one = -1;
98#endif 97#endif
99 98
100#ifdef CONFIG_MMU 99#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
101static int two = 2; 100static int two = 2;
102#endif 101#endif
103 102
@@ -118,10 +117,8 @@ extern char modprobe_path[];
118extern int sg_big_buff; 117extern int sg_big_buff;
119#endif 118#endif
120 119
121#ifdef __sparc__ 120#ifdef CONFIG_SPARC
122extern char reboot_command []; 121#include <asm/system.h>
123extern int stop_a_enabled;
124extern int scons_pwroff;
125#endif 122#endif
126 123
127#ifdef __hppa__ 124#ifdef __hppa__
@@ -152,7 +149,7 @@ extern int max_lock_depth;
152#ifdef CONFIG_PROC_SYSCTL 149#ifdef CONFIG_PROC_SYSCTL
153static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 150static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
154 void __user *buffer, size_t *lenp, loff_t *ppos); 151 void __user *buffer, size_t *lenp, loff_t *ppos);
155static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, 152static int proc_taint(struct ctl_table *table, int write, struct file *filp,
156 void __user *buffer, size_t *lenp, loff_t *ppos); 153 void __user *buffer, size_t *lenp, loff_t *ppos);
157#endif 154#endif
158 155
@@ -382,10 +379,9 @@ static struct ctl_table kern_table[] = {
382#ifdef CONFIG_PROC_SYSCTL 379#ifdef CONFIG_PROC_SYSCTL
383 { 380 {
384 .procname = "tainted", 381 .procname = "tainted",
385 .data = &tainted, 382 .maxlen = sizeof(long),
386 .maxlen = sizeof(int),
387 .mode = 0644, 383 .mode = 0644,
388 .proc_handler = &proc_dointvec_taint, 384 .proc_handler = &proc_taint,
389 }, 385 },
390#endif 386#endif
391#ifdef CONFIG_LATENCYTOP 387#ifdef CONFIG_LATENCYTOP
@@ -415,7 +411,7 @@ static struct ctl_table kern_table[] = {
415 .mode = 0644, 411 .mode = 0644,
416 .proc_handler = &proc_dointvec, 412 .proc_handler = &proc_dointvec,
417 }, 413 },
418#ifdef __sparc__ 414#ifdef CONFIG_SPARC
419 { 415 {
420 .ctl_name = KERN_SPARC_REBOOT, 416 .ctl_name = KERN_SPARC_REBOOT,
421 .procname = "reboot-cmd", 417 .procname = "reboot-cmd",
@@ -810,16 +806,6 @@ static struct ctl_table kern_table[] = {
810 .proc_handler = &proc_dointvec, 806 .proc_handler = &proc_dointvec,
811 }, 807 },
812#endif 808#endif
813#ifdef CONFIG_PROC_FS
814 {
815 .ctl_name = CTL_UNNUMBERED,
816 .procname = "maps_protect",
817 .data = &maps_protect,
818 .maxlen = sizeof(int),
819 .mode = 0644,
820 .proc_handler = &proc_dointvec,
821 },
822#endif
823 { 809 {
824 .ctl_name = CTL_UNNUMBERED, 810 .ctl_name = CTL_UNNUMBERED,
825 .procname = "poweroff_cmd", 811 .procname = "poweroff_cmd",
@@ -1261,6 +1247,7 @@ static struct ctl_table fs_table[] = {
1261 .extra1 = &minolduid, 1247 .extra1 = &minolduid,
1262 .extra2 = &maxolduid, 1248 .extra2 = &maxolduid,
1263 }, 1249 },
1250#ifdef CONFIG_FILE_LOCKING
1264 { 1251 {
1265 .ctl_name = FS_LEASES, 1252 .ctl_name = FS_LEASES,
1266 .procname = "leases-enable", 1253 .procname = "leases-enable",
@@ -1269,6 +1256,7 @@ static struct ctl_table fs_table[] = {
1269 .mode = 0644, 1256 .mode = 0644,
1270 .proc_handler = &proc_dointvec, 1257 .proc_handler = &proc_dointvec,
1271 }, 1258 },
1259#endif
1272#ifdef CONFIG_DNOTIFY 1260#ifdef CONFIG_DNOTIFY
1273 { 1261 {
1274 .ctl_name = FS_DIR_NOTIFY, 1262 .ctl_name = FS_DIR_NOTIFY,
@@ -1280,6 +1268,7 @@ static struct ctl_table fs_table[] = {
1280 }, 1268 },
1281#endif 1269#endif
1282#ifdef CONFIG_MMU 1270#ifdef CONFIG_MMU
1271#ifdef CONFIG_FILE_LOCKING
1283 { 1272 {
1284 .ctl_name = FS_LEASE_TIME, 1273 .ctl_name = FS_LEASE_TIME,
1285 .procname = "lease-break-time", 1274 .procname = "lease-break-time",
@@ -1291,6 +1280,8 @@ static struct ctl_table fs_table[] = {
1291 .extra1 = &zero, 1280 .extra1 = &zero,
1292 .extra2 = &two, 1281 .extra2 = &two,
1293 }, 1282 },
1283#endif
1284#ifdef CONFIG_AIO
1294 { 1285 {
1295 .procname = "aio-nr", 1286 .procname = "aio-nr",
1296 .data = &aio_nr, 1287 .data = &aio_nr,
@@ -1305,6 +1296,7 @@ static struct ctl_table fs_table[] = {
1305 .mode = 0644, 1296 .mode = 0644,
1306 .proc_handler = &proc_doulongvec_minmax, 1297 .proc_handler = &proc_doulongvec_minmax,
1307 }, 1298 },
1299#endif /* CONFIG_AIO */
1308#ifdef CONFIG_INOTIFY_USER 1300#ifdef CONFIG_INOTIFY_USER
1309 { 1301 {
1310 .ctl_name = FS_INOTIFY, 1302 .ctl_name = FS_INOTIFY,
@@ -1510,7 +1502,6 @@ void register_sysctl_root(struct ctl_table_root *root)
1510/* Perform the actual read/write of a sysctl table entry. */ 1502/* Perform the actual read/write of a sysctl table entry. */
1511static int do_sysctl_strategy(struct ctl_table_root *root, 1503static int do_sysctl_strategy(struct ctl_table_root *root,
1512 struct ctl_table *table, 1504 struct ctl_table *table,
1513 int __user *name, int nlen,
1514 void __user *oldval, size_t __user *oldlenp, 1505 void __user *oldval, size_t __user *oldlenp,
1515 void __user *newval, size_t newlen) 1506 void __user *newval, size_t newlen)
1516{ 1507{
@@ -1524,8 +1515,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
1524 return -EPERM; 1515 return -EPERM;
1525 1516
1526 if (table->strategy) { 1517 if (table->strategy) {
1527 rc = table->strategy(table, name, nlen, oldval, oldlenp, 1518 rc = table->strategy(table, oldval, oldlenp, newval, newlen);
1528 newval, newlen);
1529 if (rc < 0) 1519 if (rc < 0)
1530 return rc; 1520 return rc;
1531 if (rc > 0) 1521 if (rc > 0)
@@ -1535,8 +1525,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
1535 /* If there is no strategy routine, or if the strategy returns 1525 /* If there is no strategy routine, or if the strategy returns
1536 * zero, proceed with automatic r/w */ 1526 * zero, proceed with automatic r/w */
1537 if (table->data && table->maxlen) { 1527 if (table->data && table->maxlen) {
1538 rc = sysctl_data(table, name, nlen, oldval, oldlenp, 1528 rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
1539 newval, newlen);
1540 if (rc < 0) 1529 if (rc < 0)
1541 return rc; 1530 return rc;
1542 } 1531 }
@@ -1568,7 +1557,7 @@ repeat:
1568 table = table->child; 1557 table = table->child;
1569 goto repeat; 1558 goto repeat;
1570 } 1559 }
1571 error = do_sysctl_strategy(root, table, name, nlen, 1560 error = do_sysctl_strategy(root, table,
1572 oldval, oldlenp, 1561 oldval, oldlenp,
1573 newval, newlen); 1562 newval, newlen);
1574 return error; 1563 return error;
@@ -2237,49 +2226,39 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2237 NULL,NULL); 2226 NULL,NULL);
2238} 2227}
2239 2228
2240#define OP_SET 0
2241#define OP_AND 1
2242#define OP_OR 2
2243
2244static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
2245 int *valp,
2246 int write, void *data)
2247{
2248 int op = *(int *)data;
2249 if (write) {
2250 int val = *negp ? -*lvalp : *lvalp;
2251 switch(op) {
2252 case OP_SET: *valp = val; break;
2253 case OP_AND: *valp &= val; break;
2254 case OP_OR: *valp |= val; break;
2255 }
2256 } else {
2257 int val = *valp;
2258 if (val < 0) {
2259 *negp = -1;
2260 *lvalp = (unsigned long)-val;
2261 } else {
2262 *negp = 0;
2263 *lvalp = (unsigned long)val;
2264 }
2265 }
2266 return 0;
2267}
2268
2269/* 2229/*
2270 * Taint values can only be increased 2230 * Taint values can only be increased
2231 * This means we can safely use a temporary.
2271 */ 2232 */
2272static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, 2233static int proc_taint(struct ctl_table *table, int write, struct file *filp,
2273 void __user *buffer, size_t *lenp, loff_t *ppos) 2234 void __user *buffer, size_t *lenp, loff_t *ppos)
2274{ 2235{
2275 int op; 2236 struct ctl_table t;
2237 unsigned long tmptaint = get_taint();
2238 int err;
2276 2239
2277 if (write && !capable(CAP_SYS_ADMIN)) 2240 if (write && !capable(CAP_SYS_ADMIN))
2278 return -EPERM; 2241 return -EPERM;
2279 2242
2280 op = OP_OR; 2243 t = *table;
2281 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2244 t.data = &tmptaint;
2282 do_proc_dointvec_bset_conv,&op); 2245 err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos);
2246 if (err < 0)
2247 return err;
2248
2249 if (write) {
2250 /*
2251 * Poor man's atomic or. Not worth adding a primitive
2252 * to everyone's atomic.h for this
2253 */
2254 int i;
2255 for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
2256 if ((tmptaint >> i) & 1)
2257 add_taint(i);
2258 }
2259 }
2260
2261 return err;
2283} 2262}
2284 2263
2285struct do_proc_dointvec_minmax_conv_param { 2264struct do_proc_dointvec_minmax_conv_param {
@@ -2727,7 +2706,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2727 */ 2706 */
2728 2707
2729/* The generic sysctl data routine (used if no strategy routine supplied) */ 2708/* The generic sysctl data routine (used if no strategy routine supplied) */
2730int sysctl_data(struct ctl_table *table, int __user *name, int nlen, 2709int sysctl_data(struct ctl_table *table,
2731 void __user *oldval, size_t __user *oldlenp, 2710 void __user *oldval, size_t __user *oldlenp,
2732 void __user *newval, size_t newlen) 2711 void __user *newval, size_t newlen)
2733{ 2712{
@@ -2761,7 +2740,7 @@ int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
2761} 2740}
2762 2741
2763/* The generic string strategy routine: */ 2742/* The generic string strategy routine: */
2764int sysctl_string(struct ctl_table *table, int __user *name, int nlen, 2743int sysctl_string(struct ctl_table *table,
2765 void __user *oldval, size_t __user *oldlenp, 2744 void __user *oldval, size_t __user *oldlenp,
2766 void __user *newval, size_t newlen) 2745 void __user *newval, size_t newlen)
2767{ 2746{
@@ -2807,7 +2786,7 @@ int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
2807 * are between the minimum and maximum values given in the arrays 2786 * are between the minimum and maximum values given in the arrays
2808 * table->extra1 and table->extra2, respectively. 2787 * table->extra1 and table->extra2, respectively.
2809 */ 2788 */
2810int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, 2789int sysctl_intvec(struct ctl_table *table,
2811 void __user *oldval, size_t __user *oldlenp, 2790 void __user *oldval, size_t __user *oldlenp,
2812 void __user *newval, size_t newlen) 2791 void __user *newval, size_t newlen)
2813{ 2792{
@@ -2843,7 +2822,7 @@ int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
2843} 2822}
2844 2823
2845/* Strategy function to convert jiffies to seconds */ 2824/* Strategy function to convert jiffies to seconds */
2846int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, 2825int sysctl_jiffies(struct ctl_table *table,
2847 void __user *oldval, size_t __user *oldlenp, 2826 void __user *oldval, size_t __user *oldlenp,
2848 void __user *newval, size_t newlen) 2827 void __user *newval, size_t newlen)
2849{ 2828{
@@ -2877,7 +2856,7 @@ int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
2877} 2856}
2878 2857
2879/* Strategy function to convert jiffies to seconds */ 2858/* Strategy function to convert jiffies to seconds */
2880int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, 2859int sysctl_ms_jiffies(struct ctl_table *table,
2881 void __user *oldval, size_t __user *oldlenp, 2860 void __user *oldval, size_t __user *oldlenp,
2882 void __user *newval, size_t newlen) 2861 void __user *newval, size_t newlen)
2883{ 2862{
@@ -2932,35 +2911,35 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
2932 return error; 2911 return error;
2933} 2912}
2934 2913
2935int sysctl_data(struct ctl_table *table, int __user *name, int nlen, 2914int sysctl_data(struct ctl_table *table,
2936 void __user *oldval, size_t __user *oldlenp, 2915 void __user *oldval, size_t __user *oldlenp,
2937 void __user *newval, size_t newlen) 2916 void __user *newval, size_t newlen)
2938{ 2917{
2939 return -ENOSYS; 2918 return -ENOSYS;
2940} 2919}
2941 2920
2942int sysctl_string(struct ctl_table *table, int __user *name, int nlen, 2921int sysctl_string(struct ctl_table *table,
2943 void __user *oldval, size_t __user *oldlenp, 2922 void __user *oldval, size_t __user *oldlenp,
2944 void __user *newval, size_t newlen) 2923 void __user *newval, size_t newlen)
2945{ 2924{
2946 return -ENOSYS; 2925 return -ENOSYS;
2947} 2926}
2948 2927
2949int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, 2928int sysctl_intvec(struct ctl_table *table,
2950 void __user *oldval, size_t __user *oldlenp, 2929 void __user *oldval, size_t __user *oldlenp,
2951 void __user *newval, size_t newlen) 2930 void __user *newval, size_t newlen)
2952{ 2931{
2953 return -ENOSYS; 2932 return -ENOSYS;
2954} 2933}
2955 2934
2956int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, 2935int sysctl_jiffies(struct ctl_table *table,
2957 void __user *oldval, size_t __user *oldlenp, 2936 void __user *oldval, size_t __user *oldlenp,
2958 void __user *newval, size_t newlen) 2937 void __user *newval, size_t newlen)
2959{ 2938{
2960 return -ENOSYS; 2939 return -ENOSYS;
2961} 2940}
2962 2941
2963int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, 2942int sysctl_ms_jiffies(struct ctl_table *table,
2964 void __user *oldval, size_t __user *oldlenp, 2943 void __user *oldval, size_t __user *oldlenp,
2965 void __user *newval, size_t newlen) 2944 void __user *newval, size_t newlen)
2966{ 2945{
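
The new proc_taint() above copies the taint flags into an unsigned long temporary, lets proc_doulongvec_minmax() merge the user's write into it, and then feeds every set bit back through add_taint() - the "poor man's atomic or" from the comment. A user-space sketch of just that bit-walk, with a plain variable standing in for the kernel's taint state (add_taint() here is a hypothetical stand-in):

#include <stdio.h>

#define BITS_PER_LONG	(8 * sizeof(unsigned long))

static unsigned long tainted;	/* stand-in for the kernel's taint flags */

static void add_taint(unsigned int bit)
{
	tainted |= 1UL << bit;	/* the real add_taint() also does bookkeeping */
}

/* Merge a value written by the user into the taint flags, bit by bit. */
static void merge_taint(unsigned long tmptaint)
{
	unsigned int i;

	for (i = 0; i < BITS_PER_LONG && (tmptaint >> i); i++) {
		if ((tmptaint >> i) & 1)
			add_taint(i);
	}
}

int main(void)
{
	merge_taint(0x5);	/* user wrote 0x5: sets bits 0 and 2 */
	printf("tainted = %#lx\n", tainted);
	return 0;
}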
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8d53106a0a92..95ed42951e0a 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -3,7 +3,6 @@
3# 3#
4config TICK_ONESHOT 4config TICK_ONESHOT
5 bool 5 bool
6 default n
7 6
8config NO_HZ 7config NO_HZ
9 bool "Tickless System (Dynamic Ticks)" 8 bool "Tickless System (Dynamic Ticks)"
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 3d1e3e1a1971..f8d968063cea 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,6 +72,16 @@ void clockevents_set_mode(struct clock_event_device *dev,
72} 72}
73 73
74/** 74/**
75 * clockevents_shutdown - shutdown the device and clear next_event
76 * @dev: device to shutdown
77 */
78void clockevents_shutdown(struct clock_event_device *dev)
79{
80 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
81 dev->next_event.tv64 = KTIME_MAX;
82}
83
84/**
75 * clockevents_program_event - Reprogram the clock event device. 85 * clockevents_program_event - Reprogram the clock event device.
76 * @expires: absolute expiry time (monotonic clock) 86 * @expires: absolute expiry time (monotonic clock)
77 * 87 *
@@ -177,7 +187,7 @@ void clockevents_register_device(struct clock_event_device *dev)
177/* 187/*
178 * Noop handler when we shut down an event device 188 * Noop handler when we shut down an event device
179 */ 189 */
180static void clockevents_handle_noop(struct clock_event_device *dev) 190void clockevents_handle_noop(struct clock_event_device *dev)
181{ 191{
182} 192}
183 193
@@ -199,7 +209,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
199 * released list and do a notify add later. 209 * released list and do a notify add later.
200 */ 210 */
201 if (old) { 211 if (old) {
202 old->event_handler = clockevents_handle_noop;
203 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); 212 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
204 list_del(&old->list); 213 list_del(&old->list);
205 list_add(&old->list, &clockevents_released); 214 list_add(&old->list, &clockevents_released);
@@ -207,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
207 216
208 if (new) { 217 if (new) {
209 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); 218 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
210 clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); 219 clockevents_shutdown(new);
211 } 220 }
212 local_irq_restore(flags); 221 local_irq_restore(flags);
213} 222}
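
clockevents_shutdown() above folds two steps that were previously open-coded at every call site - switch the device to SHUTDOWN mode and invalidate next_event - into one helper. A sketch of the same consolidation with a hypothetical miniature device structure:

#include <stdio.h>
#include <limits.h>

/* Hypothetical miniature clock_event_device. */
struct clock_event_device {
	int mode;		/* 0 = shutdown, 1 = oneshot, ... */
	long long next_event;	/* ns; LLONG_MAX means "none pending" */
	const char *name;
};

static void clockevents_set_mode(struct clock_event_device *dev, int mode)
{
	dev->mode = mode;
}

/* The consolidated helper: shut the device down and clear next_event. */
static void clockevents_shutdown(struct clock_event_device *dev)
{
	clockevents_set_mode(dev, 0);
	dev->next_event = LLONG_MAX;
}

int main(void)
{
	struct clock_event_device dev = { 1, 123456789, "lapic" };

	clockevents_shutdown(&dev);
	printf("%s: mode=%d next_event=%lld\n", dev.name, dev.mode, dev.next_event);
	return 0;
}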
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4c8d85421d24..9c114b726ab3 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -244,7 +244,7 @@ static void sync_cmos_clock(unsigned long dummy)
244 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) 244 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
245 fail = update_persistent_clock(now); 245 fail = update_persistent_clock(now);
246 246
247 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; 247 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
248 if (next.tv_nsec <= 0) 248 if (next.tv_nsec <= 0)
249 next.tv_nsec += NSEC_PER_SEC; 249 next.tv_nsec += NSEC_PER_SEC;
250 250
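
The ntp.c change above shifts the next sync_cmos_clock() expiry half a tick earlier, so that a timer which fires on or after its expiry still lands near the half-second mark where the CMOS update is allowed. A quick arithmetic check of that offset, under an assumed HZ=100 (10 ms tick) and an assumed position within the current second:

#include <stdio.h>

#define NSEC_PER_SEC	1000000000LL
#define HZ		100
#define TICK_NSEC	(NSEC_PER_SEC / HZ)

int main(void)
{
	long long now_nsec = 123456789;		/* assumed position in the second */

	/* Old formula: aim exactly at the half-second mark. */
	long long old_delay = (NSEC_PER_SEC / 2) - now_nsec;

	/* New formula: aim half a tick early to compensate for timer
	 * granularity, wrapping into the next second if needed. */
	long long new_delay = (NSEC_PER_SEC / 2) - now_nsec - (TICK_NSEC / 2);

	if (new_delay <= 0)
		new_delay += NSEC_PER_SEC;

	printf("old target: +%lld ns, new target: +%lld ns (%lld ns earlier)\n",
	       old_delay, new_delay, old_delay - new_delay);
	return 0;
}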
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 31463d370b94..cb01cd8f919b 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -175,6 +175,8 @@ static void tick_do_periodic_broadcast(void)
175 */ 175 */
176static void tick_handle_periodic_broadcast(struct clock_event_device *dev) 176static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
177{ 177{
178 ktime_t next;
179
178 tick_do_periodic_broadcast(); 180 tick_do_periodic_broadcast();
179 181
180 /* 182 /*
@@ -185,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
185 187
186 /* 188 /*
187 * Setup the next period for devices, which do not have 189 * Setup the next period for devices, which do not have
 188 * periodic mode: 190 * when the event already expired. clockevents_program_event()
191 * when the event alrady expired. clockevents_program_event()
192 * sets dev->next_event only when the event is really
193 * programmed to the device.
189 */ 194 */
190 for (;;) { 195 for (next = dev->next_event; ;) {
191 ktime_t next = ktime_add(dev->next_event, tick_period); 196 next = ktime_add(next, tick_period);
192 197
193 if (!clockevents_program_event(dev, next, ktime_get())) 198 if (!clockevents_program_event(dev, next, ktime_get()))
194 return; 199 return;
@@ -205,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why)
205 struct clock_event_device *bc, *dev; 210 struct clock_event_device *bc, *dev;
206 struct tick_device *td; 211 struct tick_device *td;
207 unsigned long flags, *reason = why; 212 unsigned long flags, *reason = why;
208 int cpu; 213 int cpu, bc_stopped;
209 214
210 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 spin_lock_irqsave(&tick_broadcast_lock, flags);
211 216
@@ -223,14 +228,16 @@ static void tick_do_broadcast_on_off(void *why)
223 if (!tick_device_is_functional(dev)) 228 if (!tick_device_is_functional(dev))
224 goto out; 229 goto out;
225 230
231 bc_stopped = cpus_empty(tick_broadcast_mask);
232
226 switch (*reason) { 233 switch (*reason) {
227 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 234 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
228 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
229 if (!cpu_isset(cpu, tick_broadcast_mask)) { 236 if (!cpu_isset(cpu, tick_broadcast_mask)) {
230 cpu_set(cpu, tick_broadcast_mask); 237 cpu_set(cpu, tick_broadcast_mask);
231 if (td->mode == TICKDEV_MODE_PERIODIC) 238 if (tick_broadcast_device.mode ==
232 clockevents_set_mode(dev, 239 TICKDEV_MODE_PERIODIC)
233 CLOCK_EVT_MODE_SHUTDOWN); 240 clockevents_shutdown(dev);
234 } 241 }
235 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) 242 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
236 tick_broadcast_force = 1; 243 tick_broadcast_force = 1;
@@ -239,15 +246,17 @@ static void tick_do_broadcast_on_off(void *why)
239 if (!tick_broadcast_force && 246 if (!tick_broadcast_force &&
240 cpu_isset(cpu, tick_broadcast_mask)) { 247 cpu_isset(cpu, tick_broadcast_mask)) {
241 cpu_clear(cpu, tick_broadcast_mask); 248 cpu_clear(cpu, tick_broadcast_mask);
242 if (td->mode == TICKDEV_MODE_PERIODIC) 249 if (tick_broadcast_device.mode ==
250 TICKDEV_MODE_PERIODIC)
243 tick_setup_periodic(dev, 0); 251 tick_setup_periodic(dev, 0);
244 } 252 }
245 break; 253 break;
246 } 254 }
247 255
248 if (cpus_empty(tick_broadcast_mask)) 256 if (cpus_empty(tick_broadcast_mask)) {
249 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 257 if (!bc_stopped)
250 else { 258 clockevents_shutdown(bc);
259 } else if (bc_stopped) {
251 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 260 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
252 tick_broadcast_start_periodic(bc); 261 tick_broadcast_start_periodic(bc);
253 else 262 else
@@ -298,7 +307,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
298 307
299 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
300 if (bc && cpus_empty(tick_broadcast_mask)) 309 if (bc && cpus_empty(tick_broadcast_mask))
301 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 310 clockevents_shutdown(bc);
302 } 311 }
303 312
304 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 313 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -313,7 +322,7 @@ void tick_suspend_broadcast(void)
313 322
314 bc = tick_broadcast_device.evtdev; 323 bc = tick_broadcast_device.evtdev;
315 if (bc) 324 if (bc)
316 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 325 clockevents_shutdown(bc);
317 326
318 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 327 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
319} 328}
@@ -364,16 +373,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void)
364static int tick_broadcast_set_event(ktime_t expires, int force) 373static int tick_broadcast_set_event(ktime_t expires, int force)
365{ 374{
366 struct clock_event_device *bc = tick_broadcast_device.evtdev; 375 struct clock_event_device *bc = tick_broadcast_device.evtdev;
367 ktime_t now = ktime_get(); 376
368 int res; 377 return tick_dev_program_event(bc, expires, force);
369
370 for(;;) {
371 res = clockevents_program_event(bc, expires, now);
372 if (!res || !force)
373 return res;
374 now = ktime_get();
375 expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
376 }
377} 378}
378 379
379int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 380int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -491,14 +492,52 @@ static void tick_broadcast_clear_oneshot(int cpu)
491 cpu_clear(cpu, tick_broadcast_oneshot_mask); 492 cpu_clear(cpu, tick_broadcast_oneshot_mask);
492} 493}
493 494
495static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires)
496{
497 struct tick_device *td;
498 int cpu;
499
500 for_each_cpu_mask_nr(cpu, *mask) {
501 td = &per_cpu(tick_cpu_device, cpu);
502 if (td->evtdev)
503 td->evtdev->next_event = expires;
504 }
505}
506
494/** 507/**
495 * tick_broadcast_setup_oneshot - setup the broadcast device 508 * tick_broadcast_setup_oneshot - setup the broadcast device
496 */ 509 */
497void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 510void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
498{ 511{
499 bc->event_handler = tick_handle_oneshot_broadcast; 512 /* Set it up only once ! */
500 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 513 if (bc->event_handler != tick_handle_oneshot_broadcast) {
501 bc->next_event.tv64 = KTIME_MAX; 514 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
515 int cpu = smp_processor_id();
516 cpumask_t mask;
517
518 bc->event_handler = tick_handle_oneshot_broadcast;
519 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
520
521 /* Take the do_timer update */
522 tick_do_timer_cpu = cpu;
523
524 /*
525 * We must be careful here. There might be other CPUs
526 * waiting for periodic broadcast. We need to set the
527 * oneshot_mask bits for those and program the
528 * broadcast device to fire.
529 */
530 mask = tick_broadcast_mask;
531 cpu_clear(cpu, mask);
532 cpus_or(tick_broadcast_oneshot_mask,
533 tick_broadcast_oneshot_mask, mask);
534
535 if (was_periodic && !cpus_empty(mask)) {
536 tick_broadcast_init_next_event(&mask, tick_next_period);
537 tick_broadcast_set_event(tick_next_period, 1);
538 } else
539 bc->next_event.tv64 = KTIME_MAX;
540 }
502} 541}
503 542
504/* 543/*
@@ -538,4 +577,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
538 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 577 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
539} 578}
540 579
580/*
581 * Check, whether the broadcast device is in one shot mode
582 */
583int tick_broadcast_oneshot_active(void)
584{
585 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
586}
587
541#endif 588#endif
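
tick_broadcast_setup_oneshot() above now initializes the broadcast device only once (guarded by the event_handler check) and then moves every CPU that was still waiting for periodic broadcast into the oneshot mask, minus the CPU doing the setup. A user-space sketch of just that mask manipulation, with plain bitmasks standing in for cpumask_t and assumed initial values:

#include <stdio.h>

typedef unsigned long cpumask_t;	/* one bit per CPU, enough for a sketch */

int main(void)
{
	cpumask_t tick_broadcast_mask = 0x0f;		/* CPUs 0-3 use broadcast */
	cpumask_t tick_broadcast_oneshot_mask = 0x00;
	int cpu = 0;					/* CPU doing the setup */
	cpumask_t mask;

	/* Everyone currently relying on periodic broadcast, except ourselves,
	 * must be switched over and have the broadcast device programmed. */
	mask = tick_broadcast_mask;
	mask &= ~(1UL << cpu);				/* cpu_clear(cpu, mask) */
	tick_broadcast_oneshot_mask |= mask;		/* cpus_or(...) */

	printf("oneshot mask after setup: %#lx\n", tick_broadcast_oneshot_mask);
	return 0;
}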
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 80c4336f4188..df12434b43ca 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
33 */ 33 */
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36int tick_do_timer_cpu __read_mostly = -1; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37DEFINE_SPINLOCK(tick_device_lock); 37DEFINE_SPINLOCK(tick_device_lock);
38 38
39/* 39/*
@@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
109 if (!tick_device_is_functional(dev)) 109 if (!tick_device_is_functional(dev))
110 return; 110 return;
111 111
112 if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { 112 if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
113 !tick_broadcast_oneshot_active()) {
113 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); 114 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
114 } else { 115 } else {
115 unsigned long seq; 116 unsigned long seq;
@@ -148,7 +149,7 @@ static void tick_setup_device(struct tick_device *td,
148 * If no cpu took the do_timer update, assign it to 149 * If no cpu took the do_timer update, assign it to
149 * this cpu: 150 * this cpu:
150 */ 151 */
151 if (tick_do_timer_cpu == -1) { 152 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
152 tick_do_timer_cpu = cpu; 153 tick_do_timer_cpu = cpu;
153 tick_next_period = ktime_get(); 154 tick_next_period = ktime_get();
154 tick_period = ktime_set(0, NSEC_PER_SEC / HZ); 155 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
@@ -161,6 +162,7 @@ static void tick_setup_device(struct tick_device *td,
161 } else { 162 } else {
162 handler = td->evtdev->event_handler; 163 handler = td->evtdev->event_handler;
163 next_event = td->evtdev->next_event; 164 next_event = td->evtdev->next_event;
165 td->evtdev->event_handler = clockevents_handle_noop;
164 } 166 }
165 167
166 td->evtdev = newdev; 168 td->evtdev = newdev;
@@ -248,7 +250,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
248 * not give it back to the clockevents layer ! 250 * not give it back to the clockevents layer !
249 */ 251 */
250 if (tick_is_broadcast_device(curdev)) { 252 if (tick_is_broadcast_device(curdev)) {
251 clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); 253 clockevents_shutdown(curdev);
252 curdev = NULL; 254 curdev = NULL;
253 } 255 }
254 clockevents_exchange_device(curdev, newdev); 256 clockevents_exchange_device(curdev, newdev);
@@ -299,7 +301,8 @@ static void tick_shutdown(unsigned int *cpup)
299 if (*cpup == tick_do_timer_cpu) { 301 if (*cpup == tick_do_timer_cpu) {
300 int cpu = first_cpu(cpu_online_map); 302 int cpu = first_cpu(cpu_online_map);
301 303
302 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; 304 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
305 TICK_DO_TIMER_NONE;
303 } 306 }
304 spin_unlock_irqrestore(&tick_device_lock, flags); 307 spin_unlock_irqrestore(&tick_device_lock, flags);
305} 308}
@@ -310,7 +313,7 @@ static void tick_suspend(void)
310 unsigned long flags; 313 unsigned long flags;
311 314
312 spin_lock_irqsave(&tick_device_lock, flags); 315 spin_lock_irqsave(&tick_device_lock, flags);
313 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN); 316 clockevents_shutdown(td->evtdev);
314 spin_unlock_irqrestore(&tick_device_lock, flags); 317 spin_unlock_irqrestore(&tick_device_lock, flags);
315} 318}
316 319
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f13f2b7f4fd4..469248782c23 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4
5#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2
7
4DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 8DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
5extern spinlock_t tick_device_lock; 9extern spinlock_t tick_device_lock;
6extern ktime_t tick_next_period; 10extern ktime_t tick_next_period;
@@ -10,6 +14,8 @@ extern int tick_do_timer_cpu __read_mostly;
10extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); 14extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
11extern void tick_handle_periodic(struct clock_event_device *dev); 15extern void tick_handle_periodic(struct clock_event_device *dev);
12 16
17extern void clockevents_shutdown(struct clock_event_device *dev);
18
13/* 19/*
14 * NO_HZ / high resolution timer shared code 20 * NO_HZ / high resolution timer shared code
15 */ 21 */
@@ -17,6 +23,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev);
17extern void tick_setup_oneshot(struct clock_event_device *newdev, 23extern void tick_setup_oneshot(struct clock_event_device *newdev,
18 void (*handler)(struct clock_event_device *), 24 void (*handler)(struct clock_event_device *),
19 ktime_t nextevt); 25 ktime_t nextevt);
26extern int tick_dev_program_event(struct clock_event_device *dev,
27 ktime_t expires, int force);
20extern int tick_program_event(ktime_t expires, int force); 28extern int tick_program_event(ktime_t expires, int force);
21extern void tick_oneshot_notify(void); 29extern void tick_oneshot_notify(void);
22extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); 30extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
@@ -27,6 +35,7 @@ extern void tick_broadcast_oneshot_control(unsigned long reason);
27extern void tick_broadcast_switch_to_oneshot(void); 35extern void tick_broadcast_switch_to_oneshot(void);
28extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 36extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
29extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 37extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
38extern int tick_broadcast_oneshot_active(void);
30# else /* BROADCAST */ 39# else /* BROADCAST */
31static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
32{ 41{
@@ -35,6 +44,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
35static inline void tick_broadcast_oneshot_control(unsigned long reason) { } 44static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
36static inline void tick_broadcast_switch_to_oneshot(void) { } 45static inline void tick_broadcast_switch_to_oneshot(void) { }
37static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; }
38# endif /* !BROADCAST */ 48# endif /* !BROADCAST */
39 49
40#else /* !ONESHOT */ 50#else /* !ONESHOT */
@@ -64,6 +74,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
64{ 74{
65 return 0; 75 return 0;
66} 76}
77static inline int tick_broadcast_oneshot_active(void) { return 0; }
67#endif /* !TICK_ONESHOT */ 78#endif /* !TICK_ONESHOT */
68 79
69/* 80/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 450c04935b66..2e8de678e767 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -23,24 +23,56 @@
23#include "tick-internal.h" 23#include "tick-internal.h"
24 24
25/** 25/**
26 * tick_program_event 26 * tick_program_event internal worker function
27 */ 27 */
28int tick_program_event(ktime_t expires, int force) 28int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
29 int force)
29{ 30{
30 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
31 ktime_t now = ktime_get(); 31 ktime_t now = ktime_get();
32 int i;
32 33
33 while (1) { 34 for (i = 0;;) {
34 int ret = clockevents_program_event(dev, expires, now); 35 int ret = clockevents_program_event(dev, expires, now);
35 36
36 if (!ret || !force) 37 if (!ret || !force)
37 return ret; 38 return ret;
39
40 /*
41 * We tried 2 times to program the device with the given
42 * min_delta_ns. If that's not working then we double it
43 * and emit a warning.
44 */
45 if (++i > 2) {
46 /* Increase the min. delta and try again */
47 if (!dev->min_delta_ns)
48 dev->min_delta_ns = 5000;
49 else
50 dev->min_delta_ns += dev->min_delta_ns >> 1;
51
52 printk(KERN_WARNING
53 "CE: %s increasing min_delta_ns to %lu nsec\n",
54 dev->name ? dev->name : "?",
55 dev->min_delta_ns << 1);
56
57 i = 0;
58 }
59
38 now = ktime_get(); 60 now = ktime_get();
39 expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); 61 expires = ktime_add_ns(now, dev->min_delta_ns);
40 } 62 }
41} 63}
42 64
43/** 65/**
66 * tick_program_event
67 */
68int tick_program_event(ktime_t expires, int force)
69{
70 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
71
72 return tick_dev_program_event(dev, expires, force);
73}
74
75/**
44 * tick_resume_onshot - resume oneshot mode 76 * tick_resume_onshot - resume oneshot mode
45 */ 77 */
46void tick_resume_oneshot(void) 78void tick_resume_oneshot(void)
@@ -61,7 +93,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
61{ 93{
62 newdev->event_handler = handler; 94 newdev->event_handler = handler;
63 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); 95 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
64 clockevents_program_event(newdev, next_event, ktime_get()); 96 tick_dev_program_event(newdev, next_event, 1);
65} 97}
66 98
67/** 99/**
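
tick_dev_program_event() above retries a failed programming attempt and, after two failures in a row with the current min_delta_ns, raises it by 50% and warns before trying again. A standalone sketch of that back-off loop, with a fake program_event() that rejects deltas below a hypothetical hardware limit (the values and helper names are assumptions, not the kernel's):

#include <stdio.h>

static long long min_delta_ns = 1000;		/* starts too small on purpose */
static const long long hw_limit_ns = 5000;	/* assumed hardware minimum */

/* Pretend to program the event 'delta' ns ahead; fail if it is too close. */
static int program_event(long long delta)
{
	return delta < hw_limit_ns ? -1 : 0;
}

static int program_with_backoff(void)
{
	long long expires = 2000;	/* first request: 2 us ahead */
	int i;

	for (i = 0;;) {
		if (!program_event(expires))
			return 0;

		/* Two failed attempts with the current min_delta_ns: raise it. */
		if (++i > 2) {
			min_delta_ns += min_delta_ns >> 1;	/* increase by 50% */
			printf("increasing min_delta_ns to %lld nsec\n",
			       min_delta_ns);
			i = 0;
		}

		expires = min_delta_ns;	/* retry with the (possibly larger) minimum */
	}
}

int main(void)
{
	return program_with_backoff();
}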
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b33be61c0f6b..a547be11cf97 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/tick.h> 22#include <linux/tick.h>
23#include <linux/module.h>
23 24
24#include <asm/irq_regs.h> 25#include <asm/irq_regs.h>
25 26
@@ -75,6 +76,9 @@ static void tick_do_update_jiffies64(ktime_t now)
75 incr * ticks); 76 incr * ticks);
76 } 77 }
77 do_timer(++ticks); 78 do_timer(++ticks);
79
80 /* Keep the tick_next_period variable up to date */
81 tick_next_period = ktime_add(last_jiffies_update, tick_period);
78 } 82 }
79 write_sequnlock(&xtime_lock); 83 write_sequnlock(&xtime_lock);
80} 84}
@@ -187,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
187{ 191{
188 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 192 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
189 193
190 *last_update_time = ktime_to_us(ts->idle_lastupdate); 194 if (!tick_nohz_enabled)
195 return -1;
196
197 if (ts->idle_active)
198 *last_update_time = ktime_to_us(ts->idle_lastupdate);
199 else
200 *last_update_time = ktime_to_us(ktime_get());
201
191 return ktime_to_us(ts->idle_sleeptime); 202 return ktime_to_us(ts->idle_sleeptime);
192} 203}
204EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
193 205
194/** 206/**
195 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 207 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
@@ -221,7 +233,7 @@ void tick_nohz_stop_sched_tick(int inidle)
221 */ 233 */
222 if (unlikely(!cpu_online(cpu))) { 234 if (unlikely(!cpu_online(cpu))) {
223 if (cpu == tick_do_timer_cpu) 235 if (cpu == tick_do_timer_cpu)
224 tick_do_timer_cpu = -1; 236 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
225 } 237 }
226 238
227 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 239 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
@@ -258,7 +270,7 @@ void tick_nohz_stop_sched_tick(int inidle)
258 next_jiffies = get_next_timer_interrupt(last_jiffies); 270 next_jiffies = get_next_timer_interrupt(last_jiffies);
259 delta_jiffies = next_jiffies - last_jiffies; 271 delta_jiffies = next_jiffies - last_jiffies;
260 272
261 if (rcu_needs_cpu(cpu)) 273 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
262 delta_jiffies = 1; 274 delta_jiffies = 1;
263 /* 275 /*
264 * Do not stop the tick, if we are only one off 276 * Do not stop the tick, if we are only one off
@@ -303,7 +315,7 @@ void tick_nohz_stop_sched_tick(int inidle)
303 * invoked. 315 * invoked.
304 */ 316 */
305 if (cpu == tick_do_timer_cpu) 317 if (cpu == tick_do_timer_cpu)
306 tick_do_timer_cpu = -1; 318 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
307 319
308 ts->idle_sleeps++; 320 ts->idle_sleeps++;
309 321
@@ -468,7 +480,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
468 * this duty, then the jiffies update is still serialized by 480 * this duty, then the jiffies update is still serialized by
469 * xtime_lock. 481 * xtime_lock.
470 */ 482 */
471 if (unlikely(tick_do_timer_cpu == -1)) 483 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
472 tick_do_timer_cpu = cpu; 484 tick_do_timer_cpu = cpu;
473 485
474 /* Check, if the jiffies need an update */ 486 /* Check, if the jiffies need an update */
@@ -570,7 +582,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
570 * this duty, then the jiffies update is still serialized by 582 * this duty, then the jiffies update is still serialized by
571 * xtime_lock. 583 * xtime_lock.
572 */ 584 */
573 if (unlikely(tick_do_timer_cpu == -1)) 585 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
574 tick_do_timer_cpu = cpu; 586 tick_do_timer_cpu = cpu;
575#endif 587#endif
576 588
@@ -622,7 +634,7 @@ void tick_setup_sched_timer(void)
622 */ 634 */
623 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 635 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
624 ts->sched_timer.function = tick_sched_timer; 636 ts->sched_timer.function = tick_sched_timer;
625 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 637 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
626 638
627 /* Get the next period (per cpu) */ 639 /* Get the next period (per cpu) */
628 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 640 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
diff --git a/kernel/timer.c b/kernel/timer.c
index 03bc7f1f1593..510fe69351ca 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -978,6 +978,7 @@ void update_process_times(int user_tick)
978 run_local_timers(); 978 run_local_timers();
979 if (rcu_pending(cpu)) 979 if (rcu_pending(cpu))
980 rcu_check_callbacks(cpu, user_tick); 980 rcu_check_callbacks(cpu, user_tick);
981 printk_tick();
981 scheduler_tick(); 982 scheduler_tick();
982 run_posix_cpu_timers(p); 983 run_posix_cpu_timers(p);
983} 984}
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index bb948e52ce20..db58fb66a135 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,7 @@ static void start_stack_timer(int cpu)
202 202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
206 206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
208} 208}
diff --git a/kernel/user.c b/kernel/user.c
index 865ecf57a096..39d6159fae43 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169{ 169{
170 struct user_struct *up = container_of(kobj, struct user_struct, kobj); 170 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
171 171
172 return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); 172 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
173} 173}
174 174
175static ssize_t cpu_rt_runtime_store(struct kobject *kobj, 175static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
@@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
180 unsigned long rt_runtime; 180 unsigned long rt_runtime;
181 int rc; 181 int rc;
182 182
183 sscanf(buf, "%lu", &rt_runtime); 183 sscanf(buf, "%ld", &rt_runtime);
184 184
185 rc = sched_group_set_rt_runtime(up->tg, rt_runtime); 185 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
186 186
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 4ab9659d269e..3b34b3545936 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -60,7 +60,7 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
60 60
61#ifdef CONFIG_SYSCTL_SYSCALL 61#ifdef CONFIG_SYSCTL_SYSCALL
62/* The generic string strategy routine: */ 62/* The generic string strategy routine: */
63static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, 63static int sysctl_uts_string(ctl_table *table,
64 void __user *oldval, size_t __user *oldlenp, 64 void __user *oldval, size_t __user *oldlenp,
65 void __user *newval, size_t newlen) 65 void __user *newval, size_t newlen)
66{ 66{
@@ -69,8 +69,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
69 write = newval && newlen; 69 write = newval && newlen;
70 memcpy(&uts_table, table, sizeof(uts_table)); 70 memcpy(&uts_table, table, sizeof(uts_table));
71 uts_table.data = get_uts(table, write); 71 uts_table.data = get_uts(table, write);
72 r = sysctl_string(&uts_table, name, nlen, 72 r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
73 oldval, oldlenp, newval, newlen);
74 put_uts(table, write, uts_table.data); 73 put_uts(table, write, uts_table.data);
75 return r; 74 return r;
76} 75}
diff --git a/kernel/wait.c b/kernel/wait.c
index c275c56cf2d3..cd87131f2fc2 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -72,12 +72,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
72 spin_lock_irqsave(&q->lock, flags); 72 spin_lock_irqsave(&q->lock, flags);
73 if (list_empty(&wait->task_list)) 73 if (list_empty(&wait->task_list))
74 __add_wait_queue(q, wait); 74 __add_wait_queue(q, wait);
75 /* 75 set_current_state(state);
76 * don't alter the task state if this is just going to
77 * queue an async wait queue callback
78 */
79 if (is_sync_wait(wait))
80 set_current_state(state);
81 spin_unlock_irqrestore(&q->lock, flags); 76 spin_unlock_irqrestore(&q->lock, flags);
82} 77}
83EXPORT_SYMBOL(prepare_to_wait); 78EXPORT_SYMBOL(prepare_to_wait);
@@ -91,12 +86,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
91 spin_lock_irqsave(&q->lock, flags); 86 spin_lock_irqsave(&q->lock, flags);
92 if (list_empty(&wait->task_list)) 87 if (list_empty(&wait->task_list))
93 __add_wait_queue_tail(q, wait); 88 __add_wait_queue_tail(q, wait);
94 /* 89 set_current_state(state);
95 * don't alter the task state if this is just going to
96 * queue an async wait queue callback
97 */
98 if (is_sync_wait(wait))
99 set_current_state(state);
100 spin_unlock_irqrestore(&q->lock, flags); 90 spin_unlock_irqrestore(&q->lock, flags);
101} 91}
102EXPORT_SYMBOL(prepare_to_wait_exclusive); 92EXPORT_SYMBOL(prepare_to_wait_exclusive);
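
The wait.c hunk above drops the is_sync_wait() special case, so prepare_to_wait() now always queues the entry and sets the task state under the queue lock. A tiny user-space sketch of that simplified sequence, with a pthread mutex standing in for q->lock and plain fields for the wait queue entry; it only mirrors the ordering, not the scheduler interaction:

#include <pthread.h>
#include <stdio.h>

#define TASK_RUNNING		0
#define TASK_UNINTERRUPTIBLE	2

struct wait_entry { int queued; };

static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static int current_state = TASK_RUNNING;

/* After the change: queue the entry and always set the state, under the lock. */
static void prepare_to_wait(struct wait_entry *wait, int state)
{
	pthread_mutex_lock(&q_lock);
	if (!wait->queued)
		wait->queued = 1;	/* __add_wait_queue() */
	current_state = state;		/* unconditional set_current_state() */
	pthread_mutex_unlock(&q_lock);
}

int main(void)
{
	struct wait_entry w = { 0 };

	prepare_to_wait(&w, TASK_UNINTERRUPTIBLE);
	printf("queued=%d state=%d\n", w.queued, current_state);
	return 0;
}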
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4048e92aa04f..714afad46539 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -9,7 +9,7 @@
9 * Derived from the taskqueue/keventd code by: 9 * Derived from the taskqueue/keventd code by:
10 * 10 *
11 * David Woodhouse <dwmw2@infradead.org> 11 * David Woodhouse <dwmw2@infradead.org>
12 * Andrew Morton <andrewm@uow.edu.au> 12 * Andrew Morton
13 * Kai Petzke <wpp@marie.physik.tu-berlin.de> 13 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
14 * Theodore Ts'o <tytso@mit.edu> 14 * Theodore Ts'o <tytso@mit.edu>
15 * 15 *