Diffstat (limited to 'kernel')
-rw-r--r--   kernel/cgroup.c                  5
-rw-r--r--   kernel/cpu.c                     5
-rw-r--r--   kernel/cpuset.c                351
-rw-r--r--   kernel/exit.c                   12
-rw-r--r--   kernel/hrtimer.c                95
-rw-r--r--   kernel/kexec.c                   8
-rw-r--r--   kernel/kgdb.c                   13
-rw-r--r--   kernel/posix-timers.c            2
-rw-r--r--   kernel/sched.c                 394
-rw-r--r--   kernel/sched_fair.c            222
-rw-r--r--   kernel/sched_features.h          1
-rw-r--r--   kernel/sched_idletask.c          6
-rw-r--r--   kernel/sched_rt.c               58
-rw-r--r--   kernel/time/clockevents.c       12
-rw-r--r--   kernel/time/tick-broadcast.c    23
-rw-r--r--   kernel/time/tick-common.c       14
-rw-r--r--   kernel/time/tick-internal.h      9
-rw-r--r--   kernel/time/tick-oneshot.c      18
-rw-r--r--   kernel/time/tick-sched.c        13
-rw-r--r--   kernel/trace/trace_sysprof.c     2
-rw-r--r--   kernel/user.c                    4
21 files changed, 735 insertions, 532 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 13932abde159..a0123d75ec9a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2738,14 +2738,15 @@ void cgroup_fork_callbacks(struct task_struct *child)
  */
 void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
 {
-	struct cgroup *oldcgrp, *newcgrp;
+	struct cgroup *oldcgrp, *newcgrp = NULL;
 
 	if (need_mm_owner_callback) {
 		int i;
 		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 			oldcgrp = task_cgroup(old, ss->subsys_id);
-			newcgrp = task_cgroup(new, ss->subsys_id);
+			if (new)
+				newcgrp = task_cgroup(new, ss->subsys_id);
 			if (oldcgrp == newcgrp)
 				continue;
 			if (ss->mm_owner_changed)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index dc45f2459efb..86d49045daed 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param)
 	struct take_cpu_down_param *param = _param;
 	int err;
 
-	raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
-				param->hcpu);
 	/* Ensure this CPU doesn't handle any more interrupts. */
 	err = __cpu_disable();
 	if (err < 0)
 		return err;
 
+	raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
+				param->hcpu);
+
 	/* Force idle task to run as soon as we yield: it should
 	   immediately notice cpu is offline and die quickly. */
 	sched_idle_next();
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5ab79cf516d..eab7bd6628e0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
14 * 2003-10-22 Updates by Stephen Hemminger. 14 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson. 15 * 2004 May-July Rework by Paul Jackson.
16 * 2006 Rework by Paul Menage to use generic cgroups 16 * 2006 Rework by Paul Menage to use generic cgroups
17 * 2008 Rework of the scheduler domains and CPU hotplug handling
18 * by Max Krasnyansky
17 * 19 *
18 * This file is subject to the terms and conditions of the GNU General Public 20 * This file is subject to the terms and conditions of the GNU General Public
19 * License. See the file COPYING in the main directory of the Linux 21 * License. See the file COPYING in the main directory of the Linux
@@ -236,9 +238,11 @@ static struct cpuset top_cpuset = {
236 238
237static DEFINE_MUTEX(callback_mutex); 239static DEFINE_MUTEX(callback_mutex);
238 240
239/* This is ugly, but preserves the userspace API for existing cpuset 241/*
242 * This is ugly, but preserves the userspace API for existing cpuset
240 * users. If someone tries to mount the "cpuset" filesystem, we 243 * users. If someone tries to mount the "cpuset" filesystem, we
241 * silently switch it to mount "cgroup" instead */ 244 * silently switch it to mount "cgroup" instead
245 */
242static int cpuset_get_sb(struct file_system_type *fs_type, 246static int cpuset_get_sb(struct file_system_type *fs_type,
243 int flags, const char *unused_dev_name, 247 int flags, const char *unused_dev_name,
244 void *data, struct vfsmount *mnt) 248 void *data, struct vfsmount *mnt)
@@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
473} 477}
474 478
475/* 479/*
476 * Helper routine for rebuild_sched_domains(). 480 * Helper routine for generate_sched_domains().
477 * Do cpusets a, b have overlapping cpus_allowed masks? 481 * Do cpusets a, b have overlapping cpus_allowed masks?
478 */ 482 */
479
480static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 483static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
481{ 484{
482 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 485 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
518} 521}
519 522
520/* 523/*
521 * rebuild_sched_domains() 524 * generate_sched_domains()
522 * 525 *
523 * This routine will be called to rebuild the scheduler's dynamic 526 * This function builds a partial partition of the systems CPUs
524 * sched domains: 527 * A 'partial partition' is a set of non-overlapping subsets whose
525 * - if the flag 'sched_load_balance' of any cpuset with non-empty 528 * union is a subset of that set.
526 * 'cpus' changes, 529 * The output of this function needs to be passed to kernel/sched.c
527 * - or if the 'cpus' allowed changes in any cpuset which has that 530 * partition_sched_domains() routine, which will rebuild the scheduler's
528 * flag enabled, 531 * load balancing domains (sched domains) as specified by that partial
529 * - or if the 'sched_relax_domain_level' of any cpuset which has 532 * partition.
530 * that flag enabled and with non-empty 'cpus' changes,
531 * - or if any cpuset with non-empty 'cpus' is removed,
532 * - or if a cpu gets offlined.
533 *
534 * This routine builds a partial partition of the systems CPUs
535 * (the set of non-overlappping cpumask_t's in the array 'part'
536 * below), and passes that partial partition to the kernel/sched.c
537 * partition_sched_domains() routine, which will rebuild the
538 * schedulers load balancing domains (sched domains) as specified
539 * by that partial partition. A 'partial partition' is a set of
540 * non-overlapping subsets whose union is a subset of that set.
541 * 533 *
542 * See "What is sched_load_balance" in Documentation/cpusets.txt 534 * See "What is sched_load_balance" in Documentation/cpusets.txt
543 * for a background explanation of this. 535 * for a background explanation of this.
@@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
547 * domains when operating in the severe memory shortage situations 539 * domains when operating in the severe memory shortage situations
548 * that could cause allocation failures below. 540 * that could cause allocation failures below.
549 * 541 *
550 * Call with cgroup_mutex held. May take callback_mutex during 542 * Must be called with cgroup_lock held.
551 * call due to the kfifo_alloc() and kmalloc() calls. May nest
552 * a call to the get_online_cpus()/put_online_cpus() pair.
553 * Must not be called holding callback_mutex, because we must not
554 * call get_online_cpus() while holding callback_mutex. Elsewhere
555 * the kernel nests callback_mutex inside get_online_cpus() calls.
556 * So the reverse nesting would risk an ABBA deadlock.
557 * 543 *
558 * The three key local variables below are: 544 * The three key local variables below are:
559 * q - a linked-list queue of cpuset pointers, used to implement a 545 * q - a linked-list queue of cpuset pointers, used to implement a
@@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
588 * element of the partition (one sched domain) to be passed to 574 * element of the partition (one sched domain) to be passed to
589 * partition_sched_domains(). 575 * partition_sched_domains().
590 */ 576 */
591 577static int generate_sched_domains(cpumask_t **domains,
592void rebuild_sched_domains(void) 578 struct sched_domain_attr **attributes)
593{ 579{
594 LIST_HEAD(q); /* queue of cpusets to be scanned*/ 580 LIST_HEAD(q); /* queue of cpusets to be scanned */
595 struct cpuset *cp; /* scans q */ 581 struct cpuset *cp; /* scans q */
596 struct cpuset **csa; /* array of all cpuset ptrs */ 582 struct cpuset **csa; /* array of all cpuset ptrs */
597 int csn; /* how many cpuset ptrs in csa so far */ 583 int csn; /* how many cpuset ptrs in csa so far */
@@ -601,23 +587,26 @@ void rebuild_sched_domains(void)
601 int ndoms; /* number of sched domains in result */ 587 int ndoms; /* number of sched domains in result */
602 int nslot; /* next empty doms[] cpumask_t slot */ 588 int nslot; /* next empty doms[] cpumask_t slot */
603 589
604 csa = NULL; 590 ndoms = 0;
605 doms = NULL; 591 doms = NULL;
606 dattr = NULL; 592 dattr = NULL;
593 csa = NULL;
607 594
608 /* Special case for the 99% of systems with one, full, sched domain */ 595 /* Special case for the 99% of systems with one, full, sched domain */
609 if (is_sched_load_balance(&top_cpuset)) { 596 if (is_sched_load_balance(&top_cpuset)) {
610 ndoms = 1;
611 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 597 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
612 if (!doms) 598 if (!doms)
613 goto rebuild; 599 goto done;
600
614 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); 601 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
615 if (dattr) { 602 if (dattr) {
616 *dattr = SD_ATTR_INIT; 603 *dattr = SD_ATTR_INIT;
617 update_domain_attr_tree(dattr, &top_cpuset); 604 update_domain_attr_tree(dattr, &top_cpuset);
618 } 605 }
619 *doms = top_cpuset.cpus_allowed; 606 *doms = top_cpuset.cpus_allowed;
620 goto rebuild; 607
608 ndoms = 1;
609 goto done;
621 } 610 }
622 611
623 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); 612 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
@@ -680,61 +669,141 @@ restart:
680 } 669 }
681 } 670 }
682 671
683 /* Convert <csn, csa> to <ndoms, doms> */ 672 /*
673 * Now we know how many domains to create.
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */
684 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
685 if (!doms) 677 if (!doms) {
686 goto rebuild; 678 ndoms = 0;
679 goto done;
680 }
681
682 /*
683 * The rest of the code, including the scheduler, can deal with
684 * dattr==NULL case. No need to abort if alloc fails.
685 */
687 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); 686 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
688 687
689 for (nslot = 0, i = 0; i < csn; i++) { 688 for (nslot = 0, i = 0; i < csn; i++) {
690 struct cpuset *a = csa[i]; 689 struct cpuset *a = csa[i];
690 cpumask_t *dp;
691 int apn = a->pn; 691 int apn = a->pn;
692 692
693 if (apn >= 0) { 693 if (apn < 0) {
694 cpumask_t *dp = doms + nslot; 694 /* Skip completed partitions */
695 695 continue;
696 if (nslot == ndoms) { 696 }
697 static int warnings = 10; 697
698 if (warnings) { 698 dp = doms + nslot;
699 printk(KERN_WARNING 699
700 "rebuild_sched_domains confused:" 700 if (nslot == ndoms) {
701 " nslot %d, ndoms %d, csn %d, i %d," 701 static int warnings = 10;
702 " apn %d\n", 702 if (warnings) {
703 nslot, ndoms, csn, i, apn); 703 printk(KERN_WARNING
704 warnings--; 704 "rebuild_sched_domains confused:"
705 } 705 " nslot %d, ndoms %d, csn %d, i %d,"
706 continue; 706 " apn %d\n",
707 nslot, ndoms, csn, i, apn);
708 warnings--;
707 } 709 }
710 continue;
711 }
708 712
709 cpus_clear(*dp); 713 cpus_clear(*dp);
710 if (dattr) 714 if (dattr)
711 *(dattr + nslot) = SD_ATTR_INIT; 715 *(dattr + nslot) = SD_ATTR_INIT;
712 for (j = i; j < csn; j++) { 716 for (j = i; j < csn; j++) {
713 struct cpuset *b = csa[j]; 717 struct cpuset *b = csa[j];
714 718
715 if (apn == b->pn) { 719 if (apn == b->pn) {
716 cpus_or(*dp, *dp, b->cpus_allowed); 720 cpus_or(*dp, *dp, b->cpus_allowed);
717 b->pn = -1; 721 if (dattr)
718 if (dattr) 722 update_domain_attr_tree(dattr + nslot, b);
719 update_domain_attr_tree(dattr 723
720 + nslot, b); 724 /* Done with this partition */
721 } 725 b->pn = -1;
722 } 726 }
723 nslot++;
724 } 727 }
728 nslot++;
725 } 729 }
726 BUG_ON(nslot != ndoms); 730 BUG_ON(nslot != ndoms);
727 731
728rebuild: 732done:
729 /* Have scheduler rebuild sched domains */ 733 kfree(csa);
734
735 *domains = doms;
736 *attributes = dattr;
737 return ndoms;
738}
739
740/*
741 * Rebuild scheduler domains.
742 *
743 * Call with neither cgroup_mutex held nor within get_online_cpus().
744 * Takes both cgroup_mutex and get_online_cpus().
745 *
746 * Cannot be directly called from cpuset code handling changes
747 * to the cpuset pseudo-filesystem, because it cannot be called
748 * from code that already holds cgroup_mutex.
749 */
750static void do_rebuild_sched_domains(struct work_struct *unused)
751{
752 struct sched_domain_attr *attr;
753 cpumask_t *doms;
754 int ndoms;
755
730 get_online_cpus(); 756 get_online_cpus();
731 partition_sched_domains(ndoms, doms, dattr); 757
758 /* Generate domain masks and attrs */
759 cgroup_lock();
760 ndoms = generate_sched_domains(&doms, &attr);
761 cgroup_unlock();
762
763 /* Have scheduler rebuild the domains */
764 partition_sched_domains(ndoms, doms, attr);
765
732 put_online_cpus(); 766 put_online_cpus();
767}
733 768
734done: 769static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
735 kfree(csa); 770
736 /* Don't kfree(doms) -- partition_sched_domains() does that. */ 771/*
737 /* Don't kfree(dattr) -- partition_sched_domains() does that. */ 772 * Rebuild scheduler domains, asynchronously via workqueue.
773 *
774 * If the flag 'sched_load_balance' of any cpuset with non-empty
775 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
776 * which has that flag enabled, or if any cpuset with a non-empty
777 * 'cpus' is removed, then call this routine to rebuild the
778 * scheduler's dynamic sched domains.
779 *
780 * The rebuild_sched_domains() and partition_sched_domains()
781 * routines must nest cgroup_lock() inside get_online_cpus(),
782 * but such cpuset changes as these must nest that locking the
783 * other way, holding cgroup_lock() for much of the code.
784 *
785 * So in order to avoid an ABBA deadlock, the cpuset code handling
786 * these user changes delegates the actual sched domain rebuilding
787 * to a separate workqueue thread, which ends up processing the
788 * above do_rebuild_sched_domains() function.
789 */
790static void async_rebuild_sched_domains(void)
791{
792 schedule_work(&rebuild_sched_domains_work);
793}
794
795/*
796 * Accomplishes the same scheduler domain rebuild as the above
797 * async_rebuild_sched_domains(), however it directly calls the
798 * rebuild routine synchronously rather than calling it via an
799 * asynchronous work thread.
800 *
801 * This can only be called from code that is not holding
802 * cgroup_mutex (not nested in a cgroup_lock() call.)
803 */
804void rebuild_sched_domains(void)
805{
806 do_rebuild_sched_domains(NULL);
738} 807}
739 808
740/** 809/**
@@ -774,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
774/** 843/**
775 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 844 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
776 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 845 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
846 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
777 * 847 *
778 * Called with cgroup_mutex held 848 * Called with cgroup_mutex held
779 * 849 *
780 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 850 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
781 * calling callback functions for each. 851 * calling callback functions for each.
782 * 852 *
783 * Return 0 if successful, -errno if not. 853 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
854 * if @heap != NULL.
784 */ 855 */
785static int update_tasks_cpumask(struct cpuset *cs) 856static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
786{ 857{
787 struct cgroup_scanner scan; 858 struct cgroup_scanner scan;
788 struct ptr_heap heap;
789 int retval;
790
791 /*
792 * cgroup_scan_tasks() will initialize heap->gt for us.
793 * heap_init() is still needed here for we should not change
794 * cs->cpus_allowed when heap_init() fails.
795 */
796 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
797 if (retval)
798 return retval;
799 859
800 scan.cg = cs->css.cgroup; 860 scan.cg = cs->css.cgroup;
801 scan.test_task = cpuset_test_cpumask; 861 scan.test_task = cpuset_test_cpumask;
802 scan.process_task = cpuset_change_cpumask; 862 scan.process_task = cpuset_change_cpumask;
803 scan.heap = &heap; 863 scan.heap = heap;
804 retval = cgroup_scan_tasks(&scan); 864 cgroup_scan_tasks(&scan);
805
806 heap_free(&heap);
807 return retval;
808} 865}
809 866
810/** 867/**
@@ -814,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs)
814 */ 871 */
815static int update_cpumask(struct cpuset *cs, const char *buf) 872static int update_cpumask(struct cpuset *cs, const char *buf)
816{ 873{
874 struct ptr_heap heap;
817 struct cpuset trialcs; 875 struct cpuset trialcs;
818 int retval; 876 int retval;
819 int is_load_balanced; 877 int is_load_balanced;
@@ -848,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
848 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 906 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
849 return 0; 907 return 0;
850 908
909 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
910 if (retval)
911 return retval;
912
851 is_load_balanced = is_sched_load_balance(&trialcs); 913 is_load_balanced = is_sched_load_balance(&trialcs);
852 914
853 mutex_lock(&callback_mutex); 915 mutex_lock(&callback_mutex);
@@ -858,12 +920,12 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
858 * Scan tasks in the cpuset, and update the cpumasks of any 920 * Scan tasks in the cpuset, and update the cpumasks of any
859 * that need an update. 921 * that need an update.
860 */ 922 */
861 retval = update_tasks_cpumask(cs); 923 update_tasks_cpumask(cs, &heap);
862 if (retval < 0) 924
863 return retval; 925 heap_free(&heap);
864 926
865 if (is_load_balanced) 927 if (is_load_balanced)
866 rebuild_sched_domains(); 928 async_rebuild_sched_domains();
867 return 0; 929 return 0;
868} 930}
869 931
@@ -1090,7 +1152,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1090 if (val != cs->relax_domain_level) { 1152 if (val != cs->relax_domain_level) {
1091 cs->relax_domain_level = val; 1153 cs->relax_domain_level = val;
1092 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) 1154 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
1093 rebuild_sched_domains(); 1155 async_rebuild_sched_domains();
1094 } 1156 }
1095 1157
1096 return 0; 1158 return 0;
@@ -1131,7 +1193,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1131 mutex_unlock(&callback_mutex); 1193 mutex_unlock(&callback_mutex);
1132 1194
1133 if (cpus_nonempty && balance_flag_changed) 1195 if (cpus_nonempty && balance_flag_changed)
1134 rebuild_sched_domains(); 1196 async_rebuild_sched_domains();
1135 1197
1136 return 0; 1198 return 0;
1137} 1199}
@@ -1492,6 +1554,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1492 default: 1554 default:
1493 BUG(); 1555 BUG();
1494 } 1556 }
1557
1558 /* Unreachable but makes gcc happy */
1559 return 0;
1495} 1560}
1496 1561
1497static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) 1562static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1504,6 +1569,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1504 default: 1569 default:
1505 BUG(); 1570 BUG();
1506 } 1571 }
1572
 1573 /* Unreachable but makes gcc happy */
1574 return 0;
1507} 1575}
1508 1576
1509 1577
@@ -1692,15 +1760,9 @@ static struct cgroup_subsys_state *cpuset_create(
1692} 1760}
1693 1761
1694/* 1762/*
1695 * Locking note on the strange update_flag() call below:
1696 *
1697 * If the cpuset being removed has its flag 'sched_load_balance' 1763 * If the cpuset being removed has its flag 'sched_load_balance'
1698 * enabled, then simulate turning sched_load_balance off, which 1764 * enabled, then simulate turning sched_load_balance off, which
1699 * will call rebuild_sched_domains(). The get_online_cpus() 1765 * will call async_rebuild_sched_domains().
1700 * call in rebuild_sched_domains() must not be made while holding
1701 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1702 * get_online_cpus() calls. So the reverse nesting would risk an
1703 * ABBA deadlock.
1704 */ 1766 */
1705 1767
1706static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 1768static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1719,7 +1781,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1719struct cgroup_subsys cpuset_subsys = { 1781struct cgroup_subsys cpuset_subsys = {
1720 .name = "cpuset", 1782 .name = "cpuset",
1721 .create = cpuset_create, 1783 .create = cpuset_create,
1722 .destroy = cpuset_destroy, 1784 .destroy = cpuset_destroy,
1723 .can_attach = cpuset_can_attach, 1785 .can_attach = cpuset_can_attach,
1724 .attach = cpuset_attach, 1786 .attach = cpuset_attach,
1725 .populate = cpuset_populate, 1787 .populate = cpuset_populate,
@@ -1811,7 +1873,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1811} 1873}
1812 1874
1813/* 1875/*
1814 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 1876 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
1815 * or memory nodes, we need to walk over the cpuset hierarchy, 1877 * or memory nodes, we need to walk over the cpuset hierarchy,
1816 * removing that CPU or node from all cpusets. If this removes the 1878 * removing that CPU or node from all cpusets. If this removes the
1817 * last CPU or node from a cpuset, then move the tasks in the empty 1879 * last CPU or node from a cpuset, then move the tasks in the empty
@@ -1859,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1859 * that has tasks along with an empty 'mems'. But if we did see such 1921 * that has tasks along with an empty 'mems'. But if we did see such
1860 * a cpuset, we'd handle it just like we do if its 'cpus' was empty. 1922 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
1861 */ 1923 */
1862static void scan_for_empty_cpusets(const struct cpuset *root) 1924static void scan_for_empty_cpusets(struct cpuset *root)
1863{ 1925{
1864 LIST_HEAD(queue); 1926 LIST_HEAD(queue);
1865 struct cpuset *cp; /* scans cpusets being updated */ 1927 struct cpuset *cp; /* scans cpusets being updated */
@@ -1896,42 +1958,13 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
1896 nodes_empty(cp->mems_allowed)) 1958 nodes_empty(cp->mems_allowed))
1897 remove_tasks_in_empty_cpuset(cp); 1959 remove_tasks_in_empty_cpuset(cp);
1898 else { 1960 else {
1899 update_tasks_cpumask(cp); 1961 update_tasks_cpumask(cp, NULL);
1900 update_tasks_nodemask(cp, &oldmems); 1962 update_tasks_nodemask(cp, &oldmems);
1901 } 1963 }
1902 } 1964 }
1903} 1965}
1904 1966
1905/* 1967/*
1906 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1907 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
1908 * track what's online after any CPU or memory node hotplug or unplug event.
1909 *
1910 * Since there are two callers of this routine, one for CPU hotplug
1911 * events and one for memory node hotplug events, we could have coded
1912 * two separate routines here. We code it as a single common routine
1913 * in order to minimize text size.
1914 */
1915
1916static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1917{
1918 cgroup_lock();
1919
1920 top_cpuset.cpus_allowed = cpu_online_map;
1921 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1922 scan_for_empty_cpusets(&top_cpuset);
1923
1924 /*
1925 * Scheduler destroys domains on hotplug events.
1926 * Rebuild them based on the current settings.
1927 */
1928 if (rebuild_sd)
1929 rebuild_sched_domains();
1930
1931 cgroup_unlock();
1932}
1933
1934/*
1935 * The top_cpuset tracks what CPUs and Memory Nodes are online, 1968 * The top_cpuset tracks what CPUs and Memory Nodes are online,
1936 * period. This is necessary in order to make cpusets transparent 1969 * period. This is necessary in order to make cpusets transparent
1937 * (of no affect) on systems that are actively using CPU hotplug 1970 * (of no affect) on systems that are actively using CPU hotplug
@@ -1939,40 +1972,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1939 * 1972 *
1940 * This routine ensures that top_cpuset.cpus_allowed tracks 1973 * This routine ensures that top_cpuset.cpus_allowed tracks
1941 * cpu_online_map on each CPU hotplug (cpuhp) event. 1974 * cpu_online_map on each CPU hotplug (cpuhp) event.
1975 *
1976 * Called within get_online_cpus(). Needs to call cgroup_lock()
1977 * before calling generate_sched_domains().
1942 */ 1978 */
1943 1979static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1944static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
1945 unsigned long phase, void *unused_cpu) 1980 unsigned long phase, void *unused_cpu)
1946{ 1981{
1982 struct sched_domain_attr *attr;
1983 cpumask_t *doms;
1984 int ndoms;
1985
1947 switch (phase) { 1986 switch (phase) {
1948 case CPU_UP_CANCELED:
1949 case CPU_UP_CANCELED_FROZEN:
1950 case CPU_DOWN_FAILED:
1951 case CPU_DOWN_FAILED_FROZEN:
1952 case CPU_ONLINE: 1987 case CPU_ONLINE:
1953 case CPU_ONLINE_FROZEN: 1988 case CPU_ONLINE_FROZEN:
1954 case CPU_DEAD: 1989 case CPU_DEAD:
1955 case CPU_DEAD_FROZEN: 1990 case CPU_DEAD_FROZEN:
1956 common_cpu_mem_hotplug_unplug(1);
1957 break; 1991 break;
1992
1958 default: 1993 default:
1959 return NOTIFY_DONE; 1994 return NOTIFY_DONE;
1960 } 1995 }
1961 1996
1997 cgroup_lock();
1998 top_cpuset.cpus_allowed = cpu_online_map;
1999 scan_for_empty_cpusets(&top_cpuset);
2000 ndoms = generate_sched_domains(&doms, &attr);
2001 cgroup_unlock();
2002
2003 /* Have scheduler rebuild the domains */
2004 partition_sched_domains(ndoms, doms, attr);
2005
1962 return NOTIFY_OK; 2006 return NOTIFY_OK;
1963} 2007}
1964 2008
1965#ifdef CONFIG_MEMORY_HOTPLUG 2009#ifdef CONFIG_MEMORY_HOTPLUG
1966/* 2010/*
1967 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2011 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
1968 * Call this routine anytime after you change 2012 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
1969 * node_states[N_HIGH_MEMORY]. 2013 * See also the previous routine cpuset_track_online_cpus().
1970 * See also the previous routine cpuset_handle_cpuhp().
1971 */ 2014 */
1972
1973void cpuset_track_online_nodes(void) 2015void cpuset_track_online_nodes(void)
1974{ 2016{
1975 common_cpu_mem_hotplug_unplug(0); 2017 cgroup_lock();
2018 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2019 scan_for_empty_cpusets(&top_cpuset);
2020 cgroup_unlock();
1976} 2021}
1977#endif 2022#endif
1978 2023
@@ -1987,7 +2032,7 @@ void __init cpuset_init_smp(void)
1987 top_cpuset.cpus_allowed = cpu_online_map; 2032 top_cpuset.cpus_allowed = cpu_online_map;
1988 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2033 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1989 2034
1990 hotcpu_notifier(cpuset_handle_cpuhp, 0); 2035 hotcpu_notifier(cpuset_track_online_cpus, 0);
1991} 2036}
1992 2037
1993/** 2038/**
diff --git a/kernel/exit.c b/kernel/exit.c
index 16395644a98f..85a83c831856 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -583,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
 	 * If there are other users of the mm and the owner (us) is exiting
 	 * we need to find a new owner to take on the responsibility.
 	 */
-	if (!mm)
-		return 0;
 	if (atomic_read(&mm->mm_users) <= 1)
 		return 0;
 	if (mm->owner != p)
@@ -627,6 +625,16 @@ retry:
 	} while_each_thread(g, c);
 
 	read_unlock(&tasklist_lock);
+	/*
+	 * We found no owner yet mm_users > 1: this implies that we are
+	 * most likely racing with swapoff (try_to_unuse()) or /proc or
+	 * ptrace or page migration (get_task_mm()). Mark owner as NULL,
+	 * so that subsystems can understand the callback and take action.
+	 */
+	down_write(&mm->mmap_sem);
+	cgroup_mm_owner_callbacks(mm->owner, NULL);
+	mm->owner = NULL;
+	up_write(&mm->mmap_sem);
 	return;
 
 assign_new_owner:
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b8e4dce80a74..cdec83e722fa 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
672 */ 672 */
673 BUG_ON(timer->function(timer) != HRTIMER_NORESTART); 673 BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
674 return 1; 674 return 1;
675 case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: 675 case HRTIMER_CB_IRQSAFE_PERCPU:
676 case HRTIMER_CB_IRQSAFE_UNLOCKED:
676 /* 677 /*
677 * This is solely for the sched tick emulation with 678 * This is solely for the sched tick emulation with
678 * dynamic tick support to ensure that we do not 679 * dynamic tick support to ensure that we do not
679 * restart the tick right on the edge and end up with 680 * restart the tick right on the edge and end up with
680 * the tick timer in the softirq ! The calling site 681 * the tick timer in the softirq ! The calling site
681 * takes care of this. 682 * takes care of this. Also used for hrtimer sleeper !
682 */ 683 */
683 debug_hrtimer_deactivate(timer); 684 debug_hrtimer_deactivate(timer);
684 return 1; 685 return 1;
@@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer)
1245 timer_stats_account_hrtimer(timer); 1246 timer_stats_account_hrtimer(timer);
1246 1247
1247 fn = timer->function; 1248 fn = timer->function;
1248 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { 1249 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
1250 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
1249 /* 1251 /*
1250 * Used for scheduler timers, avoid lock inversion with 1252 * Used for scheduler timers, avoid lock inversion with
1251 * rq->lock and tasklist_lock. 1253 * rq->lock and tasklist_lock.
@@ -1452,7 +1454,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1452 sl->timer.function = hrtimer_wakeup; 1454 sl->timer.function = hrtimer_wakeup;
1453 sl->task = task; 1455 sl->task = task;
1454#ifdef CONFIG_HIGH_RES_TIMERS 1456#ifdef CONFIG_HIGH_RES_TIMERS
1455 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1457 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
1456#endif 1458#endif
1457} 1459}
1458 1460
@@ -1591,29 +1593,95 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1591 1593
1592#ifdef CONFIG_HOTPLUG_CPU 1594#ifdef CONFIG_HOTPLUG_CPU
1593 1595
1594static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, 1596static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1595 struct hrtimer_clock_base *new_base) 1597 struct hrtimer_clock_base *new_base, int dcpu)
1596{ 1598{
1597 struct hrtimer *timer; 1599 struct hrtimer *timer;
1598 struct rb_node *node; 1600 struct rb_node *node;
1601 int raise = 0;
1599 1602
1600 while ((node = rb_first(&old_base->active))) { 1603 while ((node = rb_first(&old_base->active))) {
1601 timer = rb_entry(node, struct hrtimer, node); 1604 timer = rb_entry(node, struct hrtimer, node);
1602 BUG_ON(hrtimer_callback_running(timer)); 1605 BUG_ON(hrtimer_callback_running(timer));
1603 debug_hrtimer_deactivate(timer); 1606 debug_hrtimer_deactivate(timer);
1604 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); 1607
1608 /*
1609 * Should not happen. Per CPU timers should be
1610 * canceled _before_ the migration code is called
1611 */
1612 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
1613 __remove_hrtimer(timer, old_base,
1614 HRTIMER_STATE_INACTIVE, 0);
1615 WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
1616 timer, timer->function, dcpu);
1617 continue;
1618 }
1619
1620 /*
1621 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
1622 * timer could be seen as !active and just vanish away
1623 * under us on another CPU
1624 */
1625 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1605 timer->base = new_base; 1626 timer->base = new_base;
1606 /* 1627 /*
1607 * Enqueue the timer. Allow reprogramming of the event device 1628 * Enqueue the timer. Allow reprogramming of the event device
1608 */ 1629 */
1609 enqueue_hrtimer(timer, new_base, 1); 1630 enqueue_hrtimer(timer, new_base, 1);
1631
1632#ifdef CONFIG_HIGH_RES_TIMERS
1633 /*
1634 * Happens with high res enabled when the timer was
1635 * already expired and the callback mode is
1636 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
1637 * enqueue code does not move them to the soft irq
1638 * pending list for performance/latency reasons, but
1639 * in the migration state, we need to do that
1640 * otherwise we end up with a stale timer.
1641 */
1642 if (timer->state == HRTIMER_STATE_MIGRATE) {
1643 timer->state = HRTIMER_STATE_PENDING;
1644 list_add_tail(&timer->cb_entry,
1645 &new_base->cpu_base->cb_pending);
1646 raise = 1;
1647 }
1648#endif
1649 /* Clear the migration state bit */
1650 timer->state &= ~HRTIMER_STATE_MIGRATE;
1651 }
1652 return raise;
1653}
1654
1655#ifdef CONFIG_HIGH_RES_TIMERS
1656static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1657 struct hrtimer_cpu_base *new_base)
1658{
1659 struct hrtimer *timer;
1660 int raise = 0;
1661
1662 while (!list_empty(&old_base->cb_pending)) {
1663 timer = list_entry(old_base->cb_pending.next,
1664 struct hrtimer, cb_entry);
1665
1666 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
1667 timer->base = &new_base->clock_base[timer->base->index];
1668 list_add_tail(&timer->cb_entry, &new_base->cb_pending);
1669 raise = 1;
1610 } 1670 }
1671 return raise;
1672}
1673#else
1674static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1675 struct hrtimer_cpu_base *new_base)
1676{
1677 return 0;
1611} 1678}
1679#endif
1612 1680
1613static void migrate_hrtimers(int cpu) 1681static void migrate_hrtimers(int cpu)
1614{ 1682{
1615 struct hrtimer_cpu_base *old_base, *new_base; 1683 struct hrtimer_cpu_base *old_base, *new_base;
1616 int i; 1684 int i, raise = 0;
1617 1685
1618 BUG_ON(cpu_online(cpu)); 1686 BUG_ON(cpu_online(cpu));
1619 old_base = &per_cpu(hrtimer_bases, cpu); 1687 old_base = &per_cpu(hrtimer_bases, cpu);
@@ -1626,14 +1694,21 @@ static void migrate_hrtimers(int cpu)
1626 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1694 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1627 1695
1628 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1696 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1629 migrate_hrtimer_list(&old_base->clock_base[i], 1697 if (migrate_hrtimer_list(&old_base->clock_base[i],
1630 &new_base->clock_base[i]); 1698 &new_base->clock_base[i], cpu))
1699 raise = 1;
1631 } 1700 }
1632 1701
1702 if (migrate_hrtimer_pending(old_base, new_base))
1703 raise = 1;
1704
1633 spin_unlock(&old_base->lock); 1705 spin_unlock(&old_base->lock);
1634 spin_unlock(&new_base->lock); 1706 spin_unlock(&new_base->lock);
1635 local_irq_enable(); 1707 local_irq_enable();
1636 put_cpu_var(hrtimer_bases); 1708 put_cpu_var(hrtimer_bases);
1709
1710 if (raise)
1711 hrtimer_raise_softirq();
1637} 1712}
1638#endif /* CONFIG_HOTPLUG_CPU */ 1713#endif /* CONFIG_HOTPLUG_CPU */
1639 1714
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 59f3f0df35d4..aef265325cd3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -753,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image,
 			*old = addr | (*old & ~PAGE_MASK);
 
 			/* The old page I have found cannot be a
-			 * destination page, so return it.
+			 * destination page, so return it if it's
+			 * gfp_flags honor the ones passed in.
 			 */
+			if (!(gfp_mask & __GFP_HIGHMEM) &&
+					PageHighMem(old_page)) {
+				kimage_free_pages(old_page);
+				continue;
+			}
 			addr = old_addr;
 			page = old_page;
 			break;
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index eaa21fc9ad1d..e4dcfb2272a4 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -488,7 +488,7 @@ static int write_mem_msg(int binary)
 	if (err)
 		return err;
 	if (CACHE_FLUSH_IS_SAFE)
-		flush_icache_range(addr, addr + length + 1);
+		flush_icache_range(addr, addr + length);
 	return 0;
 }
 
@@ -590,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs)
 
 	/* Signal the primary CPU that we are done: */
 	atomic_set(&cpu_in_kgdb[cpu], 0);
+	touch_softlockup_watchdog();
 	clocksource_touch_watchdog();
 	local_irq_restore(flags);
 }
@@ -1432,6 +1433,7 @@ acquirelock:
 	    atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
 
 		atomic_set(&kgdb_active, -1);
+		touch_softlockup_watchdog();
 		clocksource_touch_watchdog();
 		local_irq_restore(flags);
 
@@ -1462,7 +1464,7 @@ acquirelock:
 	 * Get the passive CPU lock which will hold all the non-primary
 	 * CPU in a spin state while the debugger is active
 	 */
-	if (!kgdb_single_step || !kgdb_contthread) {
+	if (!kgdb_single_step) {
 		for (i = 0; i < NR_CPUS; i++)
 			atomic_set(&passive_cpu_wait[i], 1);
 	}
@@ -1475,7 +1477,7 @@ acquirelock:
 
 #ifdef CONFIG_SMP
 	/* Signal the other CPUs to enter kgdb_wait() */
-	if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup)
+	if ((!kgdb_single_step) && kgdb_do_roundup)
 		kgdb_roundup_cpus(flags);
 #endif
 
@@ -1494,7 +1496,7 @@ acquirelock:
 	kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
 	kgdb_deactivate_sw_breakpoints();
 	kgdb_single_step = 0;
-	kgdb_contthread = NULL;
+	kgdb_contthread = current;
 	exception_level = 0;
 
 	/* Talk to debugger with gdbserial protocol */
@@ -1508,7 +1510,7 @@ acquirelock:
 	kgdb_info[ks->cpu].task = NULL;
 	atomic_set(&cpu_in_kgdb[ks->cpu], 0);
 
-	if (!kgdb_single_step || !kgdb_contthread) {
+	if (!kgdb_single_step) {
 		for (i = NR_CPUS-1; i >= 0; i--)
 			atomic_set(&passive_cpu_wait[i], 0);
 		/*
@@ -1524,6 +1526,7 @@ acquirelock:
 kgdb_restore:
 	/* Free kgdb_active */
 	atomic_set(&kgdb_active, -1);
+	touch_softlockup_watchdog();
 	clocksource_touch_watchdog();
 	local_irq_restore(flags);
 
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e36d5798cbff..5131e5471169 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -441,7 +441,7 @@ static struct k_itimer * alloc_posix_timer(void)
 		return tmr;
 	if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
 		kmem_cache_free(posix_timers_cache, tmr);
-		tmr = NULL;
+		return NULL;
 	}
 	memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
 	return tmr;
diff --git a/kernel/sched.c b/kernel/sched.c
index 1a5f73c1fcdc..9715f4ce6cfe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -201,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
201 hrtimer_init(&rt_b->rt_period_timer, 201 hrtimer_init(&rt_b->rt_period_timer,
202 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 202 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
203 rt_b->rt_period_timer.function = sched_rt_period_timer; 203 rt_b->rt_period_timer.function = sched_rt_period_timer;
204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
205}
206
207static inline int rt_bandwidth_enabled(void)
208{
209 return sysctl_sched_rt_runtime >= 0;
205} 210}
206 211
207static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 212static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
208{ 213{
209 ktime_t now; 214 ktime_t now;
210 215
211 if (rt_b->rt_runtime == RUNTIME_INF) 216 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
212 return; 217 return;
213 218
214 if (hrtimer_active(&rt_b->rt_period_timer)) 219 if (hrtimer_active(&rt_b->rt_period_timer))
@@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
298static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 303static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
299static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 304static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
300#endif /* CONFIG_RT_GROUP_SCHED */ 305#endif /* CONFIG_RT_GROUP_SCHED */
301#else /* !CONFIG_FAIR_GROUP_SCHED */ 306#else /* !CONFIG_USER_SCHED */
302#define root_task_group init_task_group 307#define root_task_group init_task_group
303#endif /* CONFIG_FAIR_GROUP_SCHED */ 308#endif /* CONFIG_USER_SCHED */
304 309
305/* task_group_lock serializes add/remove of task groups and also changes to 310/* task_group_lock serializes add/remove of task groups and also changes to
306 * a task group's cpu shares. 311 * a task group's cpu shares.
@@ -604,9 +609,9 @@ struct rq {
604 609
605static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 610static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
606 611
607static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 612static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
608{ 613{
609 rq->curr->sched_class->check_preempt_curr(rq, p); 614 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
610} 615}
611 616
612static inline int cpu_of(struct rq *rq) 617static inline int cpu_of(struct rq *rq)
@@ -1087,7 +1092,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1087 return NOTIFY_DONE; 1092 return NOTIFY_DONE;
1088} 1093}
1089 1094
1090static void init_hrtick(void) 1095static __init void init_hrtick(void)
1091{ 1096{
1092 hotcpu_notifier(hotplug_hrtick, 0); 1097 hotcpu_notifier(hotplug_hrtick, 0);
1093} 1098}
@@ -1102,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1102 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); 1107 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1103} 1108}
1104 1109
1105static void init_hrtick(void) 1110static inline void init_hrtick(void)
1106{ 1111{
1107} 1112}
1108#endif /* CONFIG_SMP */ 1113#endif /* CONFIG_SMP */
@@ -1119,9 +1124,9 @@ static void init_rq_hrtick(struct rq *rq)
1119 1124
1120 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1125 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1121 rq->hrtick_timer.function = hrtick; 1126 rq->hrtick_timer.function = hrtick;
1122 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1127 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1123} 1128}
1124#else 1129#else /* CONFIG_SCHED_HRTICK */
1125static inline void hrtick_clear(struct rq *rq) 1130static inline void hrtick_clear(struct rq *rq)
1126{ 1131{
1127} 1132}
@@ -1133,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq)
1133static inline void init_hrtick(void) 1138static inline void init_hrtick(void)
1134{ 1139{
1135} 1140}
1136#endif 1141#endif /* CONFIG_SCHED_HRTICK */
1137 1142
1138/* 1143/*
1139 * resched_task - mark a task 'to be rescheduled now'. 1144 * resched_task - mark a task 'to be rescheduled now'.
@@ -1380,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1380 update_load_sub(&rq->load, load); 1385 update_load_sub(&rq->load, load);
1381} 1386}
1382 1387
1383#ifdef CONFIG_SMP 1388#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1384static unsigned long source_load(int cpu, int type); 1389typedef int (*tg_visitor)(struct task_group *, void *);
1385static unsigned long target_load(int cpu, int type);
1386static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1387
1388static unsigned long cpu_avg_load_per_task(int cpu)
1389{
1390 struct rq *rq = cpu_rq(cpu);
1391
1392 if (rq->nr_running)
1393 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1394
1395 return rq->avg_load_per_task;
1396}
1397
1398#ifdef CONFIG_FAIR_GROUP_SCHED
1399
1400typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1401 1390
1402/* 1391/*
1403 * Iterate the full tree, calling @down when first entering a node and @up when 1392 * Iterate the full tree, calling @down when first entering a node and @up when
1404 * leaving it for the final time. 1393 * leaving it for the final time.
1405 */ 1394 */
1406static void 1395static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1407walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1408{ 1396{
1409 struct task_group *parent, *child; 1397 struct task_group *parent, *child;
1398 int ret;
1410 1399
1411 rcu_read_lock(); 1400 rcu_read_lock();
1412 parent = &root_task_group; 1401 parent = &root_task_group;
1413down: 1402down:
1414 (*down)(parent, cpu, sd); 1403 ret = (*down)(parent, data);
1404 if (ret)
1405 goto out_unlock;
1415 list_for_each_entry_rcu(child, &parent->children, siblings) { 1406 list_for_each_entry_rcu(child, &parent->children, siblings) {
1416 parent = child; 1407 parent = child;
1417 goto down; 1408 goto down;
@@ -1419,14 +1410,42 @@ down:
1419up: 1410up:
1420 continue; 1411 continue;
1421 } 1412 }
1422 (*up)(parent, cpu, sd); 1413 ret = (*up)(parent, data);
1414 if (ret)
1415 goto out_unlock;
1423 1416
1424 child = parent; 1417 child = parent;
1425 parent = parent->parent; 1418 parent = parent->parent;
1426 if (parent) 1419 if (parent)
1427 goto up; 1420 goto up;
1421out_unlock:
1428 rcu_read_unlock(); 1422 rcu_read_unlock();
1423
1424 return ret;
1425}
1426
1427static int tg_nop(struct task_group *tg, void *data)
1428{
1429 return 0;
1429} 1430}
1431#endif
1432
1433#ifdef CONFIG_SMP
1434static unsigned long source_load(int cpu, int type);
1435static unsigned long target_load(int cpu, int type);
1436static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1437
1438static unsigned long cpu_avg_load_per_task(int cpu)
1439{
1440 struct rq *rq = cpu_rq(cpu);
1441
1442 if (rq->nr_running)
1443 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1444
1445 return rq->avg_load_per_task;
1446}
1447
1448#ifdef CONFIG_FAIR_GROUP_SCHED
1430 1449
1431static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1450static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1432 1451
@@ -1486,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
1486 * This needs to be done in a bottom-up fashion because the rq weight of a 1505 * This needs to be done in a bottom-up fashion because the rq weight of a
1487 * parent group depends on the shares of its child groups. 1506 * parent group depends on the shares of its child groups.
1488 */ 1507 */
1489static void 1508static int tg_shares_up(struct task_group *tg, void *data)
1490tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1491{ 1509{
1492 unsigned long rq_weight = 0; 1510 unsigned long rq_weight = 0;
1493 unsigned long shares = 0; 1511 unsigned long shares = 0;
1512 struct sched_domain *sd = data;
1494 int i; 1513 int i;
1495 1514
1496 for_each_cpu_mask(i, sd->span) { 1515 for_each_cpu_mask(i, sd->span) {
@@ -1515,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1515 __update_group_shares_cpu(tg, i, shares, rq_weight); 1534 __update_group_shares_cpu(tg, i, shares, rq_weight);
1516 spin_unlock_irqrestore(&rq->lock, flags); 1535 spin_unlock_irqrestore(&rq->lock, flags);
1517 } 1536 }
1537
1538 return 0;
1518} 1539}
1519 1540
1520/* 1541/*
@@ -1522,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1522 * This needs to be done in a top-down fashion because the load of a child 1543 * This needs to be done in a top-down fashion because the load of a child
1523 * group is a fraction of its parents load. 1544 * group is a fraction of its parents load.
1524 */ 1545 */
1525static void 1546static int tg_load_down(struct task_group *tg, void *data)
1526tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1527{ 1547{
1528 unsigned long load; 1548 unsigned long load;
1549 long cpu = (long)data;
1529 1550
1530 if (!tg->parent) { 1551 if (!tg->parent) {
1531 load = cpu_rq(cpu)->load.weight; 1552 load = cpu_rq(cpu)->load.weight;
@@ -1536,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1536 } 1557 }
1537 1558
1538 tg->cfs_rq[cpu]->h_load = load; 1559 tg->cfs_rq[cpu]->h_load = load;
1539}
1540 1560
1541static void 1561 return 0;
1542tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1543{
1544} 1562}
1545 1563
1546static void update_shares(struct sched_domain *sd) 1564static void update_shares(struct sched_domain *sd)
@@ -1550,7 +1568,7 @@ static void update_shares(struct sched_domain *sd)
1550 1568
1551 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1569 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1552 sd->last_update = now; 1570 sd->last_update = now;
1553 walk_tg_tree(tg_nop, tg_shares_up, 0, sd); 1571 walk_tg_tree(tg_nop, tg_shares_up, sd);
1554 } 1572 }
1555} 1573}
1556 1574
@@ -1561,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1561 spin_lock(&rq->lock); 1579 spin_lock(&rq->lock);
1562} 1580}
1563 1581
1564static void update_h_load(int cpu) 1582static void update_h_load(long cpu)
1565{ 1583{
1566 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); 1584 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1567} 1585}
1568 1586
1569#else 1587#else
@@ -1921,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1921 running = task_running(rq, p); 1939 running = task_running(rq, p);
1922 on_rq = p->se.on_rq; 1940 on_rq = p->se.on_rq;
1923 ncsw = 0; 1941 ncsw = 0;
1924 if (!match_state || p->state == match_state) { 1942 if (!match_state || p->state == match_state)
1925 ncsw = p->nivcsw + p->nvcsw; 1943 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1926 if (unlikely(!ncsw))
1927 ncsw = 1;
1928 }
1929 task_rq_unlock(rq, &flags); 1944 task_rq_unlock(rq, &flags);
1930 1945
1931 /* 1946 /*
@@ -2285,7 +2300,7 @@ out_running:
2285 trace_mark(kernel_sched_wakeup, 2300 trace_mark(kernel_sched_wakeup,
2286 "pid %d state %ld ## rq %p task %p rq->curr %p", 2301 "pid %d state %ld ## rq %p task %p rq->curr %p",
2287 p->pid, p->state, rq, p, rq->curr); 2302 p->pid, p->state, rq, p, rq->curr);
2288 check_preempt_curr(rq, p); 2303 check_preempt_curr(rq, p, sync);
2289 2304
2290 p->state = TASK_RUNNING; 2305 p->state = TASK_RUNNING;
2291#ifdef CONFIG_SMP 2306#ifdef CONFIG_SMP
@@ -2420,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2420 trace_mark(kernel_sched_wakeup_new, 2435 trace_mark(kernel_sched_wakeup_new,
2421 "pid %d state %ld ## rq %p task %p rq->curr %p", 2436 "pid %d state %ld ## rq %p task %p rq->curr %p",
2422 p->pid, p->state, rq, p, rq->curr); 2437 p->pid, p->state, rq, p, rq->curr);
2423 check_preempt_curr(rq, p); 2438 check_preempt_curr(rq, p, 0);
2424#ifdef CONFIG_SMP 2439#ifdef CONFIG_SMP
2425 if (p->sched_class->task_wake_up) 2440 if (p->sched_class->task_wake_up)
2426 p->sched_class->task_wake_up(rq, p); 2441 p->sched_class->task_wake_up(rq, p);
@@ -2880,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2880 * Note that idle threads have a prio of MAX_PRIO, for this test 2895 * Note that idle threads have a prio of MAX_PRIO, for this test
2881 * to be always true for them. 2896 * to be always true for them.
2882 */ 2897 */
2883 check_preempt_curr(this_rq, p); 2898 check_preempt_curr(this_rq, p, 0);
2884} 2899}
2885 2900
2886/* 2901/*
@@ -4627,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4627} 4642}
4628EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4643EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4629 4644
4645/**
4646 * complete: - signals a single thread waiting on this completion
4647 * @x: holds the state of this particular completion
4648 *
4649 * This will wake up a single thread waiting on this completion. Threads will be
4650 * awakened in the same order in which they were queued.
4651 *
4652 * See also complete_all(), wait_for_completion() and related routines.
4653 */
4630void complete(struct completion *x) 4654void complete(struct completion *x)
4631{ 4655{
4632 unsigned long flags; 4656 unsigned long flags;
@@ -4638,6 +4662,12 @@ void complete(struct completion *x)
4638} 4662}
4639EXPORT_SYMBOL(complete); 4663EXPORT_SYMBOL(complete);
4640 4664
4665/**
4666 * complete_all: - signals all threads waiting on this completion
4667 * @x: holds the state of this particular completion
4668 *
4669 * This will wake up all threads waiting on this particular completion event.
4670 */
4641void complete_all(struct completion *x) 4671void complete_all(struct completion *x)
4642{ 4672{
4643 unsigned long flags; 4673 unsigned long flags;
@@ -4658,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4658 wait.flags |= WQ_FLAG_EXCLUSIVE; 4688 wait.flags |= WQ_FLAG_EXCLUSIVE;
4659 __add_wait_queue_tail(&x->wait, &wait); 4689 __add_wait_queue_tail(&x->wait, &wait);
4660 do { 4690 do {
4661 if ((state == TASK_INTERRUPTIBLE && 4691 if (signal_pending_state(state, current)) {
4662 signal_pending(current)) ||
4663 (state == TASK_KILLABLE &&
4664 fatal_signal_pending(current))) {
4665 timeout = -ERESTARTSYS; 4692 timeout = -ERESTARTSYS;
4666 break; 4693 break;
4667 } 4694 }
@@ -4689,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state)
4689 return timeout; 4716 return timeout;
4690} 4717}
4691 4718
4719/**
4720 * wait_for_completion: - waits for completion of a task
4721 * @x: holds the state of this particular completion
4722 *
4723 * This waits to be signaled for completion of a specific task. It is NOT
4724 * interruptible and there is no timeout.
4725 *
4726 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
4727 * and interrupt capability. Also see complete().
4728 */
4692void __sched wait_for_completion(struct completion *x) 4729void __sched wait_for_completion(struct completion *x)
4693{ 4730{
4694 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4731 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4695} 4732}
4696EXPORT_SYMBOL(wait_for_completion); 4733EXPORT_SYMBOL(wait_for_completion);
4697 4734
4735/**
4736 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4737 * @x: holds the state of this particular completion
4738 * @timeout: timeout value in jiffies
4739 *
4740 * This waits for either a completion of a specific task to be signaled or for a
4741 * specified timeout to expire. The timeout is in jiffies. It is not
4742 * interruptible.
4743 */
4698unsigned long __sched 4744unsigned long __sched
4699wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4745wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4700{ 4746{
@@ -4702,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4702} 4748}
4703EXPORT_SYMBOL(wait_for_completion_timeout); 4749EXPORT_SYMBOL(wait_for_completion_timeout);
4704 4750
4751/**
4752 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4753 * @x: holds the state of this particular completion
4754 *
4755 * This waits for completion of a specific task to be signaled. It is
4756 * interruptible.
4757 */
4705int __sched wait_for_completion_interruptible(struct completion *x) 4758int __sched wait_for_completion_interruptible(struct completion *x)
4706{ 4759{
4707 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4760 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4711,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
4711} 4764}
4712EXPORT_SYMBOL(wait_for_completion_interruptible); 4765EXPORT_SYMBOL(wait_for_completion_interruptible);
4713 4766
4767/**
4768 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4769 * @x: holds the state of this particular completion
4770 * @timeout: timeout value in jiffies
4771 *
4772 * This waits for either a completion of a specific task to be signaled or for a
4773 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4774 */
4714unsigned long __sched 4775unsigned long __sched
4715wait_for_completion_interruptible_timeout(struct completion *x, 4776wait_for_completion_interruptible_timeout(struct completion *x,
4716 unsigned long timeout) 4777 unsigned long timeout)
@@ -4719,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
4719} 4780}
4720EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4781EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4721 4782
4783/**
4784 * wait_for_completion_killable: - waits for completion of a task (killable)
4785 * @x: holds the state of this particular completion
4786 *
4787 * This waits to be signaled for completion of a specific task. It can be
4788 * interrupted by a kill signal.
4789 */
4722int __sched wait_for_completion_killable(struct completion *x) 4790int __sched wait_for_completion_killable(struct completion *x)
4723{ 4791{
4724 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4792 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
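The kerneldoc blocks added above document behaviour the completion API already had. As a brief hedged illustration (not part of this patch), a caller might pair the producer and consumer sides like the fragment below; the names demo_done, demo_io_finished and demo_io_wait are hypothetical.

	#include <linux/completion.h>

	static DECLARE_COMPLETION(demo_done);		/* hypothetical completion */

	/* producer: wakes exactly one waiter, in queueing (FIFO) order */
	static void demo_io_finished(void)
	{
		complete(&demo_done);
	}

	/* consumer: uninterruptible and unbounded, per wait_for_completion();
	 * the *_interruptible(), *_timeout() and *_killable() variants above
	 * relax one or both of those properties */
	static int demo_io_wait(void)
	{
		wait_for_completion(&demo_done);
		return 0;
	}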
@@ -5121,7 +5189,8 @@ recheck:
5121 * Do not allow realtime tasks into groups that have no runtime 5189 * Do not allow realtime tasks into groups that have no runtime
5122 * assigned. 5190 * assigned.
5123 */ 5191 */
5124 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5192 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5193 task_group(p)->rt_bandwidth.rt_runtime == 0)
5125 return -EPERM; 5194 return -EPERM;
5126#endif 5195#endif
5127 5196
@@ -5957,7 +6026,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5957 set_task_cpu(p, dest_cpu); 6026 set_task_cpu(p, dest_cpu);
5958 if (on_rq) { 6027 if (on_rq) {
5959 activate_task(rq_dest, p, 0); 6028 activate_task(rq_dest, p, 0);
5960 check_preempt_curr(rq_dest, p); 6029 check_preempt_curr(rq_dest, p, 0);
5961 } 6030 }
5962done: 6031done:
5963 ret = 1; 6032 ret = 1;
@@ -7696,24 +7765,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7696 * and partition_sched_domains() will fallback to the single partition 7765 * and partition_sched_domains() will fallback to the single partition
7697 * 'fallback_doms', it also forces the domains to be rebuilt. 7766 * 'fallback_doms', it also forces the domains to be rebuilt.
7698 * 7767 *
7768 * If doms_new==NULL it will be replaced with cpu_online_map.
7769 * ndoms_new==0 is a special case for destroying existing domains.
7770 * It will not create the default domain.
7771 *
7699 * Call with hotplug lock held 7772 * Call with hotplug lock held
7700 */ 7773 */
7701void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7774void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7702 struct sched_domain_attr *dattr_new) 7775 struct sched_domain_attr *dattr_new)
7703{ 7776{
7704 int i, j; 7777 int i, j, n;
7705 7778
7706 mutex_lock(&sched_domains_mutex); 7779 mutex_lock(&sched_domains_mutex);
7707 7780
7708 /* always unregister in case we don't destroy any domains */ 7781 /* always unregister in case we don't destroy any domains */
7709 unregister_sched_domain_sysctl(); 7782 unregister_sched_domain_sysctl();
7710 7783
7711 if (doms_new == NULL) 7784 n = doms_new ? ndoms_new : 0;
7712 ndoms_new = 0;
7713 7785
7714 /* Destroy deleted domains */ 7786 /* Destroy deleted domains */
7715 for (i = 0; i < ndoms_cur; i++) { 7787 for (i = 0; i < ndoms_cur; i++) {
7716 for (j = 0; j < ndoms_new; j++) { 7788 for (j = 0; j < n; j++) {
7717 if (cpus_equal(doms_cur[i], doms_new[j]) 7789 if (cpus_equal(doms_cur[i], doms_new[j])
7718 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7790 && dattrs_equal(dattr_cur, i, dattr_new, j))
7719 goto match1; 7791 goto match1;
@@ -7726,7 +7798,6 @@ match1:
7726 7798
7727 if (doms_new == NULL) { 7799 if (doms_new == NULL) {
7728 ndoms_cur = 0; 7800 ndoms_cur = 0;
7729 ndoms_new = 1;
7730 doms_new = &fallback_doms; 7801 doms_new = &fallback_doms;
7731 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7802 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7732 dattr_new = NULL; 7803 dattr_new = NULL;
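As a short restatement of the calling convention documented above, the three cases callers now distinguish look as follows; ndoms, doms and dattr stand in for a caller's own domain set and are not names introduced by this patch.

	/* destroy every domain and do not build the fallback one
	 * (arch_reinit_sched_domains() below does exactly this first) */
	partition_sched_domains(0, NULL, NULL);

	/* doms_new == NULL: fall back to a single domain built from
	 * cpu_online_map minus cpu_isolated_map (hotplug notifier below) */
	partition_sched_domains(1, NULL, NULL);

	/* install an explicit set of ndoms domains with attributes */
	partition_sched_domains(ndoms, doms, dattr);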
@@ -7763,8 +7834,13 @@ match2:
7763int arch_reinit_sched_domains(void) 7834int arch_reinit_sched_domains(void)
7764{ 7835{
7765 get_online_cpus(); 7836 get_online_cpus();
7837
7838 /* Destroy domains first to force the rebuild */
7839 partition_sched_domains(0, NULL, NULL);
7840
7766 rebuild_sched_domains(); 7841 rebuild_sched_domains();
7767 put_online_cpus(); 7842 put_online_cpus();
7843
7768 return 0; 7844 return 0;
7769} 7845}
7770 7846
@@ -7848,7 +7924,7 @@ static int update_sched_domains(struct notifier_block *nfb,
7848 case CPU_ONLINE_FROZEN: 7924 case CPU_ONLINE_FROZEN:
7849 case CPU_DEAD: 7925 case CPU_DEAD:
7850 case CPU_DEAD_FROZEN: 7926 case CPU_DEAD_FROZEN:
7851 partition_sched_domains(0, NULL, NULL); 7927 partition_sched_domains(1, NULL, NULL);
7852 return NOTIFY_OK; 7928 return NOTIFY_OK;
7853 7929
7854 default: 7930 default:
@@ -8235,20 +8311,25 @@ void __might_sleep(char *file, int line)
8235#ifdef in_atomic 8311#ifdef in_atomic
8236 static unsigned long prev_jiffy; /* ratelimiting */ 8312 static unsigned long prev_jiffy; /* ratelimiting */
8237 8313
8238 if ((in_atomic() || irqs_disabled()) && 8314 if ((!in_atomic() && !irqs_disabled()) ||
8239 system_state == SYSTEM_RUNNING && !oops_in_progress) { 8315 system_state != SYSTEM_RUNNING || oops_in_progress)
8240 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8316 return;
8241 return; 8317 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8242 prev_jiffy = jiffies; 8318 return;
8243 printk(KERN_ERR "BUG: sleeping function called from invalid" 8319 prev_jiffy = jiffies;
8244 " context at %s:%d\n", file, line); 8320
8245 printk("in_atomic():%d, irqs_disabled():%d\n", 8321 printk(KERN_ERR
8246 in_atomic(), irqs_disabled()); 8322 "BUG: sleeping function called from invalid context at %s:%d\n",
8247 debug_show_held_locks(current); 8323 file, line);
8248 if (irqs_disabled()) 8324 printk(KERN_ERR
8249 print_irqtrace_events(current); 8325 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8250 dump_stack(); 8326 in_atomic(), irqs_disabled(),
8251 } 8327 current->pid, current->comm);
8328
8329 debug_show_held_locks(current);
8330 if (irqs_disabled())
8331 print_irqtrace_events(current);
8332 dump_stack();
8252#endif 8333#endif
8253} 8334}
8254EXPORT_SYMBOL(__might_sleep); 8335EXPORT_SYMBOL(__might_sleep);
@@ -8746,73 +8827,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
8746static unsigned long to_ratio(u64 period, u64 runtime) 8827static unsigned long to_ratio(u64 period, u64 runtime)
8747{ 8828{
8748 if (runtime == RUNTIME_INF) 8829 if (runtime == RUNTIME_INF)
8749 return 1ULL << 16; 8830 return 1ULL << 20;
8750 8831
8751 return div64_u64(runtime << 16, period); 8832 return div64_u64(runtime << 20, period);
8752} 8833}
8753 8834
8754#ifdef CONFIG_CGROUP_SCHED 8835/* Must be called with tasklist_lock held */
8755static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8836static inline int tg_has_rt_tasks(struct task_group *tg)
8756{ 8837{
8757 struct task_group *tgi, *parent = tg->parent; 8838 struct task_struct *g, *p;
8758 unsigned long total = 0;
8759 8839
8760 if (!parent) { 8840 do_each_thread(g, p) {
8761 if (global_rt_period() < period) 8841 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8762 return 0; 8842 return 1;
8843 } while_each_thread(g, p);
8763 8844
8764 return to_ratio(period, runtime) < 8845 return 0;
8765 to_ratio(global_rt_period(), global_rt_runtime()); 8846}
8766 }
8767 8847
8768 if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) 8848struct rt_schedulable_data {
8769 return 0; 8849 struct task_group *tg;
8850 u64 rt_period;
8851 u64 rt_runtime;
8852};
8770 8853
8771 rcu_read_lock(); 8854static int tg_schedulable(struct task_group *tg, void *data)
8772 list_for_each_entry_rcu(tgi, &parent->children, siblings) { 8855{
8773 if (tgi == tg) 8856 struct rt_schedulable_data *d = data;
8774 continue; 8857 struct task_group *child;
8858 unsigned long total, sum = 0;
8859 u64 period, runtime;
8860
8861 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8862 runtime = tg->rt_bandwidth.rt_runtime;
8775 8863
8776 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8864 if (tg == d->tg) {
8777 tgi->rt_bandwidth.rt_runtime); 8865 period = d->rt_period;
8866 runtime = d->rt_runtime;
8778 } 8867 }
8779 rcu_read_unlock();
8780 8868
8781 return total + to_ratio(period, runtime) <= 8869 /*
8782 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8870 * Cannot have more runtime than the period.
8783 parent->rt_bandwidth.rt_runtime); 8871 */
8784} 8872 if (runtime > period && runtime != RUNTIME_INF)
8785#elif defined CONFIG_USER_SCHED 8873 return -EINVAL;
8786static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8787{
8788 struct task_group *tgi;
8789 unsigned long total = 0;
8790 unsigned long global_ratio =
8791 to_ratio(global_rt_period(), global_rt_runtime());
8792 8874
8793 rcu_read_lock(); 8875 /*
8794 list_for_each_entry_rcu(tgi, &task_groups, list) { 8876 * Ensure we don't starve existing RT tasks.
8795 if (tgi == tg) 8877 */
8796 continue; 8878 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8879 return -EBUSY;
8880
8881 total = to_ratio(period, runtime);
8797 8882
8798 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8883 /*
8799 tgi->rt_bandwidth.rt_runtime); 8884 * Nobody can have more than the global setting allows.
8885 */
8886 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8887 return -EINVAL;
8888
8889 /*
8890 * The sum of our children's runtime should not exceed our own.
8891 */
8892 list_for_each_entry_rcu(child, &tg->children, siblings) {
8893 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8894 runtime = child->rt_bandwidth.rt_runtime;
8895
8896 if (child == d->tg) {
8897 period = d->rt_period;
8898 runtime = d->rt_runtime;
8899 }
8900
8901 sum += to_ratio(period, runtime);
8800 } 8902 }
8801 rcu_read_unlock();
8802 8903
8803 return total + to_ratio(period, runtime) < global_ratio; 8904 if (sum > total)
8905 return -EINVAL;
8906
8907 return 0;
8804} 8908}
8805#endif
8806 8909
8807/* Must be called with tasklist_lock held */ 8910static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8808static inline int tg_has_rt_tasks(struct task_group *tg)
8809{ 8911{
8810 struct task_struct *g, *p; 8912 struct rt_schedulable_data data = {
8811 do_each_thread(g, p) { 8913 .tg = tg,
8812 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8914 .rt_period = period,
8813 return 1; 8915 .rt_runtime = runtime,
8814 } while_each_thread(g, p); 8916 };
8815 return 0; 8917
8918 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8816} 8919}
8817 8920
8818static int tg_set_bandwidth(struct task_group *tg, 8921static int tg_set_bandwidth(struct task_group *tg,
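The rework above folds the two config-specific __rt_schedulable() variants into one walk of the task-group tree: tg_schedulable() rejects runtime larger than the period, refuses to clear the runtime of a group that still contains RT tasks, caps every group at the global ratio, and requires the children's ratios to sum to no more than their parent's. The ratio itself now carries 20 fractional bits instead of 16. A worked example with made-up values (1ms of runtime per 10ms period), only to show the scale of the numbers:

	u64 period  = 10 * NSEC_PER_MSEC;	/* 10 ms, assumed example value */
	u64 runtime =  1 * NSEC_PER_MSEC;	/*  1 ms, assumed example value */

	/* to_ratio(): 1 << 20 now represents full utilization of the period */
	unsigned long ratio = div64_u64(runtime << 20, period);	/* 104857, ~10% */

A sibling configured for 2ms per 10ms contributes roughly twice that, and tg_schedulable() returns -EINVAL as soon as the children's sum exceeds the parent's own ratio.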
@@ -8822,14 +8925,9 @@ static int tg_set_bandwidth(struct task_group *tg,
8822 8925
8823 mutex_lock(&rt_constraints_mutex); 8926 mutex_lock(&rt_constraints_mutex);
8824 read_lock(&tasklist_lock); 8927 read_lock(&tasklist_lock);
8825 if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { 8928 err = __rt_schedulable(tg, rt_period, rt_runtime);
8826 err = -EBUSY; 8929 if (err)
8827 goto unlock;
8828 }
8829 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
8830 err = -EINVAL;
8831 goto unlock; 8930 goto unlock;
8832 }
8833 8931
8834 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8932 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8835 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8933 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8898,16 +8996,25 @@ long sched_group_rt_period(struct task_group *tg)
8898 8996
8899static int sched_rt_global_constraints(void) 8997static int sched_rt_global_constraints(void)
8900{ 8998{
8901 struct task_group *tg = &root_task_group; 8999 u64 runtime, period;
8902 u64 rt_runtime, rt_period;
8903 int ret = 0; 9000 int ret = 0;
8904 9001
8905 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 9002 if (sysctl_sched_rt_period <= 0)
8906 rt_runtime = tg->rt_bandwidth.rt_runtime; 9003 return -EINVAL;
9004
9005 runtime = global_rt_runtime();
9006 period = global_rt_period();
9007
9008 /*
9009 * Sanity check on the sysctl variables.
9010 */
9011 if (runtime > period && runtime != RUNTIME_INF)
9012 return -EINVAL;
8907 9013
8908 mutex_lock(&rt_constraints_mutex); 9014 mutex_lock(&rt_constraints_mutex);
8909 if (!__rt_schedulable(tg, rt_period, rt_runtime)) 9015 read_lock(&tasklist_lock);
8910 ret = -EINVAL; 9016 ret = __rt_schedulable(NULL, 0, 0);
9017 read_unlock(&tasklist_lock);
8911 mutex_unlock(&rt_constraints_mutex); 9018 mutex_unlock(&rt_constraints_mutex);
8912 9019
8913 return ret; 9020 return ret;
@@ -8918,6 +9025,9 @@ static int sched_rt_global_constraints(void)
8918 unsigned long flags; 9025 unsigned long flags;
8919 int i; 9026 int i;
8920 9027
9028 if (sysctl_sched_rt_period <= 0)
9029 return -EINVAL;
9030
8921 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 9031 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8922 for_each_possible_cpu(i) { 9032 for_each_possible_cpu(i) {
8923 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 9033 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -8978,7 +9088,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8978 9088
8979 if (!cgrp->parent) { 9089 if (!cgrp->parent) {
8980 /* This is early initialization for the top cgroup */ 9090 /* This is early initialization for the top cgroup */
8981 init_task_group.css.cgroup = cgrp;
8982 return &init_task_group.css; 9091 return &init_task_group.css;
8983 } 9092 }
8984 9093
@@ -8987,9 +9096,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8987 if (IS_ERR(tg)) 9096 if (IS_ERR(tg))
8988 return ERR_PTR(-ENOMEM); 9097 return ERR_PTR(-ENOMEM);
8989 9098
8990 /* Bind the cgroup to task_group object we just created */
8991 tg->css.cgroup = cgrp;
8992
8993 return &tg->css; 9099 return &tg->css;
8994} 9100}
8995 9101
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fb8994c6d4bb..fcbe850a5a90 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -409,64 +409,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
409} 409}
410 410
411/* 411/*
412 * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
413 * that it favours >=0 over <0.
414 *
415 * -20 |
416 * |
417 * 0 --------+-------
418 * .'
419 * 19 .'
420 *
421 */
422static unsigned long
423calc_delta_asym(unsigned long delta, struct sched_entity *se)
424{
425 struct load_weight lw = {
426 .weight = NICE_0_LOAD,
427 .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
428 };
429
430 for_each_sched_entity(se) {
431 struct load_weight *se_lw = &se->load;
432 unsigned long rw = cfs_rq_of(se)->load.weight;
433
434#ifdef CONFIG_FAIR_SCHED_GROUP
435 struct cfs_rq *cfs_rq = se->my_q;
436 struct task_group *tg = NULL
437
438 if (cfs_rq)
439 tg = cfs_rq->tg;
440
441 if (tg && tg->shares < NICE_0_LOAD) {
442 /*
443 * scale shares to what it would have been had
444 * tg->weight been NICE_0_LOAD:
445 *
446 * weight = 1024 * shares / tg->weight
447 */
448 lw.weight *= se->load.weight;
449 lw.weight /= tg->shares;
450
451 lw.inv_weight = 0;
452
453 se_lw = &lw;
454 rw += lw.weight - se->load.weight;
455 } else
456#endif
457
458 if (se->load.weight < NICE_0_LOAD) {
459 se_lw = &lw;
460 rw += NICE_0_LOAD - se->load.weight;
461 }
462
463 delta = calc_delta_mine(delta, rw, se_lw);
464 }
465
466 return delta;
467}
468
469/*
470 * Update the current task's runtime statistics. Skip current tasks that 412 * Update the current task's runtime statistics. Skip current tasks that
471 * are not in our scheduling class. 413 * are not in our scheduling class.
472 */ 414 */
@@ -586,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
586 update_load_add(&cfs_rq->load, se->load.weight); 528 update_load_add(&cfs_rq->load, se->load.weight);
587 if (!parent_entity(se)) 529 if (!parent_entity(se))
588 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 530 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
589 if (entity_is_task(se)) 531 if (entity_is_task(se)) {
590 add_cfs_task_weight(cfs_rq, se->load.weight); 532 add_cfs_task_weight(cfs_rq, se->load.weight);
533 list_add(&se->group_node, &cfs_rq->tasks);
534 }
591 cfs_rq->nr_running++; 535 cfs_rq->nr_running++;
592 se->on_rq = 1; 536 se->on_rq = 1;
593 list_add(&se->group_node, &cfs_rq->tasks);
594} 537}
595 538
596static void 539static void
@@ -599,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
599 update_load_sub(&cfs_rq->load, se->load.weight); 542 update_load_sub(&cfs_rq->load, se->load.weight);
600 if (!parent_entity(se)) 543 if (!parent_entity(se))
601 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 544 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
602 if (entity_is_task(se)) 545 if (entity_is_task(se)) {
603 add_cfs_task_weight(cfs_rq, -se->load.weight); 546 add_cfs_task_weight(cfs_rq, -se->load.weight);
547 list_del_init(&se->group_node);
548 }
604 cfs_rq->nr_running--; 549 cfs_rq->nr_running--;
605 se->on_rq = 0; 550 se->on_rq = 0;
606 list_del_init(&se->group_node);
607} 551}
608 552
609static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 553static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1085,7 +1029,6 @@ static long effective_load(struct task_group *tg, int cpu,
1085 long wl, long wg) 1029 long wl, long wg)
1086{ 1030{
1087 struct sched_entity *se = tg->se[cpu]; 1031 struct sched_entity *se = tg->se[cpu];
1088 long more_w;
1089 1032
1090 if (!tg->parent) 1033 if (!tg->parent)
1091 return wl; 1034 return wl;
@@ -1097,18 +1040,17 @@ static long effective_load(struct task_group *tg, int cpu,
1097 if (!wl && sched_feat(ASYM_EFF_LOAD)) 1040 if (!wl && sched_feat(ASYM_EFF_LOAD))
1098 return wl; 1041 return wl;
1099 1042
1100 /*
1101 * Instead of using this increment, also add the difference
1102 * between when the shares were last updated and now.
1103 */
1104 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1105 wl += more_w;
1106 wg += more_w;
1107
1108 for_each_sched_entity(se) { 1043 for_each_sched_entity(se) {
1109#define D(n) (likely(n) ? (n) : 1)
1110
1111 long S, rw, s, a, b; 1044 long S, rw, s, a, b;
1045 long more_w;
1046
1047 /*
1048 * Instead of using this increment, also add the difference
1049 * between when the shares were last updated and now.
1050 */
1051 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1052 wl += more_w;
1053 wg += more_w;
1112 1054
1113 S = se->my_q->tg->shares; 1055 S = se->my_q->tg->shares;
1114 s = se->my_q->shares; 1056 s = se->my_q->shares;
@@ -1117,7 +1059,11 @@ static long effective_load(struct task_group *tg, int cpu,
1117 a = S*(rw + wl); 1059 a = S*(rw + wl);
1118 b = S*rw + s*wg; 1060 b = S*rw + s*wg;
1119 1061
1120 wl = s*(a-b)/D(b); 1062 wl = s*(a-b);
1063
1064 if (likely(b))
1065 wl /= b;
1066
1121 /* 1067 /*
1122 * Assume the group is already running and will 1068 * Assume the group is already running and will
1123 * thus already be accounted for in the weight. 1069 * thus already be accounted for in the weight.
@@ -1126,7 +1072,6 @@ static long effective_load(struct task_group *tg, int cpu,
1126 * alter the group weight. 1072 * alter the group weight.
1127 */ 1073 */
1128 wg = 0; 1074 wg = 0;
1129#undef D
1130 } 1075 }
1131 1076
1132 return wl; 1077 return wl;
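The D() macro that used to round a zero denominator up to 1 is gone; the open-coded form above yields the same value and simply skips the division when b is zero. Made-up numbers, only to show the two forms agree:

	long S = 1024, rw = 2048, s = 512, wl = 256, wg = 0;
	long a = S * (rw + wl);		/* 2359296 */
	long b = S * rw + s * wg;	/* 2097152 */

	wl = s * (a - b);		/* 134217728 */
	if (likely(b))			/* b != 0 here, so the division happens */
		wl /= b;		/* 64 - identical to s*(a-b)/D(b) */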
@@ -1143,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1143#endif 1088#endif
1144 1089
1145static int 1090static int
1146wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, 1091wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1147 struct task_struct *p, int prev_cpu, int this_cpu, int sync, 1092 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1148 int idx, unsigned long load, unsigned long this_load, 1093 int idx, unsigned long load, unsigned long this_load,
1149 unsigned int imbalance) 1094 unsigned int imbalance)
@@ -1191,8 +1136,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1191 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1136 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1192 tl_per_task = cpu_avg_load_per_task(this_cpu); 1137 tl_per_task = cpu_avg_load_per_task(this_cpu);
1193 1138
1194 if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || 1139 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
1195 balanced) { 1140 tl_per_task)) {
1196 /* 1141 /*
1197 * This domain has SD_WAKE_AFFINE and 1142 * This domain has SD_WAKE_AFFINE and
1198 * p is cache cold in this domain, and 1143 * p is cache cold in this domain, and
@@ -1211,16 +1156,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1211 struct sched_domain *sd, *this_sd = NULL; 1156 struct sched_domain *sd, *this_sd = NULL;
1212 int prev_cpu, this_cpu, new_cpu; 1157 int prev_cpu, this_cpu, new_cpu;
1213 unsigned long load, this_load; 1158 unsigned long load, this_load;
1214 struct rq *rq, *this_rq; 1159 struct rq *this_rq;
1215 unsigned int imbalance; 1160 unsigned int imbalance;
1216 int idx; 1161 int idx;
1217 1162
1218 prev_cpu = task_cpu(p); 1163 prev_cpu = task_cpu(p);
1219 rq = task_rq(p);
1220 this_cpu = smp_processor_id(); 1164 this_cpu = smp_processor_id();
1221 this_rq = cpu_rq(this_cpu); 1165 this_rq = cpu_rq(this_cpu);
1222 new_cpu = prev_cpu; 1166 new_cpu = prev_cpu;
1223 1167
1168 if (prev_cpu == this_cpu)
1169 goto out;
1224 /* 1170 /*
1225 * 'this_sd' is the first domain that both 1171 * 'this_sd' is the first domain that both
1226 * this_cpu and prev_cpu are present in: 1172 * this_cpu and prev_cpu are present in:
@@ -1248,13 +1194,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1248 load = source_load(prev_cpu, idx); 1194 load = source_load(prev_cpu, idx);
1249 this_load = target_load(this_cpu, idx); 1195 this_load = target_load(this_cpu, idx);
1250 1196
1251 if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1197 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
1252 load, this_load, imbalance)) 1198 load, this_load, imbalance))
1253 return this_cpu; 1199 return this_cpu;
1254 1200
1255 if (prev_cpu == this_cpu)
1256 goto out;
1257
1258 /* 1201 /*
1259 * Start passive balancing when half the imbalance_pct 1202 * Start passive balancing when half the imbalance_pct
1260 * limit is reached. 1203 * limit is reached.
@@ -1281,62 +1224,20 @@ static unsigned long wakeup_gran(struct sched_entity *se)
1281 * + nice tasks. 1224 * + nice tasks.
1282 */ 1225 */
1283 if (sched_feat(ASYM_GRAN)) 1226 if (sched_feat(ASYM_GRAN))
1284 gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); 1227 gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
1285 else
1286 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
1287 1228
1288 return gran; 1229 return gran;
1289} 1230}
1290 1231
1291/* 1232/*
1292 * Should 'se' preempt 'curr'.
1293 *
1294 * |s1
1295 * |s2
1296 * |s3
1297 * g
1298 * |<--->|c
1299 *
1300 * w(c, s1) = -1
1301 * w(c, s2) = 0
1302 * w(c, s3) = 1
1303 *
1304 */
1305static int
1306wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1307{
1308 s64 gran, vdiff = curr->vruntime - se->vruntime;
1309
1310 if (vdiff < 0)
1311 return -1;
1312
1313 gran = wakeup_gran(curr);
1314 if (vdiff > gran)
1315 return 1;
1316
1317 return 0;
1318}
1319
1320/* return depth at which a sched entity is present in the hierarchy */
1321static inline int depth_se(struct sched_entity *se)
1322{
1323 int depth = 0;
1324
1325 for_each_sched_entity(se)
1326 depth++;
1327
1328 return depth;
1329}
1330
1331/*
1332 * Preempt the current task with a newly woken task if needed: 1233 * Preempt the current task with a newly woken task if needed:
1333 */ 1234 */
1334static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) 1235static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1335{ 1236{
1336 struct task_struct *curr = rq->curr; 1237 struct task_struct *curr = rq->curr;
1337 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1238 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1338 struct sched_entity *se = &curr->se, *pse = &p->se; 1239 struct sched_entity *se = &curr->se, *pse = &p->se;
1339 int se_depth, pse_depth; 1240 s64 delta_exec;
1340 1241
1341 if (unlikely(rt_prio(p->prio))) { 1242 if (unlikely(rt_prio(p->prio))) {
1342 update_rq_clock(rq); 1243 update_rq_clock(rq);
@@ -1351,6 +1252,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1351 cfs_rq_of(pse)->next = pse; 1252 cfs_rq_of(pse)->next = pse;
1352 1253
1353 /* 1254 /*
1255 * We can come here with TIF_NEED_RESCHED already set from new task
1256 * wake up path.
1257 */
1258 if (test_tsk_need_resched(curr))
1259 return;
1260
1261 /*
1354 * Batch tasks do not preempt (their preemption is driven by 1262 * Batch tasks do not preempt (their preemption is driven by
1355 * the tick): 1263 * the tick):
1356 */ 1264 */
@@ -1360,33 +1268,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1360 if (!sched_feat(WAKEUP_PREEMPT)) 1268 if (!sched_feat(WAKEUP_PREEMPT))
1361 return; 1269 return;
1362 1270
1363 /* 1271 if (sched_feat(WAKEUP_OVERLAP) && sync &&
1364 * preemption test can be made between sibling entities who are in the 1272 se->avg_overlap < sysctl_sched_migration_cost &&
1365 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of 1273 pse->avg_overlap < sysctl_sched_migration_cost) {
1366 * both tasks until we find their ancestors who are siblings of common 1274 resched_task(curr);
1367 * parent. 1275 return;
1368 */
1369
1370 /* First walk up until both entities are at same depth */
1371 se_depth = depth_se(se);
1372 pse_depth = depth_se(pse);
1373
1374 while (se_depth > pse_depth) {
1375 se_depth--;
1376 se = parent_entity(se);
1377 }
1378
1379 while (pse_depth > se_depth) {
1380 pse_depth--;
1381 pse = parent_entity(pse);
1382 }
1383
1384 while (!is_same_group(se, pse)) {
1385 se = parent_entity(se);
1386 pse = parent_entity(pse);
1387 } 1276 }
1388 1277
1389 if (wakeup_preempt_entity(se, pse) == 1) 1278 delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1279 if (delta_exec > wakeup_gran(pse))
1390 resched_task(curr); 1280 resched_task(curr);
1391} 1281}
1392 1282
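Condensed restatement of the new wakeup-preemption decision, only to make the two branches explicit; the real logic is in check_preempt_wakeup() above and demo_should_preempt is a made-up name. Per the sched_features.h hunk below, WAKEUP_OVERLAP defaults to off; on kernels of this vintage it can usually be toggled through the sched_features debugfs file when CONFIG_SCHED_DEBUG is enabled.

	static int demo_should_preempt(struct sched_entity *se,		/* current */
				       struct sched_entity *pse,	/* woken   */
				       int sync)
	{
		/* synchronous wakeup between two tasks with short overlap */
		if (sched_feat(WAKEUP_OVERLAP) && sync &&
		    se->avg_overlap < sysctl_sched_migration_cost &&
		    pse->avg_overlap < sysctl_sched_migration_cost)
			return 1;

		/* otherwise preempt only once current has run longer than
		 * the woken task's wakeup granularity */
		return se->sum_exec_runtime - se->prev_sum_exec_runtime >
		       wakeup_gran(pse);
	}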
@@ -1445,19 +1335,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1445 if (next == &cfs_rq->tasks) 1335 if (next == &cfs_rq->tasks)
1446 return NULL; 1336 return NULL;
1447 1337
1448 /* Skip over entities that are not tasks */ 1338 se = list_entry(next, struct sched_entity, group_node);
1449 do { 1339 p = task_of(se);
1450 se = list_entry(next, struct sched_entity, group_node); 1340 cfs_rq->balance_iterator = next->next;
1451 next = next->next;
1452 } while (next != &cfs_rq->tasks && !entity_is_task(se));
1453
1454 if (next == &cfs_rq->tasks)
1455 return NULL;
1456
1457 cfs_rq->balance_iterator = next;
1458
1459 if (entity_is_task(se))
1460 p = task_of(se);
1461 1341
1462 return p; 1342 return p;
1463} 1343}
@@ -1507,7 +1387,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1507 rcu_read_lock(); 1387 rcu_read_lock();
1508 update_h_load(busiest_cpu); 1388 update_h_load(busiest_cpu);
1509 1389
1510 list_for_each_entry(tg, &task_groups, list) { 1390 list_for_each_entry_rcu(tg, &task_groups, list) {
1511 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; 1391 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
1512 unsigned long busiest_h_load = busiest_cfs_rq->h_load; 1392 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
1513 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 1393 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
@@ -1620,10 +1500,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1620 * 'current' within the tree based on its new key value. 1500 * 'current' within the tree based on its new key value.
1621 */ 1501 */
1622 swap(curr->vruntime, se->vruntime); 1502 swap(curr->vruntime, se->vruntime);
1503 resched_task(rq->curr);
1623 } 1504 }
1624 1505
1625 enqueue_task_fair(rq, p, 0); 1506 enqueue_task_fair(rq, p, 0);
1626 resched_task(rq->curr);
1627} 1507}
1628 1508
1629/* 1509/*
@@ -1642,7 +1522,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
1642 if (p->prio > oldprio) 1522 if (p->prio > oldprio)
1643 resched_task(rq->curr); 1523 resched_task(rq->curr);
1644 } else 1524 } else
1645 check_preempt_curr(rq, p); 1525 check_preempt_curr(rq, p, 0);
1646} 1526}
1647 1527
1648/* 1528/*
@@ -1659,7 +1539,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
1659 if (running) 1539 if (running)
1660 resched_task(rq->curr); 1540 resched_task(rq->curr);
1661 else 1541 else
1662 check_preempt_curr(rq, p); 1542 check_preempt_curr(rq, p, 0);
1663} 1543}
1664 1544
1665/* Account for a task changing its policy or group. 1545/* Account for a task changing its policy or group.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 9353ca78154e..7c9e8f4a049f 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1)
11SCHED_FEAT(LB_BIAS, 1) 11SCHED_FEAT(LB_BIAS, 1)
12SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 12SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1) 13SCHED_FEAT(ASYM_EFF_LOAD, 1)
14SCHED_FEAT(WAKEUP_OVERLAP, 0)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3a4f92dbbe66..dec4ccabe2f5 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
76 if (running) 76 if (running)
77 resched_task(rq->curr); 77 resched_task(rq->curr);
78 else 78 else
79 check_preempt_curr(rq, p); 79 check_preempt_curr(rq, p, 0);
80} 80}
81 81
82static void prio_changed_idle(struct rq *rq, struct task_struct *p, 82static void prio_changed_idle(struct rq *rq, struct task_struct *p,
@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
93 if (p->prio > oldprio) 93 if (p->prio > oldprio)
94 resched_task(rq->curr); 94 resched_task(rq->curr);
95 } else 95 } else
96 check_preempt_curr(rq, p); 96 check_preempt_curr(rq, p, 0);
97} 97}
98 98
99/* 99/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 552310798dad..cdf5740ab03e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
102 102
103static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 103static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
104{ 104{
105 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
105 struct sched_rt_entity *rt_se = rt_rq->rt_se; 106 struct sched_rt_entity *rt_se = rt_rq->rt_se;
106 107
107 if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { 108 if (rt_rq->rt_nr_running) {
108 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 109 if (rt_se && !on_rt_rq(rt_se))
109 110 enqueue_rt_entity(rt_se);
110 enqueue_rt_entity(rt_se);
111 if (rt_rq->highest_prio < curr->prio) 111 if (rt_rq->highest_prio < curr->prio)
112 resched_task(curr); 112 resched_task(curr);
113 } 113 }
@@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
231#endif /* CONFIG_RT_GROUP_SCHED */ 231#endif /* CONFIG_RT_GROUP_SCHED */
232 232
233#ifdef CONFIG_SMP 233#ifdef CONFIG_SMP
234/*
235 * We ran out of runtime, see if we can borrow some from our neighbours.
236 */
234static int do_balance_runtime(struct rt_rq *rt_rq) 237static int do_balance_runtime(struct rt_rq *rt_rq)
235{ 238{
236 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 239 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
250 continue; 253 continue;
251 254
252 spin_lock(&iter->rt_runtime_lock); 255 spin_lock(&iter->rt_runtime_lock);
256 /*
257 * Either all rqs have inf runtime and there's nothing to steal
258 * or __disable_runtime() below sets a specific rq to inf to
 259	 * indicate it's been disabled and disallow stealing.
260 */
253 if (iter->rt_runtime == RUNTIME_INF) 261 if (iter->rt_runtime == RUNTIME_INF)
254 goto next; 262 goto next;
255 263
264 /*
265 * From runqueues with spare time, take 1/n part of their
266 * spare time, but no more than our period.
267 */
256 diff = iter->rt_runtime - iter->rt_time; 268 diff = iter->rt_runtime - iter->rt_time;
257 if (diff > 0) { 269 if (diff > 0) {
258 diff = div_u64((u64)diff, weight); 270 diff = div_u64((u64)diff, weight);
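Made-up numbers for the 1/n borrow described in the comments above, assuming a root domain spanning four runqueues and a neighbour with 3ms of spare runtime; the caller additionally clamps the result so rt_runtime never exceeds rt_period.

	s64 spare  = 3 * NSEC_PER_MSEC;		/* iter->rt_runtime - iter->rt_time */
	int weight = 4;				/* cpus_weight(rd->span), assumed */
	s64 take   = div_u64((u64)spare, weight);	/* 750us from this neighbour */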
@@ -274,6 +286,9 @@ next:
274 return more; 286 return more;
275} 287}
276 288
289/*
 290	 * Ensure this RQ takes back all the runtime it lent to its neighbours.
291 */
277static void __disable_runtime(struct rq *rq) 292static void __disable_runtime(struct rq *rq)
278{ 293{
279 struct root_domain *rd = rq->rd; 294 struct root_domain *rd = rq->rd;
@@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq)
289 304
290 spin_lock(&rt_b->rt_runtime_lock); 305 spin_lock(&rt_b->rt_runtime_lock);
291 spin_lock(&rt_rq->rt_runtime_lock); 306 spin_lock(&rt_rq->rt_runtime_lock);
307 /*
308 * Either we're all inf and nobody needs to borrow, or we're
309 * already disabled and thus have nothing to do, or we have
310 * exactly the right amount of runtime to take out.
311 */
292 if (rt_rq->rt_runtime == RUNTIME_INF || 312 if (rt_rq->rt_runtime == RUNTIME_INF ||
293 rt_rq->rt_runtime == rt_b->rt_runtime) 313 rt_rq->rt_runtime == rt_b->rt_runtime)
294 goto balanced; 314 goto balanced;
295 spin_unlock(&rt_rq->rt_runtime_lock); 315 spin_unlock(&rt_rq->rt_runtime_lock);
296 316
317 /*
318 * Calculate the difference between what we started out with
 319	 * and what we currently have; that's the amount of runtime
 320	 * we lent and now have to reclaim.
321 */
297 want = rt_b->rt_runtime - rt_rq->rt_runtime; 322 want = rt_b->rt_runtime - rt_rq->rt_runtime;
298 323
324 /*
325 * Greedy reclaim, take back as much as we can.
326 */
299 for_each_cpu_mask(i, rd->span) { 327 for_each_cpu_mask(i, rd->span) {
300 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
301 s64 diff; 329 s64 diff;
302 330
331 /*
332 * Can't reclaim from ourselves or disabled runqueues.
333 */
303 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) 334 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
304 continue; 335 continue;
305 336
@@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq)
319 } 350 }
320 351
321 spin_lock(&rt_rq->rt_runtime_lock); 352 spin_lock(&rt_rq->rt_runtime_lock);
353 /*
354 * We cannot be left wanting - that would mean some runtime
355 * leaked out of the system.
356 */
322 BUG_ON(want); 357 BUG_ON(want);
323balanced: 358balanced:
359 /*
360 * Disable all the borrow logic by pretending we have inf
361 * runtime - in which case borrowing doesn't make sense.
362 */
324 rt_rq->rt_runtime = RUNTIME_INF; 363 rt_rq->rt_runtime = RUNTIME_INF;
325 spin_unlock(&rt_rq->rt_runtime_lock); 364 spin_unlock(&rt_rq->rt_runtime_lock);
326 spin_unlock(&rt_b->rt_runtime_lock); 365 spin_unlock(&rt_b->rt_runtime_lock);
@@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq)
343 if (unlikely(!scheduler_running)) 382 if (unlikely(!scheduler_running))
344 return; 383 return;
345 384
385 /*
386 * Reset each runqueue's bandwidth settings
387 */
346 for_each_leaf_rt_rq(rt_rq, rq) { 388 for_each_leaf_rt_rq(rt_rq, rq) {
347 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 389 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
348 390
@@ -350,6 +392,7 @@ static void __enable_runtime(struct rq *rq)
350 spin_lock(&rt_rq->rt_runtime_lock); 392 spin_lock(&rt_rq->rt_runtime_lock);
351 rt_rq->rt_runtime = rt_b->rt_runtime; 393 rt_rq->rt_runtime = rt_b->rt_runtime;
352 rt_rq->rt_time = 0; 394 rt_rq->rt_time = 0;
395 rt_rq->rt_throttled = 0;
353 spin_unlock(&rt_rq->rt_runtime_lock); 396 spin_unlock(&rt_rq->rt_runtime_lock);
354 spin_unlock(&rt_b->rt_runtime_lock); 397 spin_unlock(&rt_b->rt_runtime_lock);
355 } 398 }
@@ -388,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
388 int i, idle = 1; 431 int i, idle = 1;
389 cpumask_t span; 432 cpumask_t span;
390 433
391 if (rt_b->rt_runtime == RUNTIME_INF) 434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
392 return 1; 435 return 1;
393 436
394 span = sched_rt_period_mask(); 437 span = sched_rt_period_mask();
@@ -486,6 +529,9 @@ static void update_curr_rt(struct rq *rq)
486 curr->se.exec_start = rq->clock; 529 curr->se.exec_start = rq->clock;
487 cpuacct_charge(curr, delta_exec); 530 cpuacct_charge(curr, delta_exec);
488 531
532 if (!rt_bandwidth_enabled())
533 return;
534
489 for_each_sched_rt_entity(rt_se) { 535 for_each_sched_rt_entity(rt_se) {
490 rt_rq = rt_rq_of_se(rt_se); 536 rt_rq = rt_rq_of_se(rt_se);
491 537
@@ -783,7 +829,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
783/* 829/*
784 * Preempt the current task with a newly woken task if needed: 830 * Preempt the current task with a newly woken task if needed:
785 */ 831 */
786static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) 832static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
787{ 833{
788 if (p->prio < rq->curr->prio) { 834 if (p->prio < rq->curr->prio) {
789 resched_task(rq->curr); 835 resched_task(rq->curr);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1876b526c778..f8d968063cea 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,6 +72,16 @@ void clockevents_set_mode(struct clock_event_device *dev,
72} 72}
73 73
74/** 74/**
75 * clockevents_shutdown - shutdown the device and clear next_event
76 * @dev: device to shutdown
77 */
78void clockevents_shutdown(struct clock_event_device *dev)
79{
80 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
81 dev->next_event.tv64 = KTIME_MAX;
82}
83
84/**
75 * clockevents_program_event - Reprogram the clock event device. 85 * clockevents_program_event - Reprogram the clock event device.
76 * @expires: absolute expiry time (monotonic clock) 86 * @expires: absolute expiry time (monotonic clock)
77 * 87 *
@@ -206,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
206 216
207 if (new) { 217 if (new) {
208 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); 218 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
209 clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); 219 clockevents_shutdown(new);
210 } 220 }
211 local_irq_restore(flags); 221 local_irq_restore(flags);
212} 222}
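The helper exists so that every shutdown path also clears the pending expiry, which the open-coded call sites never did. At a call site the change is simply:

	/* before: mode change only, dev->next_event left stale */
	clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);

	/* after: the helper additionally sets dev->next_event to KTIME_MAX */
	clockevents_shutdown(dev);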
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 2f5a38294bf9..cb01cd8f919b 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -235,9 +235,9 @@ static void tick_do_broadcast_on_off(void *why)
235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
236 if (!cpu_isset(cpu, tick_broadcast_mask)) { 236 if (!cpu_isset(cpu, tick_broadcast_mask)) {
237 cpu_set(cpu, tick_broadcast_mask); 237 cpu_set(cpu, tick_broadcast_mask);
238 if (td->mode == TICKDEV_MODE_PERIODIC) 238 if (tick_broadcast_device.mode ==
239 clockevents_set_mode(dev, 239 TICKDEV_MODE_PERIODIC)
240 CLOCK_EVT_MODE_SHUTDOWN); 240 clockevents_shutdown(dev);
241 } 241 }
242 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) 242 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
243 tick_broadcast_force = 1; 243 tick_broadcast_force = 1;
@@ -246,7 +246,8 @@ static void tick_do_broadcast_on_off(void *why)
246 if (!tick_broadcast_force && 246 if (!tick_broadcast_force &&
247 cpu_isset(cpu, tick_broadcast_mask)) { 247 cpu_isset(cpu, tick_broadcast_mask)) {
248 cpu_clear(cpu, tick_broadcast_mask); 248 cpu_clear(cpu, tick_broadcast_mask);
249 if (td->mode == TICKDEV_MODE_PERIODIC) 249 if (tick_broadcast_device.mode ==
250 TICKDEV_MODE_PERIODIC)
250 tick_setup_periodic(dev, 0); 251 tick_setup_periodic(dev, 0);
251 } 252 }
252 break; 253 break;
@@ -254,7 +255,7 @@ static void tick_do_broadcast_on_off(void *why)
254 255
255 if (cpus_empty(tick_broadcast_mask)) { 256 if (cpus_empty(tick_broadcast_mask)) {
256 if (!bc_stopped) 257 if (!bc_stopped)
257 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 258 clockevents_shutdown(bc);
258 } else if (bc_stopped) { 259 } else if (bc_stopped) {
259 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 260 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
260 tick_broadcast_start_periodic(bc); 261 tick_broadcast_start_periodic(bc);
@@ -306,7 +307,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
306 307
307 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
308 if (bc && cpus_empty(tick_broadcast_mask)) 309 if (bc && cpus_empty(tick_broadcast_mask))
309 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 310 clockevents_shutdown(bc);
310 } 311 }
311 312
312 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 313 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -321,7 +322,7 @@ void tick_suspend_broadcast(void)
321 322
322 bc = tick_broadcast_device.evtdev; 323 bc = tick_broadcast_device.evtdev;
323 if (bc) 324 if (bc)
324 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 325 clockevents_shutdown(bc);
325 326
326 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 327 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
327} 328}
@@ -576,4 +577,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
576 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 577 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
577} 578}
578 579
580/*
581 * Check, whether the broadcast device is in one shot mode
582 */
583int tick_broadcast_oneshot_active(void)
584{
585 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
586}
587
579#endif 588#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index c4777193d567..df12434b43ca 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
33 */ 33 */
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36int tick_do_timer_cpu __read_mostly = -1; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37DEFINE_SPINLOCK(tick_device_lock); 37DEFINE_SPINLOCK(tick_device_lock);
38 38
39/* 39/*
@@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
109 if (!tick_device_is_functional(dev)) 109 if (!tick_device_is_functional(dev))
110 return; 110 return;
111 111
112 if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { 112 if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
113 !tick_broadcast_oneshot_active()) {
113 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); 114 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
114 } else { 115 } else {
115 unsigned long seq; 116 unsigned long seq;
@@ -148,7 +149,7 @@ static void tick_setup_device(struct tick_device *td,
148 * If no cpu took the do_timer update, assign it to 149 * If no cpu took the do_timer update, assign it to
149 * this cpu: 150 * this cpu:
150 */ 151 */
151 if (tick_do_timer_cpu == -1) { 152 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
152 tick_do_timer_cpu = cpu; 153 tick_do_timer_cpu = cpu;
153 tick_next_period = ktime_get(); 154 tick_next_period = ktime_get();
154 tick_period = ktime_set(0, NSEC_PER_SEC / HZ); 155 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
@@ -249,7 +250,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
249 * not give it back to the clockevents layer ! 250 * not give it back to the clockevents layer !
250 */ 251 */
251 if (tick_is_broadcast_device(curdev)) { 252 if (tick_is_broadcast_device(curdev)) {
252 clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); 253 clockevents_shutdown(curdev);
253 curdev = NULL; 254 curdev = NULL;
254 } 255 }
255 clockevents_exchange_device(curdev, newdev); 256 clockevents_exchange_device(curdev, newdev);
@@ -300,7 +301,8 @@ static void tick_shutdown(unsigned int *cpup)
300 if (*cpup == tick_do_timer_cpu) { 301 if (*cpup == tick_do_timer_cpu) {
301 int cpu = first_cpu(cpu_online_map); 302 int cpu = first_cpu(cpu_online_map);
302 303
303 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; 304 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
305 TICK_DO_TIMER_NONE;
304 } 306 }
305 spin_unlock_irqrestore(&tick_device_lock, flags); 307 spin_unlock_irqrestore(&tick_device_lock, flags);
306} 308}
@@ -311,7 +313,7 @@ static void tick_suspend(void)
311 unsigned long flags; 313 unsigned long flags;
312 314
313 spin_lock_irqsave(&tick_device_lock, flags); 315 spin_lock_irqsave(&tick_device_lock, flags);
314 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN); 316 clockevents_shutdown(td->evtdev);
315 spin_unlock_irqrestore(&tick_device_lock, flags); 317 spin_unlock_irqrestore(&tick_device_lock, flags);
316} 318}
317 319
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 0ffc2918ea6f..469248782c23 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4
5#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2
7
4DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 8DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
5extern spinlock_t tick_device_lock; 9extern spinlock_t tick_device_lock;
6extern ktime_t tick_next_period; 10extern ktime_t tick_next_period;
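The two sentinels replace the bare -1 used before: TICK_DO_TIMER_BOOT means the jiffies-update duty has never been assigned since boot, TICK_DO_TIMER_NONE means the previous owner gave it up (went into nohz idle or offline). Restating how the tick-common.c and tick-sched.c hunks use them (sketch, not new code):

	if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
		/* first CPU to set up its tick device claims the duty */
		tick_do_timer_cpu = cpu;
		tick_next_period = ktime_get();
	} else if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
		/* previous owner dropped it; the next busy CPU takes over */
		tick_do_timer_cpu = cpu;
	}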
@@ -10,6 +14,8 @@ extern int tick_do_timer_cpu __read_mostly;
10extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); 14extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
11extern void tick_handle_periodic(struct clock_event_device *dev); 15extern void tick_handle_periodic(struct clock_event_device *dev);
12 16
17extern void clockevents_shutdown(struct clock_event_device *dev);
18
13/* 19/*
14 * NO_HZ / high resolution timer shared code 20 * NO_HZ / high resolution timer shared code
15 */ 21 */
@@ -29,6 +35,7 @@ extern void tick_broadcast_oneshot_control(unsigned long reason);
29extern void tick_broadcast_switch_to_oneshot(void); 35extern void tick_broadcast_switch_to_oneshot(void);
30extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 36extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
31extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 37extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
38extern int tick_broadcast_oneshot_active(void);
32# else /* BROADCAST */ 39# else /* BROADCAST */
33static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
34{ 41{
@@ -37,6 +44,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
37static inline void tick_broadcast_oneshot_control(unsigned long reason) { } 44static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
38static inline void tick_broadcast_switch_to_oneshot(void) { } 45static inline void tick_broadcast_switch_to_oneshot(void) { }
39static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; }
40# endif /* !BROADCAST */ 48# endif /* !BROADCAST */
41 49
42#else /* !ONESHOT */ 50#else /* !ONESHOT */
@@ -66,6 +74,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
66{ 74{
67 return 0; 75 return 0;
68} 76}
77static inline int tick_broadcast_oneshot_active(void) { return 0; }
69#endif /* !TICK_ONESHOT */ 78#endif /* !TICK_ONESHOT */
70 79
71/* 80/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2e35501e61dd..2e8de678e767 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -43,19 +43,17 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
43 * and emit a warning. 43 * and emit a warning.
44 */ 44 */
45 if (++i > 2) { 45 if (++i > 2) {
46 printk(KERN_WARNING "CE: __tick_program_event of %s is " 46 /* Increase the min. delta and try again */
47 "stuck %llx %llx\n", dev->name ? dev->name : "?",
48 now.tv64, expires.tv64);
49 printk(KERN_WARNING
50 "CE: increasing min_delta_ns %ld to %ld nsec\n",
51 dev->min_delta_ns, dev->min_delta_ns << 1);
52 WARN_ON(1);
53
54 /* Double the min. delta and try again */
55 if (!dev->min_delta_ns) 47 if (!dev->min_delta_ns)
56 dev->min_delta_ns = 5000; 48 dev->min_delta_ns = 5000;
57 else 49 else
58 dev->min_delta_ns <<= 1; 50 dev->min_delta_ns += dev->min_delta_ns >> 1;
51
52 printk(KERN_WARNING
53 "CE: %s increasing min_delta_ns to %lu nsec\n",
54 dev->name ? dev->name : "?",
55 dev->min_delta_ns << 1);
56
59 i = 0; 57 i = 0;
60 } 58 }
61 59
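The retry path now grows min_delta_ns by 50% per round instead of doubling it, and the WARN_ON plus the verbose "stuck" message are gone. Starting from the 5000ns seed used when the field is still zero, the progression is:

	unsigned long min_delta_ns = 5000;	/* seed when the field was 0 */

	min_delta_ns += min_delta_ns >> 1;	/*  7500 */
	min_delta_ns += min_delta_ns >> 1;	/* 11250 */
	min_delta_ns += min_delta_ns >> 1;	/* 16875 */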
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a87b0468568b..cb02324bdb88 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -75,6 +75,9 @@ static void tick_do_update_jiffies64(ktime_t now)
75 incr * ticks); 75 incr * ticks);
76 } 76 }
77 do_timer(++ticks); 77 do_timer(++ticks);
78
79 /* Keep the tick_next_period variable up to date */
80 tick_next_period = ktime_add(last_jiffies_update, tick_period);
78 } 81 }
79 write_sequnlock(&xtime_lock); 82 write_sequnlock(&xtime_lock);
80} 83}
@@ -221,7 +224,7 @@ void tick_nohz_stop_sched_tick(int inidle)
221 */ 224 */
222 if (unlikely(!cpu_online(cpu))) { 225 if (unlikely(!cpu_online(cpu))) {
223 if (cpu == tick_do_timer_cpu) 226 if (cpu == tick_do_timer_cpu)
224 tick_do_timer_cpu = -1; 227 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
225 } 228 }
226 229
227 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 230 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
@@ -303,7 +306,7 @@ void tick_nohz_stop_sched_tick(int inidle)
303 * invoked. 306 * invoked.
304 */ 307 */
305 if (cpu == tick_do_timer_cpu) 308 if (cpu == tick_do_timer_cpu)
306 tick_do_timer_cpu = -1; 309 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
307 310
308 ts->idle_sleeps++; 311 ts->idle_sleeps++;
309 312
@@ -468,7 +471,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
468 * this duty, then the jiffies update is still serialized by 471 * this duty, then the jiffies update is still serialized by
469 * xtime_lock. 472 * xtime_lock.
470 */ 473 */
471 if (unlikely(tick_do_timer_cpu == -1)) 474 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
472 tick_do_timer_cpu = cpu; 475 tick_do_timer_cpu = cpu;
473 476
474 /* Check, if the jiffies need an update */ 477 /* Check, if the jiffies need an update */
@@ -570,7 +573,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
570 * this duty, then the jiffies update is still serialized by 573 * this duty, then the jiffies update is still serialized by
571 * xtime_lock. 574 * xtime_lock.
572 */ 575 */
573 if (unlikely(tick_do_timer_cpu == -1)) 576 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
574 tick_do_timer_cpu = cpu; 577 tick_do_timer_cpu = cpu;
575#endif 578#endif
576 579
@@ -622,7 +625,7 @@ void tick_setup_sched_timer(void)
622 */ 625 */
623 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 626 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
624 ts->sched_timer.function = tick_sched_timer; 627 ts->sched_timer.function = tick_sched_timer;
625 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 628 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
626 629
627 /* Get the next period (per cpu) */ 630 /* Get the next period (per cpu) */
628 ts->sched_timer.expires = tick_init_jiffy_update(); 631 ts->sched_timer.expires = tick_init_jiffy_update();
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index bb948e52ce20..db58fb66a135 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,7 @@ static void start_stack_timer(int cpu)
202 202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
206 206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
208} 208}
diff --git a/kernel/user.c b/kernel/user.c
index 865ecf57a096..39d6159fae43 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169{ 169{
170 struct user_struct *up = container_of(kobj, struct user_struct, kobj); 170 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
171 171
172 return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); 172 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
173} 173}
174 174
175static ssize_t cpu_rt_runtime_store(struct kobject *kobj, 175static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
@@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
180 unsigned long rt_runtime; 180 unsigned long rt_runtime;
181 int rc; 181 int rc;
182 182
183 sscanf(buf, "%lu", &rt_runtime); 183 sscanf(buf, "%ld", &rt_runtime);
184 184
185 rc = sched_group_set_rt_runtime(up->tg, rt_runtime); 185 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
186 186
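The signed format matters because, on kernels of this era, sched_group_rt_runtime() reports an unlimited (RUNTIME_INF) budget as -1: with %lu the show path printed that as a huge unsigned number, and the store side is changed to match. A small illustration:

	long rt_runtime = sched_group_rt_runtime(up->tg);	/* -1 == unlimited */

	sprintf(buf, "%ld\n", rt_runtime);	/* now prints "-1" */
	/* with the old "%lu" this came out as 18446744073709551615 on a 64-bit build */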