author     Max Krasnyansky <maxk@qualcomm.com>    2008-07-15 07:43:49 -0400
committer  Ingo Molnar <mingo@elte.hu>            2008-07-18 07:22:25 -0400
commit     e761b7725234276a802322549cee5255305a0930 (patch)
tree       27b351a7d5fc9a93590e0effce1c5adb1bfcebc0 /kernel
parent     7ebefa8ceefed44cc321be70afc54a585a68ac0b (diff)
cpu hotplug, sched: Introduce cpu_active_map and redo sched domain management (take 2)
This is based on Linus' idea of creating cpu_active_map, which prevents the scheduler load balancer from migrating tasks to a CPU that is going down. It allows us to simplify the domain management code and avoid unnecessary domain rebuilds during cpu hotplug event handling.

Please ignore the cpusets part for now. It needs some more work in order to avoid crazy lock nesting. I did, however, simplify and unify the domain reinitialization logic: we now simply call partition_sched_domains() in all the cases. This means that we're using the exact same code paths as in the cpusets case, and hence the tests below cover cpusets too. Cpuset changes to make rebuild_sched_domains() callable from various contexts are in a separate patch (right next after this one).

This not only boots but also easily handles

	while true; do make clean; make -j 8; done

and

	while true; do on-off-cpu 1; done

at the same time (on-off-cpu 1 simply does the echo 0/1 > /sys/.../cpu1/online thing).

Surprisingly, the box (dual-core Core2) is quite usable. In fact I'm typing this right now in gnome-terminal, and things are moving along just fine. Also, this is running with most of the debug features enabled (lockdep, mutex debugging, etc.); no BUG_ONs or lockdep complaints so far.

I believe I addressed all of Dmitry's comments on Linus' original version. I changed both the fair and the rt balancer to mask out non-active cpus, and replaced cpu_is_offline() with !cpu_active() in the main scheduler code where it made sense (to me).

Signed-off-by: Max Krasnyanskiy <maxk@qualcomm.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Gregory Haskins <ghaskins@novell.com>
Cc: dmitry.adamushko@gmail.com
Cc: pj@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
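For reference, the cpu_active() test used throughout the scheduler hunks below is just the usual cpumask accessor over the new cpu_active_map. The accessor itself is added outside the kernel/ directory this diffstat is limited to (presumably in include/linux/cpumask.h); a rough sketch of what it looks like in this kernel generation, shown here only as an aside:

/* Sketch only -- not part of the kernel/ hunks below. */
extern cpumask_t cpu_active_map;	/* defined in kernel/cpu.c below */

#if NR_CPUS > 1
#define cpu_active(cpu)		cpu_isset((cpu), cpu_active_map)
#else
#define cpu_active(cpu)		((cpu) == 0)	/* UP: CPU 0 is always active */
#endif

The map is cleared and restored around _cpu_down()/_cpu_up() in the kernel/cpu.c hunks below, so "active" means "online and safe to migrate tasks to".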
Diffstat (limited to 'kernel')
-rw-r--r--   kernel/cpu.c        |  30
-rw-r--r--   kernel/cpuset.c     |   2
-rw-r--r--   kernel/sched.c      | 108
-rw-r--r--   kernel/sched_fair.c |   3
-rw-r--r--   kernel/sched_rt.c   |   7
5 files changed, 80 insertions(+), 70 deletions(-)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index cfb1d43ab80..a1ac7ea245d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -64,6 +64,8 @@ void __init cpu_hotplug_init(void)
 	cpu_hotplug.refcount = 0;
 }
 
+cpumask_t cpu_active_map;
+
 #ifdef CONFIG_HOTPLUG_CPU
 
 void get_online_cpus(void)
@@ -291,11 +293,20 @@ int __ref cpu_down(unsigned int cpu)
 	int err = 0;
 
 	cpu_maps_update_begin();
-	if (cpu_hotplug_disabled)
+
+	if (cpu_hotplug_disabled) {
 		err = -EBUSY;
-	else
-		err = _cpu_down(cpu, 0);
+		goto out;
+	}
+
+	cpu_clear(cpu, cpu_active_map);
+
+	err = _cpu_down(cpu, 0);
+
+	if (cpu_online(cpu))
+		cpu_set(cpu, cpu_active_map);
 
+out:
 	cpu_maps_update_done();
 	return err;
 }
@@ -355,11 +366,18 @@ int __cpuinit cpu_up(unsigned int cpu)
 	}
 
 	cpu_maps_update_begin();
-	if (cpu_hotplug_disabled)
+
+	if (cpu_hotplug_disabled) {
 		err = -EBUSY;
-	else
-		err = _cpu_up(cpu, 0);
+		goto out;
+	}
 
+	err = _cpu_up(cpu, 0);
+
+	if (cpu_online(cpu))
+		cpu_set(cpu, cpu_active_map);
+
+out:
 	cpu_maps_update_done();
 	return err;
 }
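Put together, the two kernel/cpu.c hunks above leave cpu_down() reading roughly as follows (reconstructed from the diff for readability; cpu_up() is the mirror image, minus the cpu_clear()):

int __ref cpu_down(unsigned int cpu)
{
	int err = 0;

	cpu_maps_update_begin();

	if (cpu_hotplug_disabled) {
		err = -EBUSY;
		goto out;
	}

	/* Hide the CPU from the load balancer before taking it down. */
	cpu_clear(cpu, cpu_active_map);

	err = _cpu_down(cpu, 0);

	/* Still online means the down failed: mark the CPU active again. */
	if (cpu_online(cpu))
		cpu_set(cpu, cpu_active_map);

out:
	cpu_maps_update_done();
	return err;
}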
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 459d601947a..3c3ef02f65f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -564,7 +564,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  * partition_sched_domains().
  */
 
-static void rebuild_sched_domains(void)
+void rebuild_sched_domains(void)
 {
 	struct kfifo *q;	/* queue of cpusets to be scanned */
 	struct cpuset *cp;	/* scans q */
diff --git a/kernel/sched.c b/kernel/sched.c
index 1ee18dbb451..c237624a8a0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2881,7 +2881,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 
 	rq = task_rq_lock(p, &flags);
 	if (!cpu_isset(dest_cpu, p->cpus_allowed)
-	    || unlikely(cpu_is_offline(dest_cpu)))
+	    || unlikely(!cpu_active(dest_cpu)))
 		goto out;
 
 	/* force the process onto the specified CPU */
@@ -3849,7 +3849,7 @@ int select_nohz_load_balancer(int stop_tick)
 		/*
 		 * If we are going offline and still the leader, give up!
 		 */
-		if (cpu_is_offline(cpu) &&
+		if (!cpu_active(cpu) &&
 		    atomic_read(&nohz.load_balancer) == cpu) {
 			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
 				BUG();
@@ -5876,7 +5876,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	struct rq *rq_dest, *rq_src;
 	int ret = 0, on_rq;
 
-	if (unlikely(cpu_is_offline(dest_cpu)))
+	if (unlikely(!cpu_active(dest_cpu)))
 		return ret;
 
 	rq_src = cpu_rq(src_cpu);
@@ -7554,18 +7554,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
 }
 
 /*
- * Free current domain masks.
- * Called after all cpus are attached to NULL domain.
- */
-static void free_sched_domains(void)
-{
-	ndoms_cur = 0;
-	if (doms_cur != &fallback_doms)
-		kfree(doms_cur);
-	doms_cur = &fallback_doms;
-}
-
-/*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
  * exclude other special cases in the future.
@@ -7643,7 +7631,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * ownership of it and will kfree it when done with it. If the caller
  * failed the kmalloc call, then it can pass in doms_new == NULL,
  * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms'.
+ * 'fallback_doms', it also forces the domains to be rebuilt.
  *
  * Call with hotplug lock held
  */
@@ -7657,12 +7645,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 
-	if (doms_new == NULL) {
-		ndoms_new = 1;
-		doms_new = &fallback_doms;
-		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-		dattr_new = NULL;
-	}
+	if (doms_new == NULL)
+		ndoms_new = 0;
 
 	/* Destroy deleted domains */
 	for (i = 0; i < ndoms_cur; i++) {
@@ -7677,6 +7661,14 @@ match1:
 		;
 	}
 
+	if (doms_new == NULL) {
+		ndoms_cur = 0;
+		ndoms_new = 1;
+		doms_new = &fallback_doms;
+		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+		dattr_new = NULL;
+	}
+
 	/* Build new domains */
 	for (i = 0; i < ndoms_new; i++) {
 		for (j = 0; j < ndoms_cur; j++) {
@@ -7707,17 +7699,10 @@ match2:
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 int arch_reinit_sched_domains(void)
 {
-	int err;
-
 	get_online_cpus();
-	mutex_lock(&sched_domains_mutex);
-	detach_destroy_domains(&cpu_online_map);
-	free_sched_domains();
-	err = arch_init_sched_domains(&cpu_online_map);
-	mutex_unlock(&sched_domains_mutex);
+	rebuild_sched_domains();
 	put_online_cpus();
-
-	return err;
+	return 0;
 }
 
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
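After the hunk above, arch_reinit_sched_domains() is reduced to a thin wrapper around the rebuild_sched_domains() that the kernel/cpuset.c hunk made non-static; reconstructed from the diff for readability:

int arch_reinit_sched_domains(void)
{
	get_online_cpus();
	rebuild_sched_domains();	/* single, shared domain-rebuild path */
	put_online_cpus();
	return 0;
}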
@@ -7783,59 +7768,49 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
+#ifndef CONFIG_CPUSETS
 /*
- * Force a reinitialization of the sched domains hierarchy. The domains
- * and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to the NULL domain
- * which will prevent rebalancing while the sched domains are recalculated.
+ * Add online and remove offline CPUs from the scheduler domains.
+ * When cpusets are enabled they take over this function.
  */
 static int update_sched_domains(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
 {
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		partition_sched_domains(0, NULL, NULL);
+		return NOTIFY_OK;
+
+	default:
+		return NOTIFY_DONE;
+	}
+}
+#endif
+
+static int update_runtime(struct notifier_block *nfb,
+				unsigned long action, void *hcpu)
+{
 	int cpu = (int)(long)hcpu;
 
 	switch (action) {
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		disable_runtime(cpu_rq(cpu));
-		/* fall-through */
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		detach_destroy_domains(&cpu_online_map);
-		free_sched_domains();
 		return NOTIFY_OK;
 
-
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 		enable_runtime(cpu_rq(cpu));
-		/* fall-through */
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		/*
-		 * Fall through and re-initialise the domains.
-		 */
-		break;
+		return NOTIFY_OK;
+
 	default:
 		return NOTIFY_DONE;
 	}
-
-#ifndef CONFIG_CPUSETS
-	/*
-	 * Create default domain partitioning if cpusets are disabled.
-	 * Otherwise we let cpusets rebuild the domains based on the
-	 * current setup.
-	 */
-
-	/* The hotplug lock is already held by cpu_up/cpu_down */
-	arch_init_sched_domains(&cpu_online_map);
-#endif
-
-	return NOTIFY_OK;
 }
 
 void __init sched_init_smp(void)
@@ -7855,8 +7830,15 @@ void __init sched_init_smp(void)
 	cpu_set(smp_processor_id(), non_isolated_cpus);
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
+
+#ifndef CONFIG_CPUSETS
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);
+#endif
+
+	/* RT runtime code needs to handle some hotplug events */
+	hotcpu_notifier(update_runtime, 0);
+
 	init_hrtick();
 
 	/* Move init over to a non-isolated CPU */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2aa987027d..d924c679dfa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1004,6 +1004,8 @@ static void yield_task_fair(struct rq *rq)
  * not idle and an idle cpu is available. The span of cpus to
  * search starts with cpus closest then further out as needed,
  * so we always favor a closer, idle cpu.
+ * Domains may include CPUs that are not usable for migration,
+ * hence we need to mask them out (cpu_active_map)
  *
  * Returns the CPU we should wake onto.
  */
@@ -1031,6 +1033,7 @@ static int wake_idle(int cpu, struct task_struct *p)
 		    || ((sd->flags & SD_WAKE_IDLE_FAR)
 			&& !task_hot(p, task_rq(p)->clock, sd))) {
 			cpus_and(tmp, sd->span, p->cpus_allowed);
+			cpus_and(tmp, tmp, cpu_active_map);
 			for_each_cpu_mask(i, tmp) {
 				if (idle_cpu(i)) {
 					if (i != task_cpu(p)) {
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d3d1cccb3d7..50735bb9614 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -934,6 +934,13 @@ static int find_lowest_rq(struct task_struct *task)
 		return -1; /* No targets found */
 
 	/*
+	 * Only consider CPUs that are usable for migration.
+	 * I guess we might want to change cpupri_find() to ignore those
+	 * in the first place.
+	 */
+	cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
+
+	/*
 	 * At this point we have built a mask of cpus representing the
 	 * lowest priority tasks in the system. Now we want to elect
 	 * the best one based on our affinity and topology.
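The sched_fair.c and sched_rt.c hunks above apply the same idea: intersect whatever candidate mask the balancer has built with cpu_active_map before choosing a target CPU. A minimal illustrative sketch of that shared pattern, in the cpumask API of this era; the helper name pick_active_target() is hypothetical and not part of the patch:

static int pick_active_target(struct task_struct *p, const cpumask_t *candidates)
{
	cpumask_t tmp;
	int cpu;

	cpus_and(tmp, *candidates, p->cpus_allowed);	/* respect task affinity */
	cpus_and(tmp, tmp, cpu_active_map);		/* skip CPUs on their way down */

	for_each_cpu_mask(cpu, tmp)
		return cpu;				/* first usable candidate */

	return -1;					/* nothing active to migrate to */
}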