author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2010-11-13 13:32:29 -0500
committer	Ingo Molnar <mingo@elte.hu>	2010-11-18 07:27:46 -0500
commit		48c5ccae88dcd989d9de507e8510313c6cbd352b (patch)
tree		06fe8ce2ac28e9f5844de8bc32ecbef97e40d68b
parent		92fd4d4d67b945c0766416284d4ab236b31542c4 (diff)
sched: Simplify cpu-hot-unplug task migration
While discussing the need for sched_idle_next(), Oleg remarked that since
try_to_wake_up() ensures sleeping tasks will end up running on a sane cpu, we
can do away with migrate_live_tasks().

If we then extend the existing hack of migrating current from CPU_DYING to
migrating the full rq worth of tasks from CPU_DYING, the need for the
sched_idle_next() abomination disappears as well, since idle will be the only
possible thread left after the migration thread stops.

This greatly simplifies the hot-unplug task migration path, as can be seen
from the resulting code reduction (and about half the new lines are comments).

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1289851597.2109.547.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
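For illustration only, here is a small, self-contained C model of the loop this
patch introduces in migrate_tasks(): keep pushing runnable tasks off the dying
CPU's runqueue to a fallback CPU until only the currently running
(stop/migration) thread remains. The struct and helper names below are toy
stand-ins, not kernel APIs; the real code does this with pick_next_task(),
select_fallback_rq() and __migrate_task() under rq->lock, as shown in the diff
that follows.

	#include <stdio.h>

	#define NR_TASKS 5	/* toy runqueue size */

	/* Toy stand-in for a runqueue: which cpu each runnable task sits on. */
	struct toy_rq {
		int nr_running;
		int task_cpu[NR_TASKS];
	};

	/* Toy stand-in for select_fallback_rq(): pick any other cpu. */
	static int select_fallback_cpu(int dead_cpu)
	{
		return dead_cpu == 0 ? 1 : 0;
	}

	/*
	 * Mirrors the shape of the new migrate_tasks(): move tasks off the
	 * dying cpu until only one (the stop/migration thread itself,
	 * modelled here as the last remaining slot) is still runnable there.
	 */
	static void migrate_tasks_toy(struct toy_rq *rq, int dead_cpu)
	{
		int i;

		for (i = 0; i < NR_TASKS; i++) {
			if (rq->nr_running == 1)	/* only "this" thread left */
				break;
			if (rq->task_cpu[i] != dead_cpu)
				continue;
			rq->task_cpu[i] = select_fallback_cpu(dead_cpu);
			rq->nr_running--;
			printf("task %d moved to cpu%d\n", i, rq->task_cpu[i]);
		}
	}

	int main(void)
	{
		struct toy_rq rq = {
			.nr_running = NR_TASKS,
			.task_cpu   = { 0, 0, 0, 0, 0 },
		};

		migrate_tasks_toy(&rq, 0);
		printf("runnable tasks left on dying cpu0: %d\n", rq.nr_running);
		return 0;
	}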
-rw-r--r--	include/linux/sched.h	  3
-rw-r--r--	kernel/cpu.c	 16
-rw-r--r--	kernel/sched.c	206
3 files changed, 67 insertions(+), 158 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3cd70cf91fd..29d953abb5a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1871,14 +1871,11 @@ extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 
 #ifdef CONFIG_HOTPLUG_CPU
-extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p);
 extern void idle_task_exit(void);
 #else
 static inline void idle_task_exit(void) {}
 #endif
 
-extern void sched_idle_next(void);
-
 #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
 extern void wake_up_idle_cpu(int cpu);
 #else
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f1849..8615aa65d92 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
 }
 
 struct take_cpu_down_param {
-	struct task_struct *caller;
 	unsigned long mod;
 	void *hcpu;
 };
@@ -208,11 +207,6 @@ static int __ref take_cpu_down(void *_param)
 
 	cpu_notify(CPU_DYING | param->mod, param->hcpu);
 
-	if (task_cpu(param->caller) == cpu)
-		move_task_off_dead_cpu(cpu, param->caller);
-	/* Force idle task to run as soon as we yield: it should
-	   immediately notice cpu is offline and die quickly. */
-	sched_idle_next();
 	return 0;
 }
 
@@ -223,7 +217,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
 	struct take_cpu_down_param tcd_param = {
-		.caller = current,
 		.mod = mod,
 		.hcpu = hcpu,
 	};
@@ -253,9 +246,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	}
 	BUG_ON(cpu_online(cpu));
 
-	/* Wait for it to sleep (leaving idle task). */
-	while (!idle_cpu(cpu))
-		yield();
+	/*
+	 * The migration_call() CPU_DYING callback will have removed all
+	 * runnable tasks from the cpu, there's only the idle task left now
+	 * that the migration thread is done doing the stop_machine thing.
+	 */
+	BUG_ON(!idle_cpu(cpu));
 
 	/* This actually kills the CPU. */
 	__cpu_die(cpu);
diff --git a/kernel/sched.c b/kernel/sched.c
index 41f18695b73..b0d5f1b24a3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2366,18 +2366,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 		return dest_cpu;
 
 	/* No more Mr. Nice Guy. */
-	if (unlikely(dest_cpu >= nr_cpu_ids)) {
-		dest_cpu = cpuset_cpus_allowed_fallback(p);
-		/*
-		 * Don't tell them about moving exiting tasks or
-		 * kernel threads (both mm NULL), since they never
-		 * leave kernel.
-		 */
-		if (p->mm && printk_ratelimit()) {
-			printk(KERN_INFO "process %d (%s) no "
-			       "longer affine to cpu%d\n",
-			       task_pid_nr(p), p->comm, cpu);
-		}
+	dest_cpu = cpuset_cpus_allowed_fallback(p);
+	/*
+	 * Don't tell them about moving exiting tasks or
+	 * kernel threads (both mm NULL), since they never
+	 * leave kernel.
+	 */
+	if (p->mm && printk_ratelimit()) {
+		printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+		       task_pid_nr(p), p->comm, cpu);
 	}
 
 	return dest_cpu;
@@ -5712,29 +5709,20 @@ static int migration_cpu_stop(void *data)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+
 /*
- * Figure out where task on dead CPU should go, use force if necessary.
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
  */
-void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void idle_task_exit(void)
 {
-	struct rq *rq = cpu_rq(dead_cpu);
-	int needs_cpu, uninitialized_var(dest_cpu);
-	unsigned long flags;
+	struct mm_struct *mm = current->active_mm;
 
-	local_irq_save(flags);
+	BUG_ON(cpu_online(smp_processor_id()));
 
-	raw_spin_lock(&rq->lock);
-	needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
-	if (needs_cpu)
-		dest_cpu = select_fallback_rq(dead_cpu, p);
-	raw_spin_unlock(&rq->lock);
-	/*
-	 * It can only fail if we race with set_cpus_allowed(),
-	 * in the racer should migrate the task anyway.
-	 */
-	if (needs_cpu)
-		__migrate_task(p, dead_cpu, dest_cpu);
-	local_irq_restore(flags);
+	if (mm != &init_mm)
+		switch_mm(mm, &init_mm, current);
+	mmdrop(mm);
 }
 
 /*
@@ -5747,128 +5735,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
 	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-	unsigned long flags;
 
-	local_irq_save(flags);
-	double_rq_lock(rq_src, rq_dest);
 	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
 	rq_src->nr_uninterruptible = 0;
-	double_rq_unlock(rq_src, rq_dest);
-	local_irq_restore(flags);
-}
-
-/* Run through task list and migrate tasks from the dead cpu. */
-static void migrate_live_tasks(int src_cpu)
-{
-	struct task_struct *p, *t;
-
-	read_lock(&tasklist_lock);
-
-	do_each_thread(t, p) {
-		if (p == current)
-			continue;
-
-		if (task_cpu(p) == src_cpu)
-			move_task_off_dead_cpu(src_cpu, p);
-	} while_each_thread(t, p);
-
-	read_unlock(&tasklist_lock);
 }
 
 /*
- * Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible.
- * Used by CPU offline code.
+ * remove the tasks which were accounted by rq from calc_load_tasks.
  */
-void sched_idle_next(void)
+static void calc_global_load_remove(struct rq *rq)
 {
-	int this_cpu = smp_processor_id();
-	struct rq *rq = cpu_rq(this_cpu);
-	struct task_struct *p = rq->idle;
-	unsigned long flags;
-
-	/* cpu has to be offline */
-	BUG_ON(cpu_online(this_cpu));
-
-	/*
-	 * Strictly not necessary since rest of the CPUs are stopped by now
-	 * and interrupts disabled on the current cpu.
-	 */
-	raw_spin_lock_irqsave(&rq->lock, flags);
-
-	__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-
-	activate_task(rq, p, 0);
-
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+	rq->calc_load_active = 0;
 }
 
 /*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we'er in stop_machine() and
+ * there's no concurrency possible, we hold the required locks anyway
+ * because of lock validation efforts.
  */
-void idle_task_exit(void)
-{
-	struct mm_struct *mm = current->active_mm;
-
-	BUG_ON(cpu_online(smp_processor_id()));
-
-	if (mm != &init_mm)
-		switch_mm(mm, &init_mm, current);
-	mmdrop(mm);
-}
-
-/* called under rq->lock with disabled interrupts */
-static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
+static void migrate_tasks(unsigned int dead_cpu)
 {
 	struct rq *rq = cpu_rq(dead_cpu);
-
-	/* Must be exiting, otherwise would be on tasklist. */
-	BUG_ON(!p->exit_state);
-
-	/* Cannot have done final schedule yet: would have vanished. */
-	BUG_ON(p->state == TASK_DEAD);
-
-	get_task_struct(p);
+	struct task_struct *next, *stop = rq->stop;
+	int dest_cpu;
 
 	/*
-	 * Drop lock around migration; if someone else moves it,
-	 * that's OK. No task can be added to this CPU, so iteration is
-	 * fine.
+	 * Fudge the rq selection such that the below task selection loop
+	 * doesn't get stuck on the currently eligible stop task.
+	 *
+	 * We're currently inside stop_machine() and the rq is either stuck
+	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
+	 * either way we should never end up calling schedule() until we're
+	 * done here.
 	 */
-	raw_spin_unlock_irq(&rq->lock);
-	move_task_off_dead_cpu(dead_cpu, p);
-	raw_spin_lock_irq(&rq->lock);
-
-	put_task_struct(p);
-}
-
-/* release_task() removes task from tasklist, so we won't find dead tasks. */
-static void migrate_dead_tasks(unsigned int dead_cpu)
-{
-	struct rq *rq = cpu_rq(dead_cpu);
-	struct task_struct *next;
+	rq->stop = NULL;
 
 	for ( ; ; ) {
-		if (!rq->nr_running)
+		/*
+		 * There's this thread running, bail when that's the only
+		 * remaining thread.
+		 */
+		if (rq->nr_running == 1)
 			break;
+
 		next = pick_next_task(rq);
-		if (!next)
-			break;
+		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
-		migrate_dead(dead_cpu, next);
 
+		/* Find suitable destination for @next, with force if needed. */
+		dest_cpu = select_fallback_rq(dead_cpu, next);
+		raw_spin_unlock(&rq->lock);
+
+		__migrate_task(next, dead_cpu, dest_cpu);
+
+		raw_spin_lock(&rq->lock);
 	}
-}
 
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
- */
-static void calc_global_load_remove(struct rq *rq)
-{
-	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
-	rq->calc_load_active = 0;
+	rq->stop = stop;
 }
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6078,15 +6007,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	unsigned long flags;
 	struct rq *rq = cpu_rq(cpu);
 
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 
 	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
 		rq->calc_load_update = calc_load_update;
 		break;
 
 	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
 		/* Update our root-domain */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
@@ -6098,30 +6025,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 
 #ifdef CONFIG_HOTPLUG_CPU
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		migrate_live_tasks(cpu);
-		/* Idle task back to normal (off runqueue, low prio) */
-		raw_spin_lock_irq(&rq->lock);
-		deactivate_task(rq, rq->idle, 0);
-		__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
-		rq->idle->sched_class = &idle_sched_class;
-		migrate_dead_tasks(cpu);
-		raw_spin_unlock_irq(&rq->lock);
-		migrate_nr_uninterruptible(rq);
-		BUG_ON(rq->nr_running != 0);
-		calc_global_load_remove(rq);
-		break;
-
 	case CPU_DYING:
-	case CPU_DYING_FROZEN:
 		/* Update our root-domain */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 			set_rq_offline(rq);
 		}
+		migrate_tasks(cpu);
+		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+		migrate_nr_uninterruptible(rq);
+		calc_global_load_remove(rq);
 		break;
 #endif
 	}