author      Peter Zijlstra <a.p.zijlstra@chello.nl>    2010-11-13 13:32:29 -0500
committer   Ingo Molnar <mingo@elte.hu>                2010-11-18 07:27:46 -0500
commit      48c5ccae88dcd989d9de507e8510313c6cbd352b (patch)
tree        06fe8ce2ac28e9f5844de8bc32ecbef97e40d68b /kernel
parent      92fd4d4d67b945c0766416284d4ab236b31542c4 (diff)
sched: Simplify cpu-hot-unplug task migration
While discussing the need for sched_idle_next(), Oleg remarked that
since try_to_wake_up() ensures sleeping tasks will end up running on a
sane cpu, we can do away with migrate_live_tasks().
If we then extend the existing hack of migrating current from
CPU_DYING to migrating the full rq worth of tasks from CPU_DYING, the
need for the sched_idle_next() abomination disappears as well, since
idle will be the only possible thread left after the migration thread
stops.
This greatly simplifies the hot-unplug task migration path, as can be
seen from the resulting code reduction (and about half the new lines
are comments).
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1289851597.2109.547.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
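The resulting flow is easy to model: during CPU_DYING, with the stop/migration thread being the only thing that can still run on the outgoing CPU, every other runnable task is pushed to a fallback CPU, so that once the stopper parks, only idle remains. Below is a minimal, self-contained userspace sketch of that drain loop (purely illustrative; the toy_* names and data structures are invented here and are not kernel APIs), mirroring the shape of the migrate_tasks() loop added to kernel/sched.c in the patch below.

/*
 * Toy userspace model (not kernel code) of the drain loop performed from
 * the CPU_DYING notifier: while more than one task remains on the dying
 * runqueue, pick the next task and push it to a fallback CPU.
 */
#include <assert.h>
#include <stdio.h>

#define NR_CPUS   4
#define MAX_TASKS 8

struct toy_rq {
        int tasks[MAX_TASKS];   /* task ids; slot 0 plays the stop/migration thread */
        int nr_running;
};

static struct toy_rq rq[NR_CPUS];

/* Stand-in for select_fallback_rq(): any CPU other than the dead one. */
static int toy_fallback_cpu(int dead_cpu)
{
        return (dead_cpu + 1) % NR_CPUS;
}

/* Stand-in for __migrate_task(): move one task between runqueues. */
static void toy_migrate_task(int task, int src, int dst)
{
        rq[dst].tasks[rq[dst].nr_running++] = task;
        rq[src].nr_running--;
}

/* The shape of migrate_tasks(): drain everything but the "stopper". */
static void toy_migrate_tasks(int dead_cpu)
{
        while (rq[dead_cpu].nr_running > 1) {
                int next = rq[dead_cpu].tasks[rq[dead_cpu].nr_running - 1];
                toy_migrate_task(next, dead_cpu, toy_fallback_cpu(dead_cpu));
        }
}

int main(void)
{
        int dead_cpu = 2;

        rq[dead_cpu].nr_running = 5;    /* stopper plus four runnable tasks */
        for (int i = 0; i < 5; i++)
                rq[dead_cpu].tasks[i] = i;

        toy_migrate_tasks(dead_cpu);

        /* Mirrors BUG_ON(rq->nr_running != 1) in the CPU_DYING callback. */
        assert(rq[dead_cpu].nr_running == 1);
        printf("cpu%d drained, %d task left\n", dead_cpu, rq[dead_cpu].nr_running);
        return 0;
}

Sleeping tasks need no such treatment because try_to_wake_up() routes them to an online CPU when they wake, which is exactly the observation the patch builds on.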
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cpu.c    |  16
-rw-r--r--  kernel/sched.c  | 206
2 files changed, 67 insertions(+), 155 deletions(-)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..8615aa65d927 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
 }
 
 struct take_cpu_down_param {
-        struct task_struct *caller;
         unsigned long mod;
         void *hcpu;
 };
@@ -208,11 +207,6 @@ static int __ref take_cpu_down(void *_param)
 
         cpu_notify(CPU_DYING | param->mod, param->hcpu);
 
-        if (task_cpu(param->caller) == cpu)
-                move_task_off_dead_cpu(cpu, param->caller);
-        /* Force idle task to run as soon as we yield: it should
-           immediately notice cpu is offline and die quickly. */
-        sched_idle_next();
         return 0;
 }
 
@@ -223,7 +217,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
         void *hcpu = (void *)(long)cpu;
         unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
         struct take_cpu_down_param tcd_param = {
-                .caller = current,
                 .mod = mod,
                 .hcpu = hcpu,
         };
@@ -253,9 +246,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
         }
         BUG_ON(cpu_online(cpu));
 
-        /* Wait for it to sleep (leaving idle task). */
-        while (!idle_cpu(cpu))
-                yield();
+        /*
+         * The migration_call() CPU_DYING callback will have removed all
+         * runnable tasks from the cpu, there's only the idle task left now
+         * that the migration thread is done doing the stop_machine thing.
+         */
+        BUG_ON(!idle_cpu(cpu));
 
         /* This actually kills the CPU. */
         __cpu_die(cpu);
diff --git a/kernel/sched.c b/kernel/sched.c
index 41f18695b730..b0d5f1b24a39 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2366,18 +2366,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                 return dest_cpu;
 
         /* No more Mr. Nice Guy. */
-        if (unlikely(dest_cpu >= nr_cpu_ids)) {
-                dest_cpu = cpuset_cpus_allowed_fallback(p);
-                /*
-                 * Don't tell them about moving exiting tasks or
-                 * kernel threads (both mm NULL), since they never
-                 * leave kernel.
-                 */
-                if (p->mm && printk_ratelimit()) {
-                        printk(KERN_INFO "process %d (%s) no "
-                               "longer affine to cpu%d\n",
-                               task_pid_nr(p), p->comm, cpu);
-                }
+        dest_cpu = cpuset_cpus_allowed_fallback(p);
+        /*
+         * Don't tell them about moving exiting tasks or
+         * kernel threads (both mm NULL), since they never
+         * leave kernel.
+         */
+        if (p->mm && printk_ratelimit()) {
+                printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+                       task_pid_nr(p), p->comm, cpu);
         }
 
         return dest_cpu;
@@ -5712,29 +5709,20 @@ static int migration_cpu_stop(void *data)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+
 /*
- * Figure out where task on dead CPU should go, use force if necessary.
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
  */
-void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void idle_task_exit(void)
 {
-        struct rq *rq = cpu_rq(dead_cpu);
-        int needs_cpu, uninitialized_var(dest_cpu);
-        unsigned long flags;
+        struct mm_struct *mm = current->active_mm;
 
-        local_irq_save(flags);
+        BUG_ON(cpu_online(smp_processor_id()));
 
-        raw_spin_lock(&rq->lock);
-        needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
-        if (needs_cpu)
-                dest_cpu = select_fallback_rq(dead_cpu, p);
-        raw_spin_unlock(&rq->lock);
-        /*
-         * It can only fail if we race with set_cpus_allowed(),
-         * in the racer should migrate the task anyway.
-         */
-        if (needs_cpu)
-                __migrate_task(p, dead_cpu, dest_cpu);
-        local_irq_restore(flags);
+        if (mm != &init_mm)
+                switch_mm(mm, &init_mm, current);
+        mmdrop(mm);
 }
 
 /*
@@ -5747,128 +5735,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
         struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-        unsigned long flags;
 
-        local_irq_save(flags);
-        double_rq_lock(rq_src, rq_dest);
         rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
         rq_src->nr_uninterruptible = 0;
-        double_rq_unlock(rq_src, rq_dest);
-        local_irq_restore(flags);
-}
-
-/* Run through task list and migrate tasks from the dead cpu. */
-static void migrate_live_tasks(int src_cpu)
-{
-        struct task_struct *p, *t;
-
-        read_lock(&tasklist_lock);
-
-        do_each_thread(t, p) {
-                if (p == current)
-                        continue;
-
-                if (task_cpu(p) == src_cpu)
-                        move_task_off_dead_cpu(src_cpu, p);
-        } while_each_thread(t, p);
-
-        read_unlock(&tasklist_lock);
 }
 
 /*
- * Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible.
- * Used by CPU offline code.
+ * remove the tasks which were accounted by rq from calc_load_tasks.
  */
-void sched_idle_next(void)
+static void calc_global_load_remove(struct rq *rq)
 {
-        int this_cpu = smp_processor_id();
-        struct rq *rq = cpu_rq(this_cpu);
-        struct task_struct *p = rq->idle;
-        unsigned long flags;
-
-        /* cpu has to be offline */
-        BUG_ON(cpu_online(this_cpu));
-
-        /*
-         * Strictly not necessary since rest of the CPUs are stopped by now
-         * and interrupts disabled on the current cpu.
-         */
-        raw_spin_lock_irqsave(&rq->lock, flags);
-
-        __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-
-        activate_task(rq, p, 0);
-
-        raw_spin_unlock_irqrestore(&rq->lock, flags);
+        atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+        rq->calc_load_active = 0;
 }
 
 /*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we'er in stop_machine() and
+ * there's no concurrency possible, we hold the required locks anyway
+ * because of lock validation efforts.
  */
-void idle_task_exit(void)
-{
-        struct mm_struct *mm = current->active_mm;
-
-        BUG_ON(cpu_online(smp_processor_id()));
-
-        if (mm != &init_mm)
-                switch_mm(mm, &init_mm, current);
-        mmdrop(mm);
-}
-
-/* called under rq->lock with disabled interrupts */
-static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
+static void migrate_tasks(unsigned int dead_cpu)
 {
         struct rq *rq = cpu_rq(dead_cpu);
-
-        /* Must be exiting, otherwise would be on tasklist. */
-        BUG_ON(!p->exit_state);
-
-        /* Cannot have done final schedule yet: would have vanished. */
-        BUG_ON(p->state == TASK_DEAD);
-
-        get_task_struct(p);
+        struct task_struct *next, *stop = rq->stop;
+        int dest_cpu;
 
         /*
-         * Drop lock around migration; if someone else moves it,
-         * that's OK. No task can be added to this CPU, so iteration is
-         * fine.
+         * Fudge the rq selection such that the below task selection loop
+         * doesn't get stuck on the currently eligible stop task.
+         *
+         * We're currently inside stop_machine() and the rq is either stuck
+         * in the stop_machine_cpu_stop() loop, or we're executing this code,
+         * either way we should never end up calling schedule() until we're
+         * done here.
          */
-        raw_spin_unlock_irq(&rq->lock);
-        move_task_off_dead_cpu(dead_cpu, p);
-        raw_spin_lock_irq(&rq->lock);
-
-        put_task_struct(p);
-}
-
-/* release_task() removes task from tasklist, so we won't find dead tasks. */
-static void migrate_dead_tasks(unsigned int dead_cpu)
-{
-        struct rq *rq = cpu_rq(dead_cpu);
-        struct task_struct *next;
+        rq->stop = NULL;
 
         for ( ; ; ) {
-                if (!rq->nr_running)
+                /*
+                 * There's this thread running, bail when that's the only
+                 * remaining thread.
+                 */
+                if (rq->nr_running == 1)
                         break;
+
                 next = pick_next_task(rq);
-                if (!next)
-                        break;
+                BUG_ON(!next);
                 next->sched_class->put_prev_task(rq, next);
-                migrate_dead(dead_cpu, next);
 
+                /* Find suitable destination for @next, with force if needed. */
+                dest_cpu = select_fallback_rq(dead_cpu, next);
+                raw_spin_unlock(&rq->lock);
+
+                __migrate_task(next, dead_cpu, dest_cpu);
+
+                raw_spin_lock(&rq->lock);
         }
-}
 
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
- */
-static void calc_global_load_remove(struct rq *rq)
-{
-        atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
-        rq->calc_load_active = 0;
+        rq->stop = stop;
 }
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6078,15 +6007,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
         unsigned long flags;
         struct rq *rq = cpu_rq(cpu);
 
-        switch (action) {
+        switch (action & ~CPU_TASKS_FROZEN) {
 
         case CPU_UP_PREPARE:
-        case CPU_UP_PREPARE_FROZEN:
                 rq->calc_load_update = calc_load_update;
                 break;
 
         case CPU_ONLINE:
-        case CPU_ONLINE_FROZEN:
                 /* Update our root-domain */
                 raw_spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
@@ -6098,30 +6025,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 break;
 
 #ifdef CONFIG_HOTPLUG_CPU
-        case CPU_DEAD:
-        case CPU_DEAD_FROZEN:
-                migrate_live_tasks(cpu);
-                /* Idle task back to normal (off runqueue, low prio) */
-                raw_spin_lock_irq(&rq->lock);
-                deactivate_task(rq, rq->idle, 0);
-                __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
-                rq->idle->sched_class = &idle_sched_class;
-                migrate_dead_tasks(cpu);
-                raw_spin_unlock_irq(&rq->lock);
-                migrate_nr_uninterruptible(rq);
-                BUG_ON(rq->nr_running != 0);
-                calc_global_load_remove(rq);
-                break;
-
         case CPU_DYING:
-        case CPU_DYING_FROZEN:
                 /* Update our root-domain */
                 raw_spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                         set_rq_offline(rq);
                 }
+                migrate_tasks(cpu);
+                BUG_ON(rq->nr_running != 1); /* the migration thread */
                 raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+                migrate_nr_uninterruptible(rq);
+                calc_global_load_remove(rq);
                 break;
 #endif
         }