diff options
author | Ashok Raj <ashok.raj@intel.com> | 2005-06-25 17:55:00 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-06-25 19:24:30 -0400 |
commit | 76e4f660d9f4c6d1bb473f72be2988c35eaca948 (patch) | |
tree | 3a0b3ae0cc12d6f0d96bf1f3b564f8470e0f72ae /arch/x86_64/kernel/smpboot.c | |
parent | e6982c671c560da4a0bc5c908cbcbec12bd5991d (diff) |
[PATCH] x86_64: CPU hotplug support
Experimental CPU hotplug patch for x86_64
-----------------------------------------
This supports logical CPU online and offline.
- Tested with maxcpus=1, then kicking off the other CPUs (bringing them online)
to check that the init code is all cleaned up. CONFIG_SCHED_SMT works as well.
- Idle threads are forked on demand from keventd threads for a clean startup.
TBD:
1. Not tested on a real NUMA machine (tested with numa=fake=2)
2. Handle ACPI pieces for physical hotplug support.
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Acked-by: Andi Kleen <ak@muc.de>
Acked-by: Zwane Mwaikambo <zwane@arm.linux.org.uk>
Signed-off-by: Shaohua.li <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/x86_64/kernel/smpboot.c')
-rw-r--r-- | arch/x86_64/kernel/smpboot.c | 205 |
1 files changed, 190 insertions, 15 deletions
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index bc98a6722cba..5a3f955b6576 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -34,6 +34,7 @@ | |||
34 | * Andi Kleen : Converted to new state machine. | 34 | * Andi Kleen : Converted to new state machine. |
35 | * Various cleanups. | 35 | * Various cleanups. |
36 | * Probably mostly hotplug CPU ready now. | 36 | * Probably mostly hotplug CPU ready now. |
37 | * Ashok Raj : CPU hotplug support | ||
37 | */ | 38 | */ |
38 | 39 | ||
39 | 40 | ||
@@ -98,6 +99,37 @@ EXPORT_SYMBOL(cpu_core_map); | |||
98 | extern unsigned char trampoline_data[]; | 99 | extern unsigned char trampoline_data[]; |
99 | extern unsigned char trampoline_end[]; | 100 | extern unsigned char trampoline_end[]; |
100 | 101 | ||
102 | /* State of each CPU */ | ||
103 | DEFINE_PER_CPU(int, cpu_state) = { 0 }; | ||
104 | |||
105 | /* | ||
106 | * Store all idle threads, this can be reused instead of creating | ||
107 | * a new thread. Also avoids complicated thread destroy functionality | ||
108 | * for idle threads. | ||
109 | */ | ||
110 | struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; | ||
111 | |||
112 | #define get_idle_for_cpu(x) (idle_thread_array[(x)]) | ||
113 | #define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) | ||
114 | |||
115 | /* | ||
116 | * cpu_possible_map should be static, it cannot change as cpu's | ||
117 | * are onlined, or offlined. The reason is per-cpu data-structures | ||
118 | * are allocated by some modules at init time, and don't expect to | |
119 | * do this dynamically on cpu arrival/departure. | ||
120 | * cpu_present_map on the other hand can change dynamically. | ||
121 | * In case when cpu_hotplug is not compiled, then we resort to current | ||
122 | * behaviour, which is cpu_possible == cpu_present. | ||
123 | * If cpu-hotplug is supported, then we need to preallocate for all | ||
124 | * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range. | ||
125 | * - Ashok Raj | ||
126 | */ | ||
127 | #ifdef CONFIG_HOTPLUG_CPU | ||
128 | #define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map) | ||
129 | #else | ||
130 | #define fixup_cpu_possible_map(x) | ||
131 | #endif | ||
132 | |||
101 | /* | 133 | /* |
102 | * Currently trivial. Write the real->protected mode | 134 | * Currently trivial. Write the real->protected mode |
103 | * bootstrap into the page concerned. The caller | 135 | * bootstrap into the page concerned. The caller |
@@ -623,33 +655,77 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta | |||
623 | return (send_status | accept_status); | 655 | return (send_status | accept_status); |
624 | } | 656 | } |
625 | 657 | ||
658 | struct create_idle { | ||
659 | struct task_struct *idle; | ||
660 | struct completion done; | ||
661 | int cpu; | ||
662 | }; | ||
663 | |||
664 | void do_fork_idle(void *_c_idle) | ||
665 | { | ||
666 | struct create_idle *c_idle = _c_idle; | ||
667 | |||
668 | c_idle->idle = fork_idle(c_idle->cpu); | ||
669 | complete(&c_idle->done); | ||
670 | } | ||
671 | |||
626 | /* | 672 | /* |
627 | * Boot one CPU. | 673 | * Boot one CPU. |
628 | */ | 674 | */ |
629 | static int __cpuinit do_boot_cpu(int cpu, int apicid) | 675 | static int __cpuinit do_boot_cpu(int cpu, int apicid) |
630 | { | 676 | { |
631 | struct task_struct *idle; | ||
632 | unsigned long boot_error; | 677 | unsigned long boot_error; |
633 | int timeout; | 678 | int timeout; |
634 | unsigned long start_rip; | 679 | unsigned long start_rip; |
680 | struct create_idle c_idle = { | ||
681 | .cpu = cpu, | ||
682 | .done = COMPLETION_INITIALIZER(c_idle.done), | ||
683 | }; | ||
684 | DECLARE_WORK(work, do_fork_idle, &c_idle); | ||
685 | |||
686 | c_idle.idle = get_idle_for_cpu(cpu); | ||
687 | |||
688 | if (c_idle.idle) { | ||
689 | c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) | ||
690 | (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1); | ||
691 | init_idle(c_idle.idle, cpu); | ||
692 | goto do_rest; | ||
693 | } | ||
694 | |||
635 | /* | 695 | /* |
636 | * We can't use kernel_thread since we must avoid to | 696 | * During cold boot process, keventd thread is not spun up yet. |
637 | * reschedule the child. | 697 | * When we do cpu hot-add, we create idle threads on the fly, we should |
698 | * not acquire any attributes from the calling context. Hence the clean | ||
699 | * way to create kernel_threads() is to do that from keventd(). | ||
700 | * We do the current_is_keventd() due to the fact that ACPI notifier | ||
701 | * was also queuing to keventd() and when the caller is already running | ||
702 | * in context of keventd(), we would end up with locking up the keventd | ||
703 | * thread. | ||
638 | */ | 704 | */ |
639 | idle = fork_idle(cpu); | 705 | if (!keventd_up() || current_is_keventd()) |
640 | if (IS_ERR(idle)) { | 706 | work.func(work.data); |
707 | else { | ||
708 | schedule_work(&work); | ||
709 | wait_for_completion(&c_idle.done); | ||
710 | } | ||
711 | |||
712 | if (IS_ERR(c_idle.idle)) { | ||
641 | printk("failed fork for CPU %d\n", cpu); | 713 | printk("failed fork for CPU %d\n", cpu); |
642 | return PTR_ERR(idle); | 714 | return PTR_ERR(c_idle.idle); |
643 | } | 715 | } |
644 | 716 | ||
645 | cpu_pda[cpu].pcurrent = idle; | 717 | set_idle_for_cpu(cpu, c_idle.idle); |
718 | |||
719 | do_rest: | ||
720 | |||
721 | cpu_pda[cpu].pcurrent = c_idle.idle; | ||
646 | 722 | ||
647 | start_rip = setup_trampoline(); | 723 | start_rip = setup_trampoline(); |
648 | 724 | ||
649 | init_rsp = idle->thread.rsp; | 725 | init_rsp = c_idle.idle->thread.rsp; |
650 | per_cpu(init_tss,cpu).rsp0 = init_rsp; | 726 | per_cpu(init_tss,cpu).rsp0 = init_rsp; |
651 | initial_code = start_secondary; | 727 | initial_code = start_secondary; |
652 | clear_ti_thread_flag(idle->thread_info, TIF_FORK); | 728 | clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK); |
653 | 729 | ||
654 | printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, | 730 | printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, |
655 | start_rip, init_rsp); | 731 | start_rip, init_rsp); |
@@ -925,10 +1001,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus) | |||
925 | int apicid = cpu_present_to_apicid(i); | 1001 | int apicid = cpu_present_to_apicid(i); |
926 | if (physid_isset(apicid, phys_cpu_present_map)) { | 1002 | if (physid_isset(apicid, phys_cpu_present_map)) { |
927 | cpu_set(i, cpu_present_map); | 1003 | cpu_set(i, cpu_present_map); |
928 | /* possible map would be different if we supported real | ||
929 | CPU hotplug. */ | ||
930 | cpu_set(i, cpu_possible_map); | 1004 | cpu_set(i, cpu_possible_map); |
931 | } | 1005 | } |
1006 | fixup_cpu_possible_map(i); | ||
932 | } | 1007 | } |
933 | 1008 | ||
934 | if (smp_sanity_check(max_cpus) < 0) { | 1009 | if (smp_sanity_check(max_cpus) < 0) { |
@@ -977,9 +1052,6 @@ void __init smp_prepare_boot_cpu(void) | |||
977 | 1052 | ||
978 | /* | 1053 | /* |
979 | * Entry point to boot a CPU. | 1054 | * Entry point to boot a CPU. |
980 | * | ||
981 | * This is all __cpuinit, not __devinit for now because we don't support | ||
982 | * CPU hotplug (yet). | ||
983 | */ | 1055 | */ |
984 | int __cpuinit __cpu_up(unsigned int cpu) | 1056 | int __cpuinit __cpu_up(unsigned int cpu) |
985 | { | 1057 | { |
@@ -996,6 +1068,14 @@ int __cpuinit __cpu_up(unsigned int cpu) | |||
996 | return -EINVAL; | 1068 | return -EINVAL; |
997 | } | 1069 | } |
998 | 1070 | ||
1071 | /* | ||
1072 | * Already booted CPU? | ||
1073 | */ | ||
1074 | if (cpu_isset(cpu, cpu_callin_map)) { | ||
1075 | Dprintk("do_boot_cpu %d Already started\n", cpu); | ||
1076 | return -ENOSYS; | ||
1077 | } | ||
1078 | |||
999 | /* Boot it! */ | 1079 | /* Boot it! */ |
1000 | err = do_boot_cpu(cpu, apicid); | 1080 | err = do_boot_cpu(cpu, apicid); |
1001 | if (err < 0) { | 1081 | if (err < 0) { |
@@ -1008,7 +1088,9 @@ int __cpuinit __cpu_up(unsigned int cpu) | |||
1008 | 1088 | ||
1009 | while (!cpu_isset(cpu, cpu_online_map)) | 1089 | while (!cpu_isset(cpu, cpu_online_map)) |
1010 | cpu_relax(); | 1090 | cpu_relax(); |
1011 | return 0; | 1091 | err = 0; |
1092 | |||
1093 | return err; | ||
1012 | } | 1094 | } |
1013 | 1095 | ||
1014 | /* | 1096 | /* |
@@ -1016,7 +1098,9 @@ int __cpuinit __cpu_up(unsigned int cpu) | |||
1016 | */ | 1098 | */ |
1017 | void __init smp_cpus_done(unsigned int max_cpus) | 1099 | void __init smp_cpus_done(unsigned int max_cpus) |
1018 | { | 1100 | { |
1101 | #ifndef CONFIG_HOTPLUG_CPU | ||
1019 | zap_low_mappings(); | 1102 | zap_low_mappings(); |
1103 | #endif | ||
1020 | smp_cleanup_boot(); | 1104 | smp_cleanup_boot(); |
1021 | 1105 | ||
1022 | #ifdef CONFIG_X86_IO_APIC | 1106 | #ifdef CONFIG_X86_IO_APIC |
@@ -1028,3 +1112,94 @@ void __init smp_cpus_done(unsigned int max_cpus) | |||
1028 | 1112 | ||
1029 | check_nmi_watchdog(); | 1113 | check_nmi_watchdog(); |
1030 | } | 1114 | } |
1115 | |||
1116 | #ifdef CONFIG_HOTPLUG_CPU | ||
1117 | |||
1118 | static void | ||
1119 | remove_siblinginfo(int cpu) | ||
1120 | { | ||
1121 | int sibling; | ||
1122 | |||
1123 | for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) | ||
1124 | cpu_clear(cpu, cpu_sibling_map[sibling]); | ||
1125 | for_each_cpu_mask(sibling, cpu_core_map[cpu]) | ||
1126 | cpu_clear(cpu, cpu_core_map[sibling]); | ||
1127 | cpus_clear(cpu_sibling_map[cpu]); | ||
1128 | cpus_clear(cpu_core_map[cpu]); | ||
1129 | phys_proc_id[cpu] = BAD_APICID; | ||
1130 | cpu_core_id[cpu] = BAD_APICID; | ||
1131 | } | ||
1132 | |||
1133 | void remove_cpu_from_maps(void) | ||
1134 | { | ||
1135 | int cpu = smp_processor_id(); | ||
1136 | |||
1137 | cpu_clear(cpu, cpu_callout_map); | ||
1138 | cpu_clear(cpu, cpu_callin_map); | ||
1139 | clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ | ||
1140 | } | ||
1141 | |||
1142 | int __cpu_disable(void) | ||
1143 | { | ||
1144 | int cpu = smp_processor_id(); | ||
1145 | |||
1146 | /* | ||
1147 | * Perhaps use cpufreq to drop frequency, but that could go | ||
1148 | * into generic code. | ||
1149 | * | ||
1150 | * We won't take down the boot processor on i386 due to some | ||
1151 | * interrupts only being able to be serviced by the BSP. | ||
1152 | * Especially so if we're not using an IOAPIC -zwane | ||
1153 | */ | ||
1154 | if (cpu == 0) | ||
1155 | return -EBUSY; | ||
1156 | |||
1157 | disable_APIC_timer(); | ||
1158 | |||
1159 | /* | ||
1160 | * HACK: | ||
1161 | * Allow any queued timer interrupts to get serviced | ||
1162 | * This is only a temporary solution until we cleanup | ||
1163 | * fixup_irqs as we do for IA64. | ||
1164 | */ | ||
1165 | local_irq_enable(); | ||
1166 | mdelay(1); | ||
1167 | |||
1168 | local_irq_disable(); | ||
1169 | remove_siblinginfo(cpu); | ||
1170 | |||
1171 | /* It's now safe to remove this processor from the online map */ | ||
1172 | cpu_clear(cpu, cpu_online_map); | ||
1173 | remove_cpu_from_maps(); | ||
1174 | fixup_irqs(cpu_online_map); | ||
1175 | return 0; | ||
1176 | } | ||
1177 | |||
1178 | void __cpu_die(unsigned int cpu) | ||
1179 | { | ||
1180 | /* We don't do anything here: idle task is faking death itself. */ | ||
1181 | unsigned int i; | ||
1182 | |||
1183 | for (i = 0; i < 10; i++) { | ||
1184 | /* They ack this in play_dead by setting CPU_DEAD */ | ||
1185 | if (per_cpu(cpu_state, cpu) == CPU_DEAD) | ||
1186 | return; | ||
1187 | current->state = TASK_UNINTERRUPTIBLE; | ||
1188 | schedule_timeout(HZ/10); | ||
1189 | } | ||
1190 | printk(KERN_ERR "CPU %u didn't die...\n", cpu); | ||
1191 | } | ||
1192 | |||
1193 | #else /* ... !CONFIG_HOTPLUG_CPU */ | ||
1194 | |||
1195 | int __cpu_disable(void) | ||
1196 | { | ||
1197 | return -ENOSYS; | ||
1198 | } | ||
1199 | |||
1200 | void __cpu_die(unsigned int cpu) | ||
1201 | { | ||
1202 | /* We said "no" in __cpu_disable */ | ||
1203 | BUG(); | ||
1204 | } | ||
1205 | #endif /* CONFIG_HOTPLUG_CPU */ | ||