aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64/kernel/smpboot.c
diff options
context:
space:
mode:
author: Ashok Raj <ashok.raj@intel.com> 2005-06-25 17:55:00 -0400
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-06-25 19:24:30 -0400
commit: 76e4f660d9f4c6d1bb473f72be2988c35eaca948 (patch)
tree: 3a0b3ae0cc12d6f0d96bf1f3b564f8470e0f72ae /arch/x86_64/kernel/smpboot.c
parent: e6982c671c560da4a0bc5c908cbcbec12bd5991d (diff)
[PATCH] x86_64: CPU hotplug support
Experimental CPU hotplug patch for x86_64
-----------------------------------------
This supports logical CPU online and offline.
- Test with maxcpus=1, and then kick other CPUs off to test if init code is all cleaned up. CONFIG_SCHED_SMT works as well.
- Idle threads are forked on demand from keventd threads for clean startup.

TBD:
1. Not tested on a real NUMA machine (tested with numa=fake=2).
2. Handle ACPI pieces for physical hotplug support.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Acked-by: Andi Kleen <ak@muc.de>
Acked-by: Zwane Mwaikambo <zwane@arm.linux.org.uk>
Signed-off-by: Shaohua.li<shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/x86_64/kernel/smpboot.c')
-rw-r--r-- arch/x86_64/kernel/smpboot.c 205
1 files changed, 190 insertions, 15 deletions
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index bc98a6722cba..5a3f955b6576 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -34,6 +34,7 @@
34 * Andi Kleen : Converted to new state machine. 34 * Andi Kleen : Converted to new state machine.
35 * Various cleanups. 35 * Various cleanups.
36 * Probably mostly hotplug CPU ready now. 36 * Probably mostly hotplug CPU ready now.
37 * Ashok Raj : CPU hotplug support
37 */ 38 */
38 39
39 40
@@ -98,6 +99,37 @@ EXPORT_SYMBOL(cpu_core_map);
98extern unsigned char trampoline_data[]; 99extern unsigned char trampoline_data[];
99extern unsigned char trampoline_end[]; 100extern unsigned char trampoline_end[];
100 101
102/* State of each CPU */
103DEFINE_PER_CPU(int, cpu_state) = { 0 };
104
105/*
106 * Store all idle threads; these can be reused instead of creating
107 * a new thread. This also avoids complicated thread-destroy functionality
108 * for idle threads.
109 */
110struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
111
112#define get_idle_for_cpu(x) (idle_thread_array[(x)])
113#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
114
115/*
116 * cpu_possible_map should be static; it cannot change as CPUs
117 * are onlined or offlined. The reason is that per-cpu data structures
118 * are allocated by some modules at init time, and they don't expect to
119 * do this dynamically on CPU arrival/departure.
120 * cpu_present_map, on the other hand, can change dynamically.
121 * When CPU hotplug is not compiled in, we resort to the current
122 * behaviour, which is cpu_possible == cpu_present.
123 * If CPU hotplug is supported, we need to preallocate for all
124 * NR_CPUS, hence cpu_possible_map represents the entire NR_CPUS range.
125 * - Ashok Raj
126 */
127#ifdef CONFIG_HOTPLUG_CPU
128#define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map)
129#else
130#define fixup_cpu_possible_map(x)
131#endif
132
101/* 133/*
102 * Currently trivial. Write the real->protected mode 134 * Currently trivial. Write the real->protected mode
103 * bootstrap into the page concerned. The caller 135 * bootstrap into the page concerned. The caller
@@ -623,33 +655,77 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
623 return (send_status | accept_status); 655 return (send_status | accept_status);
624} 656}
625 657
658struct create_idle {
659 struct task_struct *idle;
660 struct completion done;
661 int cpu;
662};
663
664void do_fork_idle(void *_c_idle)
665{
666 struct create_idle *c_idle = _c_idle;
667
668 c_idle->idle = fork_idle(c_idle->cpu);
669 complete(&c_idle->done);
670}
671
626/* 672/*
627 * Boot one CPU. 673 * Boot one CPU.
628 */ 674 */
629static int __cpuinit do_boot_cpu(int cpu, int apicid) 675static int __cpuinit do_boot_cpu(int cpu, int apicid)
630{ 676{
631 struct task_struct *idle;
632 unsigned long boot_error; 677 unsigned long boot_error;
633 int timeout; 678 int timeout;
634 unsigned long start_rip; 679 unsigned long start_rip;
680 struct create_idle c_idle = {
681 .cpu = cpu,
682 .done = COMPLETION_INITIALIZER(c_idle.done),
683 };
684 DECLARE_WORK(work, do_fork_idle, &c_idle);
685
686 c_idle.idle = get_idle_for_cpu(cpu);
687
688 if (c_idle.idle) {
689 c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
690 (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1);
691 init_idle(c_idle.idle, cpu);
692 goto do_rest;
693 }
694
635 /* 695 /*
636 * We can't use kernel_thread since we must avoid to 696 * During cold boot process, keventd thread is not spun up yet.
637 * reschedule the child. 697 * When we do cpu hot-add, we create idle threads on the fly, we should
698 * not acquire any attributes from the calling context. Hence the clean
699 * way to create kernel_threads() is to do that from keventd().
700 * We do the current_is_keventd() due to the fact that ACPI notifier
701 * was also queuing to keventd() and when the caller is already running
702 * in context of keventd(), we would end up with locking up the keventd
703 * thread.
638 */ 704 */
639 idle = fork_idle(cpu); 705 if (!keventd_up() || current_is_keventd())
640 if (IS_ERR(idle)) { 706 work.func(work.data);
707 else {
708 schedule_work(&work);
709 wait_for_completion(&c_idle.done);
710 }
711
712 if (IS_ERR(c_idle.idle)) {
641 printk("failed fork for CPU %d\n", cpu); 713 printk("failed fork for CPU %d\n", cpu);
642 return PTR_ERR(idle); 714 return PTR_ERR(c_idle.idle);
643 } 715 }
644 716
645 cpu_pda[cpu].pcurrent = idle; 717 set_idle_for_cpu(cpu, c_idle.idle);
718
719do_rest:
720
721 cpu_pda[cpu].pcurrent = c_idle.idle;
646 722
647 start_rip = setup_trampoline(); 723 start_rip = setup_trampoline();
648 724
649 init_rsp = idle->thread.rsp; 725 init_rsp = c_idle.idle->thread.rsp;
650 per_cpu(init_tss,cpu).rsp0 = init_rsp; 726 per_cpu(init_tss,cpu).rsp0 = init_rsp;
651 initial_code = start_secondary; 727 initial_code = start_secondary;
652 clear_ti_thread_flag(idle->thread_info, TIF_FORK); 728 clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK);
653 729
654 printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, 730 printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
655 start_rip, init_rsp); 731 start_rip, init_rsp);
@@ -925,10 +1001,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
925 int apicid = cpu_present_to_apicid(i); 1001 int apicid = cpu_present_to_apicid(i);
926 if (physid_isset(apicid, phys_cpu_present_map)) { 1002 if (physid_isset(apicid, phys_cpu_present_map)) {
927 cpu_set(i, cpu_present_map); 1003 cpu_set(i, cpu_present_map);
928 /* possible map would be different if we supported real
929 CPU hotplug. */
930 cpu_set(i, cpu_possible_map); 1004 cpu_set(i, cpu_possible_map);
931 } 1005 }
1006 fixup_cpu_possible_map(i);
932 } 1007 }
933 1008
934 if (smp_sanity_check(max_cpus) < 0) { 1009 if (smp_sanity_check(max_cpus) < 0) {
@@ -977,9 +1052,6 @@ void __init smp_prepare_boot_cpu(void)
977 1052
978/* 1053/*
979 * Entry point to boot a CPU. 1054 * Entry point to boot a CPU.
980 *
981 * This is all __cpuinit, not __devinit for now because we don't support
982 * CPU hotplug (yet).
983 */ 1055 */
984int __cpuinit __cpu_up(unsigned int cpu) 1056int __cpuinit __cpu_up(unsigned int cpu)
985{ 1057{
@@ -996,6 +1068,14 @@ int __cpuinit __cpu_up(unsigned int cpu)
996 return -EINVAL; 1068 return -EINVAL;
997 } 1069 }
998 1070
1071 /*
1072 * Already booted CPU?
1073 */
1074 if (cpu_isset(cpu, cpu_callin_map)) {
1075 Dprintk("do_boot_cpu %d Already started\n", cpu);
1076 return -ENOSYS;
1077 }
1078
999 /* Boot it! */ 1079 /* Boot it! */
1000 err = do_boot_cpu(cpu, apicid); 1080 err = do_boot_cpu(cpu, apicid);
1001 if (err < 0) { 1081 if (err < 0) {
@@ -1008,7 +1088,9 @@ int __cpuinit __cpu_up(unsigned int cpu)
1008 1088
1009 while (!cpu_isset(cpu, cpu_online_map)) 1089 while (!cpu_isset(cpu, cpu_online_map))
1010 cpu_relax(); 1090 cpu_relax();
1011 return 0; 1091 err = 0;
1092
1093 return err;
1012} 1094}
1013 1095
1014/* 1096/*
@@ -1016,7 +1098,9 @@ int __cpuinit __cpu_up(unsigned int cpu)
1016 */ 1098 */
1017void __init smp_cpus_done(unsigned int max_cpus) 1099void __init smp_cpus_done(unsigned int max_cpus)
1018{ 1100{
1101#ifndef CONFIG_HOTPLUG_CPU
1019 zap_low_mappings(); 1102 zap_low_mappings();
1103#endif
1020 smp_cleanup_boot(); 1104 smp_cleanup_boot();
1021 1105
1022#ifdef CONFIG_X86_IO_APIC 1106#ifdef CONFIG_X86_IO_APIC
@@ -1028,3 +1112,94 @@ void __init smp_cpus_done(unsigned int max_cpus)
1028 1112
1029 check_nmi_watchdog(); 1113 check_nmi_watchdog();
1030} 1114}
1115
1116#ifdef CONFIG_HOTPLUG_CPU
1117
1118static void
1119remove_siblinginfo(int cpu)
1120{
1121 int sibling;
1122
1123 for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
1124 cpu_clear(cpu, cpu_sibling_map[sibling]);
1125 for_each_cpu_mask(sibling, cpu_core_map[cpu])
1126 cpu_clear(cpu, cpu_core_map[sibling]);
1127 cpus_clear(cpu_sibling_map[cpu]);
1128 cpus_clear(cpu_core_map[cpu]);
1129 phys_proc_id[cpu] = BAD_APICID;
1130 cpu_core_id[cpu] = BAD_APICID;
1131}
1132
1133void remove_cpu_from_maps(void)
1134{
1135 int cpu = smp_processor_id();
1136
1137 cpu_clear(cpu, cpu_callout_map);
1138 cpu_clear(cpu, cpu_callin_map);
1139 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
1140}
1141
1142int __cpu_disable(void)
1143{
1144 int cpu = smp_processor_id();
1145
1146 /*
1147 * Perhaps use cpufreq to drop frequency, but that could go
1148 * into generic code.
1149 *
1150 * As on i386, we won't take down the boot processor (CPU 0) due to
1151 * some interrupts only being able to be serviced by the BSP.
1152 * Especially so if we're not using an IOAPIC -zwane
1153 */
1154 if (cpu == 0)
1155 return -EBUSY;
1156
1157 disable_APIC_timer();
1158
1159 /*
1160 * HACK:
1161 * Allow any queued timer interrupts to get serviced
1162 * This is only a temporary solution until we cleanup
1163 * fixup_irqs as we do for IA64.
1164 */
1165 local_irq_enable();
1166 mdelay(1);
1167
1168 local_irq_disable();
1169 remove_siblinginfo(cpu);
1170
1171 /* It's now safe to remove this processor from the online map */
1172 cpu_clear(cpu, cpu_online_map);
1173 remove_cpu_from_maps();
1174 fixup_irqs(cpu_online_map);
1175 return 0;
1176}
1177
1178void __cpu_die(unsigned int cpu)
1179{
1180 /* The idle task fakes death itself; here we only poll (10 x HZ/10) for its CPU_DEAD ack. */
1181 unsigned int i;
1182
1183 for (i = 0; i < 10; i++) {
1184 /* They ack this in play_dead by setting CPU_DEAD */
1185 if (per_cpu(cpu_state, cpu) == CPU_DEAD)
1186 return;
1187 current->state = TASK_UNINTERRUPTIBLE;
1188 schedule_timeout(HZ/10);
1189 }
1190 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1191}
1192
1193#else /* ... !CONFIG_HOTPLUG_CPU */
1194
1195int __cpu_disable(void)
1196{
1197 return -ENOSYS;
1198}
1199
1200void __cpu_die(unsigned int cpu)
1201{
1202 /* We said "no" in __cpu_disable */
1203 BUG();
1204}
1205#endif /* CONFIG_HOTPLUG_CPU */