aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64
diff options
context:
space:
mode:
author Ashok Raj <ashok.raj@intel.com> 2005-06-25 17:55:00 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org> 2005-06-25 19:24:30 -0400
commit 76e4f660d9f4c6d1bb473f72be2988c35eaca948 (patch)
tree 3a0b3ae0cc12d6f0d96bf1f3b564f8470e0f72ae /arch/x86_64
parent e6982c671c560da4a0bc5c908cbcbec12bd5991d (diff)
[PATCH] x86_64: CPU hotplug support
Experimental CPU hotplug patch for x86_64
-----------------------------------------
This supports logical CPU online and offline.

- Test with maxcpus=1, and then kick other CPUs off to test if init code
  is all cleaned up. CONFIG_SCHED_SMT works as well.
- Idle threads are forked on demand from keventd threads for clean startup.

TBD:
1. Not tested on a real NUMA machine (tested with numa=fake=2).
2. Handle ACPI pieces for physical hotplug support.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Acked-by: Andi Kleen <ak@muc.de>
Acked-by: Zwane Mwaikambo <zwane@arm.linux.org.uk>
Signed-off-by: Shaohua.li<shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/x86_64')
-rw-r--r-- arch/x86_64/Kconfig 9
-rw-r--r-- arch/x86_64/kernel/irq.c 29
-rw-r--r-- arch/x86_64/kernel/process.c 29
-rw-r--r-- arch/x86_64/kernel/smpboot.c 205
-rw-r--r-- arch/x86_64/kernel/traps.c 8
5 files changed, 263 insertions, 17 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index db259757dc8a..92f5a5266023 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -313,6 +313,15 @@ config NR_CPUS
313 This is purely to save memory - each supported CPU requires 313 This is purely to save memory - each supported CPU requires
314 memory in the static kernel configuration. 314 memory in the static kernel configuration.
315 315
316config HOTPLUG_CPU
317 bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
318 depends on SMP && HOTPLUG && EXPERIMENTAL
319 help
320 Say Y here to experiment with turning CPUs off and on. CPUs
321 can be controlled through /sys/devices/system/cpu/cpu#.
322 Say N if you want to disable CPU hotplug.
323
324
316config HPET_TIMER 325config HPET_TIMER
317 bool 326 bool
318 default y 327 default y
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 62b112e4deb4..cc3fb85f5145 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -14,6 +14,7 @@
14#include <linux/interrupt.h> 14#include <linux/interrupt.h>
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/delay.h>
17#include <asm/uaccess.h> 18#include <asm/uaccess.h>
18#include <asm/io_apic.h> 19#include <asm/io_apic.h>
19 20
@@ -106,3 +107,31 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
106 return 1; 107 return 1;
107} 108}
108 109
110#ifdef CONFIG_HOTPLUG_CPU
111void fixup_irqs(cpumask_t map)
112{
113 unsigned int irq;
114 static int warned;
115
116 for (irq = 0; irq < NR_IRQS; irq++) {
117 cpumask_t mask;
118 if (irq == 2)
119 continue;
120
121 cpus_and(mask, irq_affinity[irq], map);
122 if (any_online_cpu(mask) == NR_CPUS) {
123 printk("Breaking affinity for irq %i\n", irq);
124 mask = map;
125 }
126 if (irq_desc[irq].handler->set_affinity)
127 irq_desc[irq].handler->set_affinity(irq, mask);
128 else if (irq_desc[irq].action && !(warned++))
129 printk("Cannot set affinity for irq %i\n", irq);
130 }
131
132 /* That doesn't seem sufficient. Give it 1ms. */
133 local_irq_enable();
134 mdelay(1);
135 local_irq_disable();
136}
137#endif
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 15fa35a81a72..1d91271796e5 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -8,7 +8,8 @@
8 * 8 *
9 * X86-64 port 9 * X86-64 port
10 * Andi Kleen. 10 * Andi Kleen.
11 * 11 *
12 * CPU hotplug support - ashok.raj@intel.com
12 * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ 13 * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
13 */ 14 */
14 15
@@ -18,6 +19,7 @@
18 19
19#include <stdarg.h> 20#include <stdarg.h>
20 21
22#include <linux/cpu.h>
21#include <linux/errno.h> 23#include <linux/errno.h>
22#include <linux/sched.h> 24#include <linux/sched.h>
23#include <linux/kernel.h> 25#include <linux/kernel.h>
@@ -154,6 +156,29 @@ void cpu_idle_wait(void)
154} 156}
155EXPORT_SYMBOL_GPL(cpu_idle_wait); 157EXPORT_SYMBOL_GPL(cpu_idle_wait);
156 158
159#ifdef CONFIG_HOTPLUG_CPU
160DECLARE_PER_CPU(int, cpu_state);
161
162#include <asm/nmi.h>
163/* We don't actually take CPU down, just spin without interrupts. */
164static inline void play_dead(void)
165{
166 idle_task_exit();
167 wbinvd();
168 mb();
169 /* Ack it */
170 __get_cpu_var(cpu_state) = CPU_DEAD;
171
172 while (1)
173 safe_halt();
174}
175#else
176static inline void play_dead(void)
177{
178 BUG();
179}
180#endif /* CONFIG_HOTPLUG_CPU */
181
157/* 182/*
158 * The idle thread. There's no useful work to be 183 * The idle thread. There's no useful work to be
159 * done, so just try to conserve power and have a 184 * done, so just try to conserve power and have a
@@ -174,6 +199,8 @@ void cpu_idle (void)
174 idle = pm_idle; 199 idle = pm_idle;
175 if (!idle) 200 if (!idle)
176 idle = default_idle; 201 idle = default_idle;
202 if (cpu_is_offline(smp_processor_id()))
203 play_dead();
177 idle(); 204 idle();
178 } 205 }
179 206
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index bc98a6722cba..5a3f955b6576 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -34,6 +34,7 @@
34 * Andi Kleen : Converted to new state machine. 34 * Andi Kleen : Converted to new state machine.
35 * Various cleanups. 35 * Various cleanups.
36 * Probably mostly hotplug CPU ready now. 36 * Probably mostly hotplug CPU ready now.
37 * Ashok Raj : CPU hotplug support
37 */ 38 */
38 39
39 40
@@ -98,6 +99,37 @@ EXPORT_SYMBOL(cpu_core_map);
98extern unsigned char trampoline_data[]; 99extern unsigned char trampoline_data[];
99extern unsigned char trampoline_end[]; 100extern unsigned char trampoline_end[];
100 101
102/* State of each CPU */
103DEFINE_PER_CPU(int, cpu_state) = { 0 };
104
105/*
106 * Store all idle threads, this can be reused instead of creating
107 * a new thread. Also avoids complicated thread destroy functionality
108 * for idle threads.
109 */
110struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
111
112#define get_idle_for_cpu(x) (idle_thread_array[(x)])
113#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
114
115/*
116 * cpu_possible_map should be static, it cannot change as cpu's
117 * are onlined, or offlined. The reason is per-cpu data-structures
118 * are allocated by some modules at init time, and dont expect to
119 * do this dynamically on cpu arrival/departure.
120 * cpu_present_map on the other hand can change dynamically.
121 * In case when cpu_hotplug is not compiled, then we resort to current
122 * behaviour, which is cpu_possible == cpu_present.
123 * If cpu-hotplug is supported, then we need to preallocate for all
124 * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
125 * - Ashok Raj
126 */
127#ifdef CONFIG_HOTPLUG_CPU
128#define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map)
129#else
130#define fixup_cpu_possible_map(x)
131#endif
132
101/* 133/*
102 * Currently trivial. Write the real->protected mode 134 * Currently trivial. Write the real->protected mode
103 * bootstrap into the page concerned. The caller 135 * bootstrap into the page concerned. The caller
@@ -623,33 +655,77 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
623 return (send_status | accept_status); 655 return (send_status | accept_status);
624} 656}
625 657
658struct create_idle {
659 struct task_struct *idle;
660 struct completion done;
661 int cpu;
662};
663
664void do_fork_idle(void *_c_idle)
665{
666 struct create_idle *c_idle = _c_idle;
667
668 c_idle->idle = fork_idle(c_idle->cpu);
669 complete(&c_idle->done);
670}
671
626/* 672/*
627 * Boot one CPU. 673 * Boot one CPU.
628 */ 674 */
629static int __cpuinit do_boot_cpu(int cpu, int apicid) 675static int __cpuinit do_boot_cpu(int cpu, int apicid)
630{ 676{
631 struct task_struct *idle;
632 unsigned long boot_error; 677 unsigned long boot_error;
633 int timeout; 678 int timeout;
634 unsigned long start_rip; 679 unsigned long start_rip;
680 struct create_idle c_idle = {
681 .cpu = cpu,
682 .done = COMPLETION_INITIALIZER(c_idle.done),
683 };
684 DECLARE_WORK(work, do_fork_idle, &c_idle);
685
686 c_idle.idle = get_idle_for_cpu(cpu);
687
688 if (c_idle.idle) {
689 c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
690 (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1);
691 init_idle(c_idle.idle, cpu);
692 goto do_rest;
693 }
694
635 /* 695 /*
636 * We can't use kernel_thread since we must avoid to 696 * During cold boot process, keventd thread is not spun up yet.
637 * reschedule the child. 697 * When we do cpu hot-add, we create idle threads on the fly, we should
698 * not acquire any attributes from the calling context. Hence the clean
699 * way to create kernel_threads() is to do that from keventd().
700 * We do the current_is_keventd() due to the fact that ACPI notifier
701 * was also queuing to keventd() and when the caller is already running
702 * in context of keventd(), we would end up with locking up the keventd
703 * thread.
638 */ 704 */
639 idle = fork_idle(cpu); 705 if (!keventd_up() || current_is_keventd())
640 if (IS_ERR(idle)) { 706 work.func(work.data);
707 else {
708 schedule_work(&work);
709 wait_for_completion(&c_idle.done);
710 }
711
712 if (IS_ERR(c_idle.idle)) {
641 printk("failed fork for CPU %d\n", cpu); 713 printk("failed fork for CPU %d\n", cpu);
642 return PTR_ERR(idle); 714 return PTR_ERR(c_idle.idle);
643 } 715 }
644 716
645 cpu_pda[cpu].pcurrent = idle; 717 set_idle_for_cpu(cpu, c_idle.idle);
718
719do_rest:
720
721 cpu_pda[cpu].pcurrent = c_idle.idle;
646 722
647 start_rip = setup_trampoline(); 723 start_rip = setup_trampoline();
648 724
649 init_rsp = idle->thread.rsp; 725 init_rsp = c_idle.idle->thread.rsp;
650 per_cpu(init_tss,cpu).rsp0 = init_rsp; 726 per_cpu(init_tss,cpu).rsp0 = init_rsp;
651 initial_code = start_secondary; 727 initial_code = start_secondary;
652 clear_ti_thread_flag(idle->thread_info, TIF_FORK); 728 clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK);
653 729
654 printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, 730 printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
655 start_rip, init_rsp); 731 start_rip, init_rsp);
@@ -925,10 +1001,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
925 int apicid = cpu_present_to_apicid(i); 1001 int apicid = cpu_present_to_apicid(i);
926 if (physid_isset(apicid, phys_cpu_present_map)) { 1002 if (physid_isset(apicid, phys_cpu_present_map)) {
927 cpu_set(i, cpu_present_map); 1003 cpu_set(i, cpu_present_map);
928 /* possible map would be different if we supported real
929 CPU hotplug. */
930 cpu_set(i, cpu_possible_map); 1004 cpu_set(i, cpu_possible_map);
931 } 1005 }
1006 fixup_cpu_possible_map(i);
932 } 1007 }
933 1008
934 if (smp_sanity_check(max_cpus) < 0) { 1009 if (smp_sanity_check(max_cpus) < 0) {
@@ -977,9 +1052,6 @@ void __init smp_prepare_boot_cpu(void)
977 1052
978/* 1053/*
979 * Entry point to boot a CPU. 1054 * Entry point to boot a CPU.
980 *
981 * This is all __cpuinit, not __devinit for now because we don't support
982 * CPU hotplug (yet).
983 */ 1055 */
984int __cpuinit __cpu_up(unsigned int cpu) 1056int __cpuinit __cpu_up(unsigned int cpu)
985{ 1057{
@@ -996,6 +1068,14 @@ int __cpuinit __cpu_up(unsigned int cpu)
996 return -EINVAL; 1068 return -EINVAL;
997 } 1069 }
998 1070
1071 /*
1072 * Already booted CPU?
1073 */
1074 if (cpu_isset(cpu, cpu_callin_map)) {
1075 Dprintk("do_boot_cpu %d Already started\n", cpu);
1076 return -ENOSYS;
1077 }
1078
999 /* Boot it! */ 1079 /* Boot it! */
1000 err = do_boot_cpu(cpu, apicid); 1080 err = do_boot_cpu(cpu, apicid);
1001 if (err < 0) { 1081 if (err < 0) {
@@ -1008,7 +1088,9 @@ int __cpuinit __cpu_up(unsigned int cpu)
1008 1088
1009 while (!cpu_isset(cpu, cpu_online_map)) 1089 while (!cpu_isset(cpu, cpu_online_map))
1010 cpu_relax(); 1090 cpu_relax();
1011 return 0; 1091 err = 0;
1092
1093 return err;
1012} 1094}
1013 1095
1014/* 1096/*
@@ -1016,7 +1098,9 @@ int __cpuinit __cpu_up(unsigned int cpu)
1016 */ 1098 */
1017void __init smp_cpus_done(unsigned int max_cpus) 1099void __init smp_cpus_done(unsigned int max_cpus)
1018{ 1100{
1101#ifndef CONFIG_HOTPLUG_CPU
1019 zap_low_mappings(); 1102 zap_low_mappings();
1103#endif
1020 smp_cleanup_boot(); 1104 smp_cleanup_boot();
1021 1105
1022#ifdef CONFIG_X86_IO_APIC 1106#ifdef CONFIG_X86_IO_APIC
@@ -1028,3 +1112,94 @@ void __init smp_cpus_done(unsigned int max_cpus)
1028 1112
1029 check_nmi_watchdog(); 1113 check_nmi_watchdog();
1030} 1114}
1115
1116#ifdef CONFIG_HOTPLUG_CPU
1117
1118static void
1119remove_siblinginfo(int cpu)
1120{
1121 int sibling;
1122
1123 for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
1124 cpu_clear(cpu, cpu_sibling_map[sibling]);
1125 for_each_cpu_mask(sibling, cpu_core_map[cpu])
1126 cpu_clear(cpu, cpu_core_map[sibling]);
1127 cpus_clear(cpu_sibling_map[cpu]);
1128 cpus_clear(cpu_core_map[cpu]);
1129 phys_proc_id[cpu] = BAD_APICID;
1130 cpu_core_id[cpu] = BAD_APICID;
1131}
1132
1133void remove_cpu_from_maps(void)
1134{
1135 int cpu = smp_processor_id();
1136
1137 cpu_clear(cpu, cpu_callout_map);
1138 cpu_clear(cpu, cpu_callin_map);
1139 clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
1140}
1141
1142int __cpu_disable(void)
1143{
1144 int cpu = smp_processor_id();
1145
1146 /*
1147 * Perhaps use cpufreq to drop frequency, but that could go
1148 * into generic code.
1149 *
1150 * We won't take down the boot processor on i386 due to some
1151 * interrupts only being able to be serviced by the BSP.
1152 * Especially so if we're not using an IOAPIC -zwane
1153 */
1154 if (cpu == 0)
1155 return -EBUSY;
1156
1157 disable_APIC_timer();
1158
1159 /*
1160 * HACK:
1161 * Allow any queued timer interrupts to get serviced
1162 * This is only a temporary solution until we cleanup
1163 * fixup_irqs as we do for IA64.
1164 */
1165 local_irq_enable();
1166 mdelay(1);
1167
1168 local_irq_disable();
1169 remove_siblinginfo(cpu);
1170
1171 /* It's now safe to remove this processor from the online map */
1172 cpu_clear(cpu, cpu_online_map);
1173 remove_cpu_from_maps();
1174 fixup_irqs(cpu_online_map);
1175 return 0;
1176}
1177
1178void __cpu_die(unsigned int cpu)
1179{
1180 /* We don't do anything here: idle task is faking death itself. */
1181 unsigned int i;
1182
1183 for (i = 0; i < 10; i++) {
1184 /* They ack this in play_dead by setting CPU_DEAD */
1185 if (per_cpu(cpu_state, cpu) == CPU_DEAD)
1186 return;
1187 current->state = TASK_UNINTERRUPTIBLE;
1188 schedule_timeout(HZ/10);
1189 }
1190 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1191}
1192
1193#else /* ... !CONFIG_HOTPLUG_CPU */
1194
1195int __cpu_disable(void)
1196{
1197 return -ENOSYS;
1198}
1199
1200void __cpu_die(unsigned int cpu)
1201{
1202 /* We said "no" in __cpu_disable */
1203 BUG();
1204}
1205#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 121646fc43f6..102736630002 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -586,11 +586,17 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
586asmlinkage void default_do_nmi(struct pt_regs *regs) 586asmlinkage void default_do_nmi(struct pt_regs *regs)
587{ 587{
588 unsigned char reason = 0; 588 unsigned char reason = 0;
589 int cpu;
590
591 cpu = smp_processor_id();
589 592
590 /* Only the BSP gets external NMIs from the system. */ 593 /* Only the BSP gets external NMIs from the system. */
591 if (!smp_processor_id()) 594 if (!cpu)
592 reason = get_nmi_reason(); 595 reason = get_nmi_reason();
593 596
597 if (!cpu_online(cpu))
598 return;
599
594 if (!(reason & 0xc0)) { 600 if (!(reason & 0xc0)) {
595 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) 601 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
596 == NOTIFY_STOP) 602 == NOTIFY_STOP)