aboutsummaryrefslogtreecommitdiffstats
path: root/arch/i386/kernel
diff options
context:
space:
mode:
authorDave Jones <davej@redhat.com>2006-12-12 17:41:41 -0500
committerDave Jones <davej@redhat.com>2006-12-12 17:41:41 -0500
commitc4366889dda8110247be59ca41fddb82951a8c26 (patch)
tree705c1a996bed8fd48ce94ff33ec9fd00f9b94875 /arch/i386/kernel
parentdb2fb9db5735cc532fd4fc55e94b9a3c3750378e (diff)
parente1036502e5263851259d147771226161e5ccc85a (diff)
Merge ../linus
Conflicts: drivers/cpufreq/cpufreq.c
Diffstat (limited to 'arch/i386/kernel')
-rw-r--r--arch/i386/kernel/Makefile5
-rw-r--r--arch/i386/kernel/acpi/boot.c20
-rw-r--r--arch/i386/kernel/acpi/cstate.c7
-rw-r--r--arch/i386/kernel/acpi/earlyquirk.c29
-rw-r--r--arch/i386/kernel/alternative.c68
-rw-r--r--arch/i386/kernel/apic.c22
-rw-r--r--arch/i386/kernel/apm.c42
-rw-r--r--arch/i386/kernel/asm-offsets.c39
-rw-r--r--arch/i386/kernel/cpu/amd.c5
-rw-r--r--arch/i386/kernel/cpu/common.c249
-rw-r--r--arch/i386/kernel/cpu/intel.c12
-rw-r--r--arch/i386/kernel/cpu/intel_cacheinfo.c11
-rw-r--r--arch/i386/kernel/cpu/mcheck/non-fatal.c6
-rw-r--r--arch/i386/kernel/cpu/mcheck/therm_throt.c3
-rw-r--r--arch/i386/kernel/cpu/mtrr/Makefile4
-rw-r--r--arch/i386/kernel/cpu/mtrr/amd.c2
-rw-r--r--arch/i386/kernel/cpu/mtrr/centaur.c9
-rw-r--r--arch/i386/kernel/cpu/mtrr/cyrix.c25
-rw-r--r--arch/i386/kernel/cpu/mtrr/generic.c78
-rw-r--r--arch/i386/kernel/cpu/mtrr/if.c31
-rw-r--r--arch/i386/kernel/cpu/mtrr/main.c71
-rw-r--r--arch/i386/kernel/cpu/mtrr/mtrr.h25
-rw-r--r--arch/i386/kernel/cpu/proc.c3
-rw-r--r--arch/i386/kernel/cpuid.c27
-rw-r--r--arch/i386/kernel/crash.c66
-rw-r--r--arch/i386/kernel/e820.c894
-rw-r--r--arch/i386/kernel/efi.c17
-rw-r--r--arch/i386/kernel/entry.S331
-rw-r--r--arch/i386/kernel/head.S68
-rw-r--r--arch/i386/kernel/hpet.c7
-rw-r--r--arch/i386/kernel/i8253.c2
-rw-r--r--arch/i386/kernel/i8259.c12
-rw-r--r--arch/i386/kernel/io_apic.c179
-rw-r--r--arch/i386/kernel/irq.c2
-rw-r--r--arch/i386/kernel/kprobes.c26
-rw-r--r--arch/i386/kernel/ldt.c4
-rw-r--r--arch/i386/kernel/mca.c13
-rw-r--r--arch/i386/kernel/microcode.c4
-rw-r--r--arch/i386/kernel/module.c15
-rw-r--r--arch/i386/kernel/mpparse.c2
-rw-r--r--arch/i386/kernel/msr.c31
-rw-r--r--arch/i386/kernel/nmi.c60
-rw-r--r--arch/i386/kernel/paravirt.c569
-rw-r--r--arch/i386/kernel/pci-dma.c10
-rw-r--r--arch/i386/kernel/process.c91
-rw-r--r--arch/i386/kernel/ptrace.c18
-rw-r--r--arch/i386/kernel/quirks.c69
-rw-r--r--arch/i386/kernel/reboot.c1
-rw-r--r--arch/i386/kernel/setup.c855
-rw-r--r--arch/i386/kernel/signal.c6
-rw-r--r--arch/i386/kernel/smp.c10
-rw-r--r--arch/i386/kernel/smpboot.c84
-rw-r--r--arch/i386/kernel/sysenter.c6
-rw-r--r--arch/i386/kernel/time.c15
-rw-r--r--arch/i386/kernel/time_hpet.c15
-rw-r--r--arch/i386/kernel/topology.c8
-rw-r--r--arch/i386/kernel/traps.c163
-rw-r--r--arch/i386/kernel/tsc.c11
-rw-r--r--arch/i386/kernel/vm86.c121
-rw-r--r--arch/i386/kernel/vmlinux.lds.S161
60 files changed, 2980 insertions, 1759 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 1a884b6e6e5c..1e8988e558c5 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -6,7 +6,7 @@ extra-y := head.o init_task.o vmlinux.lds
6 6
7obj-y := process.o signal.o entry.o traps.o irq.o \ 7obj-y := process.o signal.o entry.o traps.o irq.o \
8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ 8 ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
9 pci-dma.o i386_ksyms.o i387.o bootflag.o \ 9 pci-dma.o i386_ksyms.o i387.o bootflag.o e820.o\
10 quirks.o i8237.o topology.o alternative.o i8253.o tsc.o 10 quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
11 11
12obj-$(CONFIG_STACKTRACE) += stacktrace.o 12obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -40,6 +40,9 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
40obj-$(CONFIG_HPET_TIMER) += hpet.o 40obj-$(CONFIG_HPET_TIMER) += hpet.o
41obj-$(CONFIG_K8_NB) += k8.o 41obj-$(CONFIG_K8_NB) += k8.o
42 42
43# Make sure this is linked after any other paravirt_ops structs: see head.S
44obj-$(CONFIG_PARAVIRT) += paravirt.o
45
43EXTRA_AFLAGS := -traditional 46EXTRA_AFLAGS := -traditional
44 47
45obj-$(CONFIG_SCx200) += scx200.o 48obj-$(CONFIG_SCx200) += scx200.o
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index ab974ff97073..c8f96cff07c6 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -70,7 +70,7 @@ static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return
70 70
71#define PREFIX "ACPI: " 71#define PREFIX "ACPI: "
72 72
73int acpi_noirq __initdata; /* skip ACPI IRQ initialization */ 73int acpi_noirq; /* skip ACPI IRQ initialization */
74int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */ 74int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */
75int acpi_ht __initdata = 1; /* enable HT */ 75int acpi_ht __initdata = 1; /* enable HT */
76 76
@@ -82,6 +82,7 @@ EXPORT_SYMBOL(acpi_strict);
82acpi_interrupt_flags acpi_sci_flags __initdata; 82acpi_interrupt_flags acpi_sci_flags __initdata;
83int acpi_sci_override_gsi __initdata; 83int acpi_sci_override_gsi __initdata;
84int acpi_skip_timer_override __initdata; 84int acpi_skip_timer_override __initdata;
85int acpi_use_timer_override __initdata;
85 86
86#ifdef CONFIG_X86_LOCAL_APIC 87#ifdef CONFIG_X86_LOCAL_APIC
87static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; 88static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
@@ -332,7 +333,7 @@ acpi_parse_ioapic(acpi_table_entry_header * header, const unsigned long end)
332/* 333/*
333 * Parse Interrupt Source Override for the ACPI SCI 334 * Parse Interrupt Source Override for the ACPI SCI
334 */ 335 */
335static void acpi_sci_ioapic_setup(u32 bus_irq, u32 gsi, u16 polarity, u16 trigger) 336static void acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
336{ 337{
337 if (trigger == 0) /* compatible SCI trigger is level */ 338 if (trigger == 0) /* compatible SCI trigger is level */
338 trigger = 3; 339 trigger = 3;
@@ -352,13 +353,13 @@ static void acpi_sci_ioapic_setup(u32 bus_irq, u32 gsi, u16 polarity, u16 trigge
352 * If GSI is < 16, this will update its flags, 353 * If GSI is < 16, this will update its flags,
353 * else it will create a new mp_irqs[] entry. 354 * else it will create a new mp_irqs[] entry.
354 */ 355 */
355 mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); 356 mp_override_legacy_irq(gsi, polarity, trigger, gsi);
356 357
357 /* 358 /*
358 * stash over-ride to indicate we've been here 359 * stash over-ride to indicate we've been here
359 * and for later update of acpi_fadt 360 * and for later update of acpi_fadt
360 */ 361 */
361 acpi_sci_override_gsi = bus_irq; 362 acpi_sci_override_gsi = gsi;
362 return; 363 return;
363} 364}
364 365
@@ -376,7 +377,7 @@ acpi_parse_int_src_ovr(acpi_table_entry_header * header,
376 acpi_table_print_madt_entry(header); 377 acpi_table_print_madt_entry(header);
377 378
378 if (intsrc->bus_irq == acpi_fadt.sci_int) { 379 if (intsrc->bus_irq == acpi_fadt.sci_int) {
379 acpi_sci_ioapic_setup(intsrc->bus_irq, intsrc->global_irq, 380 acpi_sci_ioapic_setup(intsrc->global_irq,
380 intsrc->flags.polarity, 381 intsrc->flags.polarity,
381 intsrc->flags.trigger); 382 intsrc->flags.trigger);
382 return 0; 383 return 0;
@@ -879,7 +880,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
879 * pretend we got one so we can set the SCI flags. 880 * pretend we got one so we can set the SCI flags.
880 */ 881 */
881 if (!acpi_sci_override_gsi) 882 if (!acpi_sci_override_gsi)
882 acpi_sci_ioapic_setup(acpi_fadt.sci_int, acpi_fadt.sci_int, 0, 0); 883 acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0);
883 884
884 /* Fill in identity legacy mapings where no override */ 885 /* Fill in identity legacy mapings where no override */
885 mp_config_acpi_legacy_irqs(); 886 mp_config_acpi_legacy_irqs();
@@ -1300,6 +1301,13 @@ static int __init parse_acpi_skip_timer_override(char *arg)
1300 return 0; 1301 return 0;
1301} 1302}
1302early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override); 1303early_param("acpi_skip_timer_override", parse_acpi_skip_timer_override);
1304
1305static int __init parse_acpi_use_timer_override(char *arg)
1306{
1307 acpi_use_timer_override = 1;
1308 return 0;
1309}
1310early_param("acpi_use_timer_override", parse_acpi_use_timer_override);
1303#endif /* CONFIG_X86_IO_APIC */ 1311#endif /* CONFIG_X86_IO_APIC */
1304 1312
1305static int __init setup_acpi_sci(char *s) 1313static int __init setup_acpi_sci(char *s)
diff --git a/arch/i386/kernel/acpi/cstate.c b/arch/i386/kernel/acpi/cstate.c
index 20563e52c622..12e937c1ce4b 100644
--- a/arch/i386/kernel/acpi/cstate.c
+++ b/arch/i386/kernel/acpi/cstate.c
@@ -11,6 +11,7 @@
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/acpi.h> 12#include <linux/acpi.h>
13#include <linux/cpu.h> 13#include <linux/cpu.h>
14#include <linux/sched.h>
14 15
15#include <acpi/processor.h> 16#include <acpi/processor.h>
16#include <asm/acpi.h> 17#include <asm/acpi.h>
@@ -155,10 +156,8 @@ static int __init ffh_cstate_init(void)
155 156
156static void __exit ffh_cstate_exit(void) 157static void __exit ffh_cstate_exit(void)
157{ 158{
158 if (cpu_cstate_entry) { 159 free_percpu(cpu_cstate_entry);
159 free_percpu(cpu_cstate_entry); 160 cpu_cstate_entry = NULL;
160 cpu_cstate_entry = NULL;
161 }
162} 161}
163 162
164arch_initcall(ffh_cstate_init); 163arch_initcall(ffh_cstate_init);
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index fe799b11ac0a..4b60af7f91dd 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -10,6 +10,7 @@
10#include <asm/pci-direct.h> 10#include <asm/pci-direct.h>
11#include <asm/acpi.h> 11#include <asm/acpi.h>
12#include <asm/apic.h> 12#include <asm/apic.h>
13#include <asm/irq.h>
13 14
14#ifdef CONFIG_ACPI 15#ifdef CONFIG_ACPI
15 16
@@ -27,11 +28,17 @@ static int __init check_bridge(int vendor, int device)
27#ifdef CONFIG_ACPI 28#ifdef CONFIG_ACPI
28 /* According to Nvidia all timer overrides are bogus unless HPET 29 /* According to Nvidia all timer overrides are bogus unless HPET
29 is enabled. */ 30 is enabled. */
30 if (vendor == PCI_VENDOR_ID_NVIDIA) { 31 if (!acpi_use_timer_override && vendor == PCI_VENDOR_ID_NVIDIA) {
31 nvidia_hpet_detected = 0; 32 nvidia_hpet_detected = 0;
32 acpi_table_parse(ACPI_HPET, nvidia_hpet_check); 33 acpi_table_parse(ACPI_HPET, nvidia_hpet_check);
33 if (nvidia_hpet_detected == 0) { 34 if (nvidia_hpet_detected == 0) {
34 acpi_skip_timer_override = 1; 35 acpi_skip_timer_override = 1;
36 printk(KERN_INFO "Nvidia board "
37 "detected. Ignoring ACPI "
38 "timer override.\n");
39 printk(KERN_INFO "If you got timer trouble "
40 "try acpi_use_timer_override\n");
41
35 } 42 }
36 } 43 }
37#endif 44#endif
@@ -43,6 +50,24 @@ static int __init check_bridge(int vendor, int device)
43 return 0; 50 return 0;
44} 51}
45 52
53static void check_intel(void)
54{
55 u16 vendor, device;
56
57 vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID);
58
59 if (vendor != PCI_VENDOR_ID_INTEL)
60 return;
61
62 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
63#ifdef CONFIG_SMP
64 if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
65 device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
66 device == PCI_DEVICE_ID_INTEL_E7525_MCH)
67 quirk_intel_irqbalance();
68#endif
69}
70
46void __init check_acpi_pci(void) 71void __init check_acpi_pci(void)
47{ 72{
48 int num, slot, func; 73 int num, slot, func;
@@ -54,6 +79,8 @@ void __init check_acpi_pci(void)
54 if (!early_pci_allowed()) 79 if (!early_pci_allowed())
55 return; 80 return;
56 81
82 check_intel();
83
57 /* Poor man's PCI discovery */ 84 /* Poor man's PCI discovery */
58 for (num = 0; num < 32; num++) { 85 for (num = 0; num < 32; num++) {
59 for (slot = 0; slot < 32; slot++) { 86 for (slot = 0; slot < 32; slot++) {
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 28ab80649764..9eca21b49f6b 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -1,4 +1,5 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <linux/sched.h>
2#include <linux/spinlock.h> 3#include <linux/spinlock.h>
3#include <linux/list.h> 4#include <linux/list.h>
4#include <asm/alternative.h> 5#include <asm/alternative.h>
@@ -123,6 +124,20 @@ static unsigned char** find_nop_table(void)
123 124
124#endif /* CONFIG_X86_64 */ 125#endif /* CONFIG_X86_64 */
125 126
127static void nop_out(void *insns, unsigned int len)
128{
129 unsigned char **noptable = find_nop_table();
130
131 while (len > 0) {
132 unsigned int noplen = len;
133 if (noplen > ASM_NOP_MAX)
134 noplen = ASM_NOP_MAX;
135 memcpy(insns, noptable[noplen], noplen);
136 insns += noplen;
137 len -= noplen;
138 }
139}
140
126extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 141extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
127extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[]; 142extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
128extern u8 *__smp_locks[], *__smp_locks_end[]; 143extern u8 *__smp_locks[], *__smp_locks_end[];
@@ -137,10 +152,9 @@ extern u8 __smp_alt_begin[], __smp_alt_end[];
137 152
138void apply_alternatives(struct alt_instr *start, struct alt_instr *end) 153void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
139{ 154{
140 unsigned char **noptable = find_nop_table();
141 struct alt_instr *a; 155 struct alt_instr *a;
142 u8 *instr; 156 u8 *instr;
143 int diff, i, k; 157 int diff;
144 158
145 DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end); 159 DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
146 for (a = start; a < end; a++) { 160 for (a = start; a < end; a++) {
@@ -158,13 +172,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
158#endif 172#endif
159 memcpy(instr, a->replacement, a->replacementlen); 173 memcpy(instr, a->replacement, a->replacementlen);
160 diff = a->instrlen - a->replacementlen; 174 diff = a->instrlen - a->replacementlen;
161 /* Pad the rest with nops */ 175 nop_out(instr + a->replacementlen, diff);
162 for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
163 k = diff;
164 if (k > ASM_NOP_MAX)
165 k = ASM_NOP_MAX;
166 memcpy(a->instr + i, noptable[k], k);
167 }
168 } 176 }
169} 177}
170 178
@@ -208,7 +216,6 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
208 216
209static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) 217static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
210{ 218{
211 unsigned char **noptable = find_nop_table();
212 u8 **ptr; 219 u8 **ptr;
213 220
214 for (ptr = start; ptr < end; ptr++) { 221 for (ptr = start; ptr < end; ptr++) {
@@ -216,7 +223,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end
216 continue; 223 continue;
217 if (*ptr > text_end) 224 if (*ptr > text_end)
218 continue; 225 continue;
219 **ptr = noptable[1][0]; 226 nop_out(*ptr, 1);
220 }; 227 };
221} 228}
222 229
@@ -342,8 +349,43 @@ void alternatives_smp_switch(int smp)
342 349
343#endif 350#endif
344 351
352#ifdef CONFIG_PARAVIRT
353void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
354{
355 struct paravirt_patch *p;
356
357 for (p = start; p < end; p++) {
358 unsigned int used;
359
360 used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
361 p->len);
362#ifdef CONFIG_DEBUG_PARAVIRT
363 {
364 int i;
365 /* Deliberately clobber regs using "not %reg" to find bugs. */
366 for (i = 0; i < 3; i++) {
367 if (p->len - used >= 2 && (p->clobbers & (1 << i))) {
368 memcpy(p->instr + used, "\xf7\xd0", 2);
369 p->instr[used+1] |= i;
370 used += 2;
371 }
372 }
373 }
374#endif
375 /* Pad the rest with nops */
376 nop_out(p->instr + used, p->len - used);
377 }
378
379 /* Sync to be conservative, in case we patched following instructions */
380 sync_core();
381}
382extern struct paravirt_patch __start_parainstructions[],
383 __stop_parainstructions[];
384#endif /* CONFIG_PARAVIRT */
385
345void __init alternative_instructions(void) 386void __init alternative_instructions(void)
346{ 387{
388 unsigned long flags;
347 if (no_replacement) { 389 if (no_replacement) {
348 printk(KERN_INFO "(SMP-)alternatives turned off\n"); 390 printk(KERN_INFO "(SMP-)alternatives turned off\n");
349 free_init_pages("SMP alternatives", 391 free_init_pages("SMP alternatives",
@@ -351,6 +393,8 @@ void __init alternative_instructions(void)
351 (unsigned long)__smp_alt_end); 393 (unsigned long)__smp_alt_end);
352 return; 394 return;
353 } 395 }
396
397 local_irq_save(flags);
354 apply_alternatives(__alt_instructions, __alt_instructions_end); 398 apply_alternatives(__alt_instructions, __alt_instructions_end);
355 399
356 /* switch to patch-once-at-boottime-only mode and free the 400 /* switch to patch-once-at-boottime-only mode and free the
@@ -386,4 +430,6 @@ void __init alternative_instructions(void)
386 alternatives_smp_switch(0); 430 alternatives_smp_switch(0);
387 } 431 }
388#endif 432#endif
433 apply_paravirt(__start_parainstructions, __stop_parainstructions);
434 local_irq_restore(flags);
389} 435}
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 2fd4b7d927c2..776d9be26af9 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -647,23 +647,30 @@ static struct {
647static int lapic_suspend(struct sys_device *dev, pm_message_t state) 647static int lapic_suspend(struct sys_device *dev, pm_message_t state)
648{ 648{
649 unsigned long flags; 649 unsigned long flags;
650 int maxlvt;
650 651
651 if (!apic_pm_state.active) 652 if (!apic_pm_state.active)
652 return 0; 653 return 0;
653 654
655 maxlvt = get_maxlvt();
656
654 apic_pm_state.apic_id = apic_read(APIC_ID); 657 apic_pm_state.apic_id = apic_read(APIC_ID);
655 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); 658 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
656 apic_pm_state.apic_ldr = apic_read(APIC_LDR); 659 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
657 apic_pm_state.apic_dfr = apic_read(APIC_DFR); 660 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
658 apic_pm_state.apic_spiv = apic_read(APIC_SPIV); 661 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
659 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); 662 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
660 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); 663 if (maxlvt >= 4)
664 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
661 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); 665 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
662 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); 666 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
663 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 667 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
664 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 668 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
665 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 669 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
666 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 670#ifdef CONFIG_X86_MCE_P4THERMAL
671 if (maxlvt >= 5)
672 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
673#endif
667 674
668 local_irq_save(flags); 675 local_irq_save(flags);
669 disable_local_APIC(); 676 disable_local_APIC();
@@ -675,10 +682,13 @@ static int lapic_resume(struct sys_device *dev)
675{ 682{
676 unsigned int l, h; 683 unsigned int l, h;
677 unsigned long flags; 684 unsigned long flags;
685 int maxlvt;
678 686
679 if (!apic_pm_state.active) 687 if (!apic_pm_state.active)
680 return 0; 688 return 0;
681 689
690 maxlvt = get_maxlvt();
691
682 local_irq_save(flags); 692 local_irq_save(flags);
683 693
684 /* 694 /*
@@ -700,8 +710,12 @@ static int lapic_resume(struct sys_device *dev)
700 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 710 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
701 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 711 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
702 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 712 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
703 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 713#ifdef CONFIG_X86_MCE_P4THERMAL
704 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); 714 if (maxlvt >= 5)
715 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
716#endif
717 if (maxlvt >= 4)
718 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
705 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); 719 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
706 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); 720 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
707 apic_write(APIC_TMICT, apic_pm_state.apic_tmict); 721 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index b42f2d914af3..a97847da9ed5 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -198,7 +198,7 @@
198 * (APM) BIOS Interface Specification, Revision 1.2, February 1996. 198 * (APM) BIOS Interface Specification, Revision 1.2, February 1996.
199 * 199 *
200 * [This document is available from Microsoft at: 200 * [This document is available from Microsoft at:
201 * http://www.microsoft.com/hwdev/busbios/amp_12.htm] 201 * http://www.microsoft.com/whdc/archive/amp_12.mspx]
202 */ 202 */
203 203
204#include <linux/module.h> 204#include <linux/module.h>
@@ -231,6 +231,7 @@
231#include <asm/uaccess.h> 231#include <asm/uaccess.h>
232#include <asm/desc.h> 232#include <asm/desc.h>
233#include <asm/i8253.h> 233#include <asm/i8253.h>
234#include <asm/paravirt.h>
234 235
235#include "io_ports.h" 236#include "io_ports.h"
236 237
@@ -540,11 +541,30 @@ static inline void apm_restore_cpus(cpumask_t mask)
540 * Also, we KNOW that for the non error case of apm_bios_call, there 541 * Also, we KNOW that for the non error case of apm_bios_call, there
541 * is no useful data returned in the low order 8 bits of eax. 542 * is no useful data returned in the low order 8 bits of eax.
542 */ 543 */
543#define APM_DO_CLI \ 544
544 if (apm_info.allow_ints) \ 545static inline unsigned long __apm_irq_save(void)
545 local_irq_enable(); \ 546{
546 else \ 547 unsigned long flags;
548 local_save_flags(flags);
549 if (apm_info.allow_ints) {
550 if (irqs_disabled_flags(flags))
551 local_irq_enable();
552 } else
553 local_irq_disable();
554
555 return flags;
556}
557
558#define apm_irq_save(flags) \
559 do { flags = __apm_irq_save(); } while (0)
560
561static inline void apm_irq_restore(unsigned long flags)
562{
563 if (irqs_disabled_flags(flags))
547 local_irq_disable(); 564 local_irq_disable();
565 else if (irqs_disabled())
566 local_irq_enable();
567}
548 568
549#ifdef APM_ZERO_SEGS 569#ifdef APM_ZERO_SEGS
550# define APM_DECL_SEGS \ 570# define APM_DECL_SEGS \
@@ -596,12 +616,11 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in,
596 save_desc_40 = gdt[0x40 / 8]; 616 save_desc_40 = gdt[0x40 / 8];
597 gdt[0x40 / 8] = bad_bios_desc; 617 gdt[0x40 / 8] = bad_bios_desc;
598 618
599 local_save_flags(flags); 619 apm_irq_save(flags);
600 APM_DO_CLI;
601 APM_DO_SAVE_SEGS; 620 APM_DO_SAVE_SEGS;
602 apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi); 621 apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi);
603 APM_DO_RESTORE_SEGS; 622 APM_DO_RESTORE_SEGS;
604 local_irq_restore(flags); 623 apm_irq_restore(flags);
605 gdt[0x40 / 8] = save_desc_40; 624 gdt[0x40 / 8] = save_desc_40;
606 put_cpu(); 625 put_cpu();
607 apm_restore_cpus(cpus); 626 apm_restore_cpus(cpus);
@@ -640,12 +659,11 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax)
640 save_desc_40 = gdt[0x40 / 8]; 659 save_desc_40 = gdt[0x40 / 8];
641 gdt[0x40 / 8] = bad_bios_desc; 660 gdt[0x40 / 8] = bad_bios_desc;
642 661
643 local_save_flags(flags); 662 apm_irq_save(flags);
644 APM_DO_CLI;
645 APM_DO_SAVE_SEGS; 663 APM_DO_SAVE_SEGS;
646 error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax); 664 error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax);
647 APM_DO_RESTORE_SEGS; 665 APM_DO_RESTORE_SEGS;
648 local_irq_restore(flags); 666 apm_irq_restore(flags);
649 gdt[0x40 / 8] = save_desc_40; 667 gdt[0x40 / 8] = save_desc_40;
650 put_cpu(); 668 put_cpu();
651 apm_restore_cpus(cpus); 669 apm_restore_cpus(cpus);
@@ -2218,7 +2236,7 @@ static int __init apm_init(void)
2218 2236
2219 dmi_check_system(apm_dmi_table); 2237 dmi_check_system(apm_dmi_table);
2220 2238
2221 if (apm_info.bios.version == 0) { 2239 if (apm_info.bios.version == 0 || paravirt_enabled()) {
2222 printk(KERN_INFO "apm: BIOS not found.\n"); 2240 printk(KERN_INFO "apm: BIOS not found.\n");
2223 return -ENODEV; 2241 return -ENODEV;
2224 } 2242 }
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index c80271f8f084..1b2f3cd33270 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -15,6 +15,7 @@
15#include <asm/processor.h> 15#include <asm/processor.h>
16#include <asm/thread_info.h> 16#include <asm/thread_info.h>
17#include <asm/elf.h> 17#include <asm/elf.h>
18#include <asm/pda.h>
18 19
19#define DEFINE(sym, val) \ 20#define DEFINE(sym, val) \
20 asm volatile("\n->" #sym " %0 " #val : : "i" (val)) 21 asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -51,13 +52,35 @@ void foo(void)
51 OFFSET(TI_exec_domain, thread_info, exec_domain); 52 OFFSET(TI_exec_domain, thread_info, exec_domain);
52 OFFSET(TI_flags, thread_info, flags); 53 OFFSET(TI_flags, thread_info, flags);
53 OFFSET(TI_status, thread_info, status); 54 OFFSET(TI_status, thread_info, status);
54 OFFSET(TI_cpu, thread_info, cpu);
55 OFFSET(TI_preempt_count, thread_info, preempt_count); 55 OFFSET(TI_preempt_count, thread_info, preempt_count);
56 OFFSET(TI_addr_limit, thread_info, addr_limit); 56 OFFSET(TI_addr_limit, thread_info, addr_limit);
57 OFFSET(TI_restart_block, thread_info, restart_block); 57 OFFSET(TI_restart_block, thread_info, restart_block);
58 OFFSET(TI_sysenter_return, thread_info, sysenter_return); 58 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
59 BLANK(); 59 BLANK();
60 60
61 OFFSET(GDS_size, Xgt_desc_struct, size);
62 OFFSET(GDS_address, Xgt_desc_struct, address);
63 OFFSET(GDS_pad, Xgt_desc_struct, pad);
64 BLANK();
65
66 OFFSET(PT_EBX, pt_regs, ebx);
67 OFFSET(PT_ECX, pt_regs, ecx);
68 OFFSET(PT_EDX, pt_regs, edx);
69 OFFSET(PT_ESI, pt_regs, esi);
70 OFFSET(PT_EDI, pt_regs, edi);
71 OFFSET(PT_EBP, pt_regs, ebp);
72 OFFSET(PT_EAX, pt_regs, eax);
73 OFFSET(PT_DS, pt_regs, xds);
74 OFFSET(PT_ES, pt_regs, xes);
75 OFFSET(PT_GS, pt_regs, xgs);
76 OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
77 OFFSET(PT_EIP, pt_regs, eip);
78 OFFSET(PT_CS, pt_regs, xcs);
79 OFFSET(PT_EFLAGS, pt_regs, eflags);
80 OFFSET(PT_OLDESP, pt_regs, esp);
81 OFFSET(PT_OLDSS, pt_regs, xss);
82 BLANK();
83
61 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); 84 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
62 OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); 85 OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
63 BLANK(); 86 BLANK();
@@ -74,4 +97,18 @@ void foo(void)
74 DEFINE(VDSO_PRELINK, VDSO_PRELINK); 97 DEFINE(VDSO_PRELINK, VDSO_PRELINK);
75 98
76 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); 99 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
100
101 BLANK();
102 OFFSET(PDA_cpu, i386_pda, cpu_number);
103 OFFSET(PDA_pcurrent, i386_pda, pcurrent);
104
105#ifdef CONFIG_PARAVIRT
106 BLANK();
107 OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
108 OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
109 OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
110 OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
111 OFFSET(PARAVIRT_iret, paravirt_ops, iret);
112 OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
113#endif
77} 114}
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index e4758095d87a..41cfea57232b 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -104,10 +104,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
104 f_vide(); 104 f_vide();
105 rdtscl(d2); 105 rdtscl(d2);
106 d = d2-d; 106 d = d2-d;
107 107
108 /* Knock these two lines out if it debugs out ok */
109 printk(KERN_INFO "AMD K6 stepping B detected - ");
110 /* -- cut here -- */
111 if (d > 20*K6_BUG_LOOP) 108 if (d > 20*K6_BUG_LOOP)
112 printk("system stability may be impaired when more than 32 MB are used.\n"); 109 printk("system stability may be impaired when more than 32 MB are used.\n");
113 else 110 else
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index d9f3e3c31f05..1b34c56f8123 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -18,14 +18,15 @@
18#include <asm/apic.h> 18#include <asm/apic.h>
19#include <mach_apic.h> 19#include <mach_apic.h>
20#endif 20#endif
21#include <asm/pda.h>
21 22
22#include "cpu.h" 23#include "cpu.h"
23 24
24DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); 25DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
25EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); 26EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
26 27
27DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); 28struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
28EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); 29EXPORT_SYMBOL(_cpu_pda);
29 30
30static int cachesize_override __cpuinitdata = -1; 31static int cachesize_override __cpuinitdata = -1;
31static int disable_x86_fxsr __cpuinitdata; 32static int disable_x86_fxsr __cpuinitdata;
@@ -235,29 +236,14 @@ static int __cpuinit have_cpuid_p(void)
235 return flag_is_changeable_p(X86_EFLAGS_ID); 236 return flag_is_changeable_p(X86_EFLAGS_ID);
236} 237}
237 238
238/* Do minimum CPU detection early. 239void __init cpu_detect(struct cpuinfo_x86 *c)
239 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
240 The others are not touched to avoid unwanted side effects.
241
242 WARNING: this function is only called on the BP. Don't add code here
243 that is supposed to run on all CPUs. */
244static void __init early_cpu_detect(void)
245{ 240{
246 struct cpuinfo_x86 *c = &boot_cpu_data;
247
248 c->x86_cache_alignment = 32;
249
250 if (!have_cpuid_p())
251 return;
252
253 /* Get vendor name */ 241 /* Get vendor name */
254 cpuid(0x00000000, &c->cpuid_level, 242 cpuid(0x00000000, &c->cpuid_level,
255 (int *)&c->x86_vendor_id[0], 243 (int *)&c->x86_vendor_id[0],
256 (int *)&c->x86_vendor_id[8], 244 (int *)&c->x86_vendor_id[8],
257 (int *)&c->x86_vendor_id[4]); 245 (int *)&c->x86_vendor_id[4]);
258 246
259 get_cpu_vendor(c, 1);
260
261 c->x86 = 4; 247 c->x86 = 4;
262 if (c->cpuid_level >= 0x00000001) { 248 if (c->cpuid_level >= 0x00000001) {
263 u32 junk, tfms, cap0, misc; 249 u32 junk, tfms, cap0, misc;
@@ -274,6 +260,26 @@ static void __init early_cpu_detect(void)
274 } 260 }
275} 261}
276 262
263/* Do minimum CPU detection early.
264 Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
265 The others are not touched to avoid unwanted side effects.
266
267 WARNING: this function is only called on the BP. Don't add code here
268 that is supposed to run on all CPUs. */
269static void __init early_cpu_detect(void)
270{
271 struct cpuinfo_x86 *c = &boot_cpu_data;
272
273 c->x86_cache_alignment = 32;
274
275 if (!have_cpuid_p())
276 return;
277
278 cpu_detect(c);
279
280 get_cpu_vendor(c, 1);
281}
282
277static void __cpuinit generic_identify(struct cpuinfo_x86 * c) 283static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
278{ 284{
279 u32 tfms, xlvl; 285 u32 tfms, xlvl;
@@ -308,6 +314,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
308#else 314#else
309 c->apicid = (ebx >> 24) & 0xFF; 315 c->apicid = (ebx >> 24) & 0xFF;
310#endif 316#endif
317 if (c->x86_capability[0] & (1<<19))
318 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
311 } else { 319 } else {
312 /* Have CPUID level 0 only - unheard of */ 320 /* Have CPUID level 0 only - unheard of */
313 c->x86 = 4; 321 c->x86 = 4;
@@ -372,6 +380,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
372 c->x86_vendor_id[0] = '\0'; /* Unset */ 380 c->x86_vendor_id[0] = '\0'; /* Unset */
373 c->x86_model_id[0] = '\0'; /* Unset */ 381 c->x86_model_id[0] = '\0'; /* Unset */
374 c->x86_max_cores = 1; 382 c->x86_max_cores = 1;
383 c->x86_clflush_size = 32;
375 memset(&c->x86_capability, 0, sizeof c->x86_capability); 384 memset(&c->x86_capability, 0, sizeof c->x86_capability);
376 385
377 if (!have_cpuid_p()) { 386 if (!have_cpuid_p()) {
@@ -591,42 +600,24 @@ void __init early_cpu_init(void)
591 disable_pse = 1; 600 disable_pse = 1;
592#endif 601#endif
593} 602}
594/* 603
595 * cpu_init() initializes state that is per-CPU. Some data is already 604/* Make sure %gs is initialized properly in idle threads */
596 * initialized (naturally) in the bootstrap process, such as the GDT 605struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
597 * and IDT. We reload them nevertheless, this function acts as a
598 * 'CPU state barrier', nothing should get across.
599 */
600void __cpuinit cpu_init(void)
601{ 606{
602 int cpu = smp_processor_id(); 607 memset(regs, 0, sizeof(struct pt_regs));
603 struct tss_struct * t = &per_cpu(init_tss, cpu); 608 regs->xgs = __KERNEL_PDA;
604 struct thread_struct *thread = &current->thread; 609 return regs;
605 struct desc_struct *gdt; 610}
606 __u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu);
607 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
608 611
609 if (cpu_test_and_set(cpu, cpu_initialized)) { 612static __cpuinit int alloc_gdt(int cpu)
610 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); 613{
611 for (;;) local_irq_enable(); 614 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
612 } 615 struct desc_struct *gdt;
613 printk(KERN_INFO "Initializing CPU#%d\n", cpu); 616 struct i386_pda *pda;
614 617
615 if (cpu_has_vme || cpu_has_tsc || cpu_has_de) 618 gdt = (struct desc_struct *)cpu_gdt_descr->address;
616 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 619 pda = cpu_pda(cpu);
617 if (tsc_disable && cpu_has_tsc) {
618 printk(KERN_NOTICE "Disabling TSC...\n");
619 /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
620 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
621 set_in_cr4(X86_CR4_TSD);
622 }
623 620
624 /* The CPU hotplug case */
625 if (cpu_gdt_descr->address) {
626 gdt = (struct desc_struct *)cpu_gdt_descr->address;
627 memset(gdt, 0, PAGE_SIZE);
628 goto old_gdt;
629 }
630 /* 621 /*
631 * This is a horrible hack to allocate the GDT. The problem 622 * This is a horrible hack to allocate the GDT. The problem
632 * is that cpu_init() is called really early for the boot CPU 623 * is that cpu_init() is called really early for the boot CPU
@@ -634,43 +625,130 @@ void __cpuinit cpu_init(void)
634 * CPUs, when bootmem will have gone away 625 * CPUs, when bootmem will have gone away
635 */ 626 */
636 if (NODE_DATA(0)->bdata->node_bootmem_map) { 627 if (NODE_DATA(0)->bdata->node_bootmem_map) {
637 gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE); 628 BUG_ON(gdt != NULL || pda != NULL);
638 /* alloc_bootmem_pages panics on failure, so no check */ 629
630 gdt = alloc_bootmem_pages(PAGE_SIZE);
631 pda = alloc_bootmem(sizeof(*pda));
632 /* alloc_bootmem(_pages) panics on failure, so no check */
633
639 memset(gdt, 0, PAGE_SIZE); 634 memset(gdt, 0, PAGE_SIZE);
635 memset(pda, 0, sizeof(*pda));
640 } else { 636 } else {
641 gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); 637 /* GDT and PDA might already have been allocated if
642 if (unlikely(!gdt)) { 638 this is a CPU hotplug re-insertion. */
643 printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu); 639 if (gdt == NULL)
644 for (;;) 640 gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
645 local_irq_enable(); 641
642 if (pda == NULL)
643 pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
644
645 if (unlikely(!gdt || !pda)) {
646 free_pages((unsigned long)gdt, 0);
647 kfree(pda);
648 return 0;
646 } 649 }
647 } 650 }
648old_gdt: 651
652 cpu_gdt_descr->address = (unsigned long)gdt;
653 cpu_pda(cpu) = pda;
654
655 return 1;
656}
657
658/* Initial PDA used by boot CPU */
659struct i386_pda boot_pda = {
660 ._pda = &boot_pda,
661 .cpu_number = 0,
662 .pcurrent = &init_task,
663};
664
665static inline void set_kernel_gs(void)
666{
667 /* Set %gs for this CPU's PDA. Memory clobber is to create a
668 barrier with respect to any PDA operations, so the compiler
669 doesn't move any before here. */
670 asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
671}
672
673/* Initialize the CPU's GDT and PDA. The boot CPU does this for
674 itself, but secondaries find this done for them. */
675__cpuinit int init_gdt(int cpu, struct task_struct *idle)
676{
677 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
678 struct desc_struct *gdt;
679 struct i386_pda *pda;
680
681 /* For non-boot CPUs, the GDT and PDA should already have been
682 allocated. */
683 if (!alloc_gdt(cpu)) {
684 printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
685 return 0;
686 }
687
688 gdt = (struct desc_struct *)cpu_gdt_descr->address;
689 pda = cpu_pda(cpu);
690
691 BUG_ON(gdt == NULL || pda == NULL);
692
649 /* 693 /*
650 * Initialize the per-CPU GDT with the boot GDT, 694 * Initialize the per-CPU GDT with the boot GDT,
651 * and set up the GDT descriptor: 695 * and set up the GDT descriptor:
652 */ 696 */
653 memcpy(gdt, cpu_gdt_table, GDT_SIZE); 697 memcpy(gdt, cpu_gdt_table, GDT_SIZE);
698 cpu_gdt_descr->size = GDT_SIZE - 1;
654 699
655 /* Set up GDT entry for 16bit stack */ 700 pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
656 *(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |= 701 (u32 *)&gdt[GDT_ENTRY_PDA].b,
657 ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) | 702 (unsigned long)pda, sizeof(*pda) - 1,
658 ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) | 703 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
659 (CPU_16BIT_STACK_SIZE - 1);
660 704
661 cpu_gdt_descr->size = GDT_SIZE - 1; 705 memset(pda, 0, sizeof(*pda));
662 cpu_gdt_descr->address = (unsigned long)gdt; 706 pda->_pda = pda;
707 pda->cpu_number = cpu;
708 pda->pcurrent = idle;
709
710 return 1;
711}
712
713/* Common CPU init for both boot and secondary CPUs */
714static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
715{
716 struct tss_struct * t = &per_cpu(init_tss, cpu);
717 struct thread_struct *thread = &curr->thread;
718 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
663 719
720 /* Reinit these anyway, even if they've already been done (on
721 the boot CPU, this will transition from the boot gdt+pda to
722 the real ones). */
664 load_gdt(cpu_gdt_descr); 723 load_gdt(cpu_gdt_descr);
724 set_kernel_gs();
725
726 if (cpu_test_and_set(cpu, cpu_initialized)) {
727 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
728 for (;;) local_irq_enable();
729 }
730
731 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
732
733 if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
734 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
735 if (tsc_disable && cpu_has_tsc) {
736 printk(KERN_NOTICE "Disabling TSC...\n");
737 /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
738 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
739 set_in_cr4(X86_CR4_TSD);
740 }
741
665 load_idt(&idt_descr); 742 load_idt(&idt_descr);
666 743
667 /* 744 /*
668 * Set up and load the per-CPU TSS and LDT 745 * Set up and load the per-CPU TSS and LDT
669 */ 746 */
670 atomic_inc(&init_mm.mm_count); 747 atomic_inc(&init_mm.mm_count);
671 current->active_mm = &init_mm; 748 curr->active_mm = &init_mm;
672 BUG_ON(current->mm); 749 if (curr->mm)
673 enter_lazy_tlb(&init_mm, current); 750 BUG();
751 enter_lazy_tlb(&init_mm, curr);
674 752
675 load_esp0(t, thread); 753 load_esp0(t, thread);
676 set_tss_desc(cpu,t); 754 set_tss_desc(cpu,t);
@@ -682,8 +760,8 @@ old_gdt:
682 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); 760 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
683#endif 761#endif
684 762
685 /* Clear %fs and %gs. */ 763 /* Clear %fs. */
686 asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0)); 764 asm volatile ("mov %0, %%fs" : : "r" (0));
687 765
688 /* Clear all 6 debug registers: */ 766 /* Clear all 6 debug registers: */
689 set_debugreg(0, 0); 767 set_debugreg(0, 0);
@@ -701,6 +779,37 @@ old_gdt:
701 mxcsr_feature_mask_init(); 779 mxcsr_feature_mask_init();
702} 780}
703 781
782/* Entrypoint to initialize secondary CPU */
783void __cpuinit secondary_cpu_init(void)
784{
785 int cpu = smp_processor_id();
786 struct task_struct *curr = current;
787
788 _cpu_init(cpu, curr);
789}
790
791/*
792 * cpu_init() initializes state that is per-CPU. Some data is already
793 * initialized (naturally) in the bootstrap process, such as the GDT
794 * and IDT. We reload them nevertheless, this function acts as a
795 * 'CPU state barrier', nothing should get across.
796 */
797void __cpuinit cpu_init(void)
798{
799 int cpu = smp_processor_id();
800 struct task_struct *curr = current;
801
802 /* Set up the real GDT and PDA, so we can transition from the
803 boot versions. */
804 if (!init_gdt(cpu, curr)) {
805 /* failed to allocate something; not much we can do... */
806 for (;;)
807 local_irq_enable();
808 }
809
810 _cpu_init(cpu, curr);
811}
812
704#ifdef CONFIG_HOTPLUG_CPU 813#ifdef CONFIG_HOTPLUG_CPU
705void __cpuinit cpu_uninit(void) 814void __cpuinit cpu_uninit(void)
706{ 815{
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 94a95aa5227e..56fe26584957 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -107,7 +107,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
107 * Note that the workaround only should be initialized once... 107 * Note that the workaround only should be initialized once...
108 */ 108 */
109 c->f00f_bug = 0; 109 c->f00f_bug = 0;
110 if ( c->x86 == 5 ) { 110 if (!paravirt_enabled() && c->x86 == 5) {
111 static int f00f_workaround_enabled = 0; 111 static int f00f_workaround_enabled = 0;
112 112
113 c->f00f_bug = 1; 113 c->f00f_bug = 1;
@@ -195,8 +195,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
195 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 195 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
196 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 196 (c->x86 == 0x6 && c->x86_model >= 0x0e))
197 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); 197 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
198}
199 198
199 if (cpu_has_ds) {
200 unsigned int l1;
201 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
202 if (!(l1 & (1<<11)))
203 set_bit(X86_FEATURE_BTS, c->x86_capability);
204 if (!(l1 & (1<<12)))
205 set_bit(X86_FEATURE_PEBS, c->x86_capability);
206 }
207}
200 208
201static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) 209static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
202{ 210{
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index 5c43be47587f..80b4c5d421b1 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -480,12 +480,10 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
480 if (num_cache_leaves == 0) 480 if (num_cache_leaves == 0)
481 return -ENOENT; 481 return -ENOENT;
482 482
483 cpuid4_info[cpu] = kmalloc( 483 cpuid4_info[cpu] = kzalloc(
484 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); 484 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
485 if (unlikely(cpuid4_info[cpu] == NULL)) 485 if (unlikely(cpuid4_info[cpu] == NULL))
486 return -ENOMEM; 486 return -ENOMEM;
487 memset(cpuid4_info[cpu], 0,
488 sizeof(struct _cpuid4_info) * num_cache_leaves);
489 487
490 oldmask = current->cpus_allowed; 488 oldmask = current->cpus_allowed;
491 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu)); 489 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu));
@@ -658,17 +656,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
658 return -ENOENT; 656 return -ENOENT;
659 657
660 /* Allocate all required memory */ 658 /* Allocate all required memory */
661 cache_kobject[cpu] = kmalloc(sizeof(struct kobject), GFP_KERNEL); 659 cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL);
662 if (unlikely(cache_kobject[cpu] == NULL)) 660 if (unlikely(cache_kobject[cpu] == NULL))
663 goto err_out; 661 goto err_out;
664 memset(cache_kobject[cpu], 0, sizeof(struct kobject));
665 662
666 index_kobject[cpu] = kmalloc( 663 index_kobject[cpu] = kzalloc(
667 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); 664 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL);
668 if (unlikely(index_kobject[cpu] == NULL)) 665 if (unlikely(index_kobject[cpu] == NULL))
669 goto err_out; 666 goto err_out;
670 memset(index_kobject[cpu], 0,
671 sizeof(struct _index_kobject) * num_cache_leaves);
672 667
673 return 0; 668 return 0;
674 669
diff --git a/arch/i386/kernel/cpu/mcheck/non-fatal.c b/arch/i386/kernel/cpu/mcheck/non-fatal.c
index 1f9153ae5b03..6b5d3518a1c0 100644
--- a/arch/i386/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/i386/kernel/cpu/mcheck/non-fatal.c
@@ -51,10 +51,10 @@ static void mce_checkregs (void *info)
51 } 51 }
52} 52}
53 53
54static void mce_work_fn(void *data); 54static void mce_work_fn(struct work_struct *work);
55static DECLARE_WORK(mce_work, mce_work_fn, NULL); 55static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
56 56
57static void mce_work_fn(void *data) 57static void mce_work_fn(struct work_struct *work)
58{ 58{
59 on_each_cpu(mce_checkregs, NULL, 1, 1); 59 on_each_cpu(mce_checkregs, NULL, 1, 1);
60 schedule_delayed_work(&mce_work, MCE_RATE); 60 schedule_delayed_work(&mce_work, MCE_RATE);
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
index 2d8703b7ce65..065005c3f168 100644
--- a/arch/i386/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -20,6 +20,7 @@
20#include <linux/cpu.h> 20#include <linux/cpu.h>
21#include <asm/cpu.h> 21#include <asm/cpu.h>
22#include <linux/notifier.h> 22#include <linux/notifier.h>
23#include <linux/jiffies.h>
23#include <asm/therm_throt.h> 24#include <asm/therm_throt.h>
24 25
25/* How long to wait between reporting thermal events */ 26/* How long to wait between reporting thermal events */
@@ -115,7 +116,6 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
115 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); 116 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
116} 117}
117 118
118#ifdef CONFIG_HOTPLUG_CPU
119static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 119static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
120{ 120{
121 return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 121 return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
@@ -152,7 +152,6 @@ static struct notifier_block thermal_throttle_cpu_notifier =
152{ 152{
153 .notifier_call = thermal_throttle_cpu_callback, 153 .notifier_call = thermal_throttle_cpu_callback,
154}; 154};
155#endif /* CONFIG_HOTPLUG_CPU */
156 155
157static __init int thermal_throttle_init_device(void) 156static __init int thermal_throttle_init_device(void)
158{ 157{
diff --git a/arch/i386/kernel/cpu/mtrr/Makefile b/arch/i386/kernel/cpu/mtrr/Makefile
index a25b701ab84e..191fc0533649 100644
--- a/arch/i386/kernel/cpu/mtrr/Makefile
+++ b/arch/i386/kernel/cpu/mtrr/Makefile
@@ -1,5 +1,3 @@
1obj-y := main.o if.o generic.o state.o 1obj-y := main.o if.o generic.o state.o
2obj-y += amd.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3obj-y += cyrix.o
4obj-y += centaur.o
5 3
diff --git a/arch/i386/kernel/cpu/mtrr/amd.c b/arch/i386/kernel/cpu/mtrr/amd.c
index 1a1e04b6fd00..0949cdbf848a 100644
--- a/arch/i386/kernel/cpu/mtrr/amd.c
+++ b/arch/i386/kernel/cpu/mtrr/amd.c
@@ -7,7 +7,7 @@
7 7
8static void 8static void
9amd_get_mtrr(unsigned int reg, unsigned long *base, 9amd_get_mtrr(unsigned int reg, unsigned long *base,
10 unsigned int *size, mtrr_type * type) 10 unsigned long *size, mtrr_type * type)
11{ 11{
12 unsigned long low, high; 12 unsigned long low, high;
13 13
diff --git a/arch/i386/kernel/cpu/mtrr/centaur.c b/arch/i386/kernel/cpu/mtrr/centaur.c
index 33f00ac314ef..cb9aa3a7a7ab 100644
--- a/arch/i386/kernel/cpu/mtrr/centaur.c
+++ b/arch/i386/kernel/cpu/mtrr/centaur.c
@@ -17,7 +17,7 @@ static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
17 */ 17 */
18 18
19static int 19static int
20centaur_get_free_region(unsigned long base, unsigned long size) 20centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
21/* [SUMMARY] Get a free MTRR. 21/* [SUMMARY] Get a free MTRR.
22 <base> The starting (base) address of the region. 22 <base> The starting (base) address of the region.
23 <size> The size (in bytes) of the region. 23 <size> The size (in bytes) of the region.
@@ -26,10 +26,11 @@ centaur_get_free_region(unsigned long base, unsigned long size)
26{ 26{
27 int i, max; 27 int i, max;
28 mtrr_type ltype; 28 mtrr_type ltype;
29 unsigned long lbase; 29 unsigned long lbase, lsize;
30 unsigned int lsize;
31 30
32 max = num_var_ranges; 31 max = num_var_ranges;
32 if (replace_reg >= 0 && replace_reg < max)
33 return replace_reg;
33 for (i = 0; i < max; ++i) { 34 for (i = 0; i < max; ++i) {
34 if (centaur_mcr_reserved & (1 << i)) 35 if (centaur_mcr_reserved & (1 << i))
35 continue; 36 continue;
@@ -49,7 +50,7 @@ mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
49 50
50static void 51static void
51centaur_get_mcr(unsigned int reg, unsigned long *base, 52centaur_get_mcr(unsigned int reg, unsigned long *base,
52 unsigned int *size, mtrr_type * type) 53 unsigned long *size, mtrr_type * type)
53{ 54{
54 *base = centaur_mcr[reg].high >> PAGE_SHIFT; 55 *base = centaur_mcr[reg].high >> PAGE_SHIFT;
55 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; 56 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c
index 9027a987006b..0737a596db43 100644
--- a/arch/i386/kernel/cpu/mtrr/cyrix.c
+++ b/arch/i386/kernel/cpu/mtrr/cyrix.c
@@ -9,7 +9,7 @@ int arr3_protected;
9 9
10static void 10static void
11cyrix_get_arr(unsigned int reg, unsigned long *base, 11cyrix_get_arr(unsigned int reg, unsigned long *base,
12 unsigned int *size, mtrr_type * type) 12 unsigned long *size, mtrr_type * type)
13{ 13{
14 unsigned long flags; 14 unsigned long flags;
15 unsigned char arr, ccr3, rcr, shift; 15 unsigned char arr, ccr3, rcr, shift;
@@ -77,7 +77,7 @@ cyrix_get_arr(unsigned int reg, unsigned long *base,
77} 77}
78 78
79static int 79static int
80cyrix_get_free_region(unsigned long base, unsigned long size) 80cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
81/* [SUMMARY] Get a free ARR. 81/* [SUMMARY] Get a free ARR.
82 <base> The starting (base) address of the region. 82 <base> The starting (base) address of the region.
83 <size> The size (in bytes) of the region. 83 <size> The size (in bytes) of the region.
@@ -86,9 +86,24 @@ cyrix_get_free_region(unsigned long base, unsigned long size)
86{ 86{
87 int i; 87 int i;
88 mtrr_type ltype; 88 mtrr_type ltype;
89 unsigned long lbase; 89 unsigned long lbase, lsize;
90 unsigned int lsize;
91 90
91 switch (replace_reg) {
92 case 7:
93 if (size < 0x40)
94 break;
95 case 6:
96 case 5:
97 case 4:
98 return replace_reg;
99 case 3:
100 if (arr3_protected)
101 break;
102 case 2:
103 case 1:
104 case 0:
105 return replace_reg;
106 }
92 /* If we are to set up a region >32M then look at ARR7 immediately */ 107 /* If we are to set up a region >32M then look at ARR7 immediately */
93 if (size > 0x2000) { 108 if (size > 0x2000) {
94 cyrix_get_arr(7, &lbase, &lsize, &ltype); 109 cyrix_get_arr(7, &lbase, &lsize, &ltype);
@@ -214,7 +229,7 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
214 229
215typedef struct { 230typedef struct {
216 unsigned long base; 231 unsigned long base;
217 unsigned int size; 232 unsigned long size;
218 mtrr_type type; 233 mtrr_type type;
219} arr_state_t; 234} arr_state_t;
220 235
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index 0b61eed8bbd8..f77fc53db654 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -3,6 +3,7 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/module.h>
6#include <asm/io.h> 7#include <asm/io.h>
7#include <asm/mtrr.h> 8#include <asm/mtrr.h>
8#include <asm/msr.h> 9#include <asm/msr.h>
@@ -15,12 +16,19 @@ struct mtrr_state {
15 struct mtrr_var_range *var_ranges; 16 struct mtrr_var_range *var_ranges;
16 mtrr_type fixed_ranges[NUM_FIXED_RANGES]; 17 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
17 unsigned char enabled; 18 unsigned char enabled;
19 unsigned char have_fixed;
18 mtrr_type def_type; 20 mtrr_type def_type;
19}; 21};
20 22
21static unsigned long smp_changes_mask; 23static unsigned long smp_changes_mask;
22static struct mtrr_state mtrr_state = {}; 24static struct mtrr_state mtrr_state = {};
23 25
26#undef MODULE_PARAM_PREFIX
27#define MODULE_PARAM_PREFIX "mtrr."
28
29static __initdata int mtrr_show;
30module_param_named(show, mtrr_show, bool, 0);
31
24/* Get the MSR pair relating to a var range */ 32/* Get the MSR pair relating to a var range */
25static void __init 33static void __init
26get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) 34get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
@@ -43,6 +51,14 @@ get_fixed_ranges(mtrr_type * frs)
43 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); 51 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
44} 52}
45 53
54static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types)
55{
56 unsigned i;
57
58 for (i = 0; i < 8; ++i, ++types, base += step)
59 printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types));
60}
61
46/* Grab all of the MTRR state for this CPU into *state */ 62/* Grab all of the MTRR state for this CPU into *state */
47void __init get_mtrr_state(void) 63void __init get_mtrr_state(void)
48{ 64{
@@ -58,13 +74,49 @@ void __init get_mtrr_state(void)
58 } 74 }
59 vrs = mtrr_state.var_ranges; 75 vrs = mtrr_state.var_ranges;
60 76
77 rdmsr(MTRRcap_MSR, lo, dummy);
78 mtrr_state.have_fixed = (lo >> 8) & 1;
79
61 for (i = 0; i < num_var_ranges; i++) 80 for (i = 0; i < num_var_ranges; i++)
62 get_mtrr_var_range(i, &vrs[i]); 81 get_mtrr_var_range(i, &vrs[i]);
63 get_fixed_ranges(mtrr_state.fixed_ranges); 82 if (mtrr_state.have_fixed)
83 get_fixed_ranges(mtrr_state.fixed_ranges);
64 84
65 rdmsr(MTRRdefType_MSR, lo, dummy); 85 rdmsr(MTRRdefType_MSR, lo, dummy);
66 mtrr_state.def_type = (lo & 0xff); 86 mtrr_state.def_type = (lo & 0xff);
67 mtrr_state.enabled = (lo & 0xc00) >> 10; 87 mtrr_state.enabled = (lo & 0xc00) >> 10;
88
89 if (mtrr_show) {
90 int high_width;
91
92 printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
93 if (mtrr_state.have_fixed) {
94 printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
95 mtrr_state.enabled & 1 ? "en" : "dis");
96 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
97 for (i = 0; i < 2; ++i)
98 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
99 for (i = 0; i < 8; ++i)
100 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
101 }
102 printk(KERN_INFO "MTRR variable ranges %sabled:\n",
103 mtrr_state.enabled & 2 ? "en" : "dis");
104 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
105 for (i = 0; i < num_var_ranges; ++i) {
106 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
107 printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
108 i,
109 high_width,
110 mtrr_state.var_ranges[i].base_hi,
111 mtrr_state.var_ranges[i].base_lo >> 12,
112 high_width,
113 mtrr_state.var_ranges[i].mask_hi,
114 mtrr_state.var_ranges[i].mask_lo >> 12,
115 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
116 else
117 printk(KERN_INFO "MTRR %u disabled\n", i);
118 }
119 }
68} 120}
69 121
70/* Some BIOS's are fucked and don't set all MTRRs the same! */ 122/* Some BIOS's are fucked and don't set all MTRRs the same! */
@@ -95,7 +147,7 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
95 smp_processor_id(), msr, a, b); 147 smp_processor_id(), msr, a, b);
96} 148}
97 149
98int generic_get_free_region(unsigned long base, unsigned long size) 150int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
99/* [SUMMARY] Get a free MTRR. 151/* [SUMMARY] Get a free MTRR.
100 <base> The starting (base) address of the region. 152 <base> The starting (base) address of the region.
101 <size> The size (in bytes) of the region. 153 <size> The size (in bytes) of the region.
@@ -104,10 +156,11 @@ int generic_get_free_region(unsigned long base, unsigned long size)
104{ 156{
105 int i, max; 157 int i, max;
106 mtrr_type ltype; 158 mtrr_type ltype;
107 unsigned long lbase; 159 unsigned long lbase, lsize;
108 unsigned lsize;
109 160
110 max = num_var_ranges; 161 max = num_var_ranges;
162 if (replace_reg >= 0 && replace_reg < max)
163 return replace_reg;
111 for (i = 0; i < max; ++i) { 164 for (i = 0; i < max; ++i) {
112 mtrr_if->get(i, &lbase, &lsize, &ltype); 165 mtrr_if->get(i, &lbase, &lsize, &ltype);
113 if (lsize == 0) 166 if (lsize == 0)
@@ -117,7 +170,7 @@ int generic_get_free_region(unsigned long base, unsigned long size)
117} 170}
118 171
119static void generic_get_mtrr(unsigned int reg, unsigned long *base, 172static void generic_get_mtrr(unsigned int reg, unsigned long *base,
120 unsigned int *size, mtrr_type * type) 173 unsigned long *size, mtrr_type *type)
121{ 174{
122 unsigned int mask_lo, mask_hi, base_lo, base_hi; 175 unsigned int mask_lo, mask_hi, base_lo, base_hi;
123 176
@@ -202,7 +255,9 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
202 return changed; 255 return changed;
203} 256}
204 257
205static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi) 258static u32 deftype_lo, deftype_hi;
259
260static unsigned long set_mtrr_state(void)
206/* [SUMMARY] Set the MTRR state for this CPU. 261/* [SUMMARY] Set the MTRR state for this CPU.
207 <state> The MTRR state information to read. 262 <state> The MTRR state information to read.
208 <ctxt> Some relevant CPU context. 263 <ctxt> Some relevant CPU context.
@@ -217,14 +272,14 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
217 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) 272 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
218 change_mask |= MTRR_CHANGE_MASK_VARIABLE; 273 change_mask |= MTRR_CHANGE_MASK_VARIABLE;
219 274
220 if (set_fixed_ranges(mtrr_state.fixed_ranges)) 275 if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
221 change_mask |= MTRR_CHANGE_MASK_FIXED; 276 change_mask |= MTRR_CHANGE_MASK_FIXED;
222 277
223 /* Set_mtrr_restore restores the old value of MTRRdefType, 278 /* Set_mtrr_restore restores the old value of MTRRdefType,
224 so to set it we fiddle with the saved value */ 279 so to set it we fiddle with the saved value */
225 if ((deftype_lo & 0xff) != mtrr_state.def_type 280 if ((deftype_lo & 0xff) != mtrr_state.def_type
226 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { 281 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
227 deftype_lo |= (mtrr_state.def_type | mtrr_state.enabled << 10); 282 deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10);
228 change_mask |= MTRR_CHANGE_MASK_DEFTYPE; 283 change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
229 } 284 }
230 285
@@ -233,7 +288,6 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
233 288
234 289
235static unsigned long cr4 = 0; 290static unsigned long cr4 = 0;
236static u32 deftype_lo, deftype_hi;
237static DEFINE_SPINLOCK(set_atomicity_lock); 291static DEFINE_SPINLOCK(set_atomicity_lock);
238 292
239/* 293/*
@@ -271,7 +325,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
271 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 325 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
272 326
273 /* Disable MTRRs, and set the default type to uncached */ 327 /* Disable MTRRs, and set the default type to uncached */
274 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi); 328 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
275} 329}
276 330
277static void post_set(void) __releases(set_atomicity_lock) 331static void post_set(void) __releases(set_atomicity_lock)
@@ -300,7 +354,7 @@ static void generic_set_all(void)
300 prepare_set(); 354 prepare_set();
301 355
302 /* Actually set the state */ 356 /* Actually set the state */
303 mask = set_mtrr_state(deftype_lo,deftype_hi); 357 mask = set_mtrr_state();
304 358
305 post_set(); 359 post_set();
306 local_irq_restore(flags); 360 local_irq_restore(flags);
@@ -366,7 +420,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
366 printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); 420 printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
367 return -EINVAL; 421 return -EINVAL;
368 } 422 }
369 if (!(base + size < 0x70000000 || base > 0x7003FFFF) && 423 if (!(base + size < 0x70000 || base > 0x7003F) &&
370 (type == MTRR_TYPE_WRCOMB 424 (type == MTRR_TYPE_WRCOMB
371 || type == MTRR_TYPE_WRBACK)) { 425 || type == MTRR_TYPE_WRBACK)) {
372 printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); 426 printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
index 5ac051bb9d55..5ae1705eafa6 100644
--- a/arch/i386/kernel/cpu/mtrr/if.c
+++ b/arch/i386/kernel/cpu/mtrr/if.c
@@ -17,7 +17,7 @@ extern unsigned int *usage_table;
17 17
18#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) 18#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
19 19
20static char *mtrr_strings[MTRR_NUM_TYPES] = 20static const char *const mtrr_strings[MTRR_NUM_TYPES] =
21{ 21{
22 "uncachable", /* 0 */ 22 "uncachable", /* 0 */
23 "write-combining", /* 1 */ 23 "write-combining", /* 1 */
@@ -28,7 +28,7 @@ static char *mtrr_strings[MTRR_NUM_TYPES] =
28 "write-back", /* 6 */ 28 "write-back", /* 6 */
29}; 29};
30 30
31char *mtrr_attrib_to_str(int x) 31const char *mtrr_attrib_to_str(int x)
32{ 32{
33 return (x <= 6) ? mtrr_strings[x] : "?"; 33 return (x <= 6) ? mtrr_strings[x] : "?";
34} 34}
@@ -44,10 +44,9 @@ mtrr_file_add(unsigned long base, unsigned long size,
44 44
45 max = num_var_ranges; 45 max = num_var_ranges;
46 if (fcount == NULL) { 46 if (fcount == NULL) {
47 fcount = kmalloc(max * sizeof *fcount, GFP_KERNEL); 47 fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL);
48 if (!fcount) 48 if (!fcount)
49 return -ENOMEM; 49 return -ENOMEM;
50 memset(fcount, 0, max * sizeof *fcount);
51 FILE_FCOUNT(file) = fcount; 50 FILE_FCOUNT(file) = fcount;
52 } 51 }
53 if (!page) { 52 if (!page) {
@@ -155,6 +154,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
155{ 154{
156 int err = 0; 155 int err = 0;
157 mtrr_type type; 156 mtrr_type type;
157 unsigned long size;
158 struct mtrr_sentry sentry; 158 struct mtrr_sentry sentry;
159 struct mtrr_gentry gentry; 159 struct mtrr_gentry gentry;
160 void __user *arg = (void __user *) __arg; 160 void __user *arg = (void __user *) __arg;
@@ -235,15 +235,15 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
235 case MTRRIOC_GET_ENTRY: 235 case MTRRIOC_GET_ENTRY:
236 if (gentry.regnum >= num_var_ranges) 236 if (gentry.regnum >= num_var_ranges)
237 return -EINVAL; 237 return -EINVAL;
238 mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); 238 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
239 239
240 /* Hide entries that go above 4GB */ 240 /* Hide entries that go above 4GB */
241 if (gentry.base + gentry.size > 0x100000 241 if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
242 || gentry.size == 0x100000) 242 || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
243 gentry.base = gentry.size = gentry.type = 0; 243 gentry.base = gentry.size = gentry.type = 0;
244 else { 244 else {
245 gentry.base <<= PAGE_SHIFT; 245 gentry.base <<= PAGE_SHIFT;
246 gentry.size <<= PAGE_SHIFT; 246 gentry.size = size << PAGE_SHIFT;
247 gentry.type = type; 247 gentry.type = type;
248 } 248 }
249 249
@@ -273,8 +273,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
273 case MTRRIOC_GET_PAGE_ENTRY: 273 case MTRRIOC_GET_PAGE_ENTRY:
274 if (gentry.regnum >= num_var_ranges) 274 if (gentry.regnum >= num_var_ranges)
275 return -EINVAL; 275 return -EINVAL;
276 mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); 276 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
277 gentry.type = type; 277 /* Hide entries that would overflow */
278 if (size != (__typeof__(gentry.size))size)
279 gentry.base = gentry.size = gentry.type = 0;
280 else {
281 gentry.size = size;
282 gentry.type = type;
283 }
278 break; 284 break;
279 } 285 }
280 286
@@ -353,8 +359,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
353 char factor; 359 char factor;
354 int i, max, len; 360 int i, max, len;
355 mtrr_type type; 361 mtrr_type type;
356 unsigned long base; 362 unsigned long base, size;
357 unsigned int size;
358 363
359 len = 0; 364 len = 0;
360 max = num_var_ranges; 365 max = num_var_ranges;
@@ -373,7 +378,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
373 } 378 }
374 /* RED-PEN: base can be > 32bit */ 379 /* RED-PEN: base can be > 32bit */
375 len += seq_printf(seq, 380 len += seq_printf(seq,
376 "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n", 381 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
377 i, base, base >> (20 - PAGE_SHIFT), size, factor, 382 i, base, base >> (20 - PAGE_SHIFT), size, factor,
378 mtrr_attrib_to_str(type), usage_table[i]); 383 mtrr_attrib_to_str(type), usage_table[i]);
379 } 384 }
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index fff90bda4733..16bb7ea87145 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -59,7 +59,11 @@ struct mtrr_ops * mtrr_if = NULL;
59static void set_mtrr(unsigned int reg, unsigned long base, 59static void set_mtrr(unsigned int reg, unsigned long base,
60 unsigned long size, mtrr_type type); 60 unsigned long size, mtrr_type type);
61 61
62#ifndef CONFIG_X86_64
62extern int arr3_protected; 63extern int arr3_protected;
64#else
65#define arr3_protected 0
66#endif
63 67
64void set_mtrr_ops(struct mtrr_ops * ops) 68void set_mtrr_ops(struct mtrr_ops * ops)
65{ 69{
@@ -168,6 +172,13 @@ static void ipi_handler(void *info)
168 172
169#endif 173#endif
170 174
175static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
176 return type1 == MTRR_TYPE_UNCACHABLE ||
177 type2 == MTRR_TYPE_UNCACHABLE ||
178 (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
179 (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH);
180}
181
171/** 182/**
172 * set_mtrr - update mtrrs on all processors 183 * set_mtrr - update mtrrs on all processors
173 * @reg: mtrr in question 184 * @reg: mtrr in question
@@ -263,8 +274,8 @@ static void set_mtrr(unsigned int reg, unsigned long base,
263 274
264/** 275/**
265 * mtrr_add_page - Add a memory type region 276 * mtrr_add_page - Add a memory type region
266 * @base: Physical base address of region in pages (4 KB) 277 * @base: Physical base address of region in pages (in units of 4 kB!)
267 * @size: Physical size of region in pages (4 KB) 278 * @size: Physical size of region in pages (4 kB)
268 * @type: Type of MTRR desired 279 * @type: Type of MTRR desired
269 * @increment: If this is true do usage counting on the region 280 * @increment: If this is true do usage counting on the region
270 * 281 *
@@ -300,11 +311,9 @@ static void set_mtrr(unsigned int reg, unsigned long base,
300int mtrr_add_page(unsigned long base, unsigned long size, 311int mtrr_add_page(unsigned long base, unsigned long size,
301 unsigned int type, char increment) 312 unsigned int type, char increment)
302{ 313{
303 int i; 314 int i, replace, error;
304 mtrr_type ltype; 315 mtrr_type ltype;
305 unsigned long lbase; 316 unsigned long lbase, lsize;
306 unsigned int lsize;
307 int error;
308 317
309 if (!mtrr_if) 318 if (!mtrr_if)
310 return -ENXIO; 319 return -ENXIO;
@@ -324,12 +333,18 @@ int mtrr_add_page(unsigned long base, unsigned long size,
324 return -ENOSYS; 333 return -ENOSYS;
325 } 334 }
326 335
336 if (!size) {
337 printk(KERN_WARNING "mtrr: zero sized request\n");
338 return -EINVAL;
339 }
340
327 if (base & size_or_mask || size & size_or_mask) { 341 if (base & size_or_mask || size & size_or_mask) {
328 printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); 342 printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n");
329 return -EINVAL; 343 return -EINVAL;
330 } 344 }
331 345
332 error = -EINVAL; 346 error = -EINVAL;
347 replace = -1;
333 348
334 /* No CPU hotplug when we change MTRR entries */ 349 /* No CPU hotplug when we change MTRR entries */
335 lock_cpu_hotplug(); 350 lock_cpu_hotplug();
@@ -337,21 +352,28 @@ int mtrr_add_page(unsigned long base, unsigned long size,
337 mutex_lock(&mtrr_mutex); 352 mutex_lock(&mtrr_mutex);
338 for (i = 0; i < num_var_ranges; ++i) { 353 for (i = 0; i < num_var_ranges; ++i) {
339 mtrr_if->get(i, &lbase, &lsize, &ltype); 354 mtrr_if->get(i, &lbase, &lsize, &ltype);
340 if (base >= lbase + lsize) 355 if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase)
341 continue;
342 if ((base < lbase) && (base + size <= lbase))
343 continue; 356 continue;
344 /* At this point we know there is some kind of overlap/enclosure */ 357 /* At this point we know there is some kind of overlap/enclosure */
345 if ((base < lbase) || (base + size > lbase + lsize)) { 358 if (base < lbase || base + size - 1 > lbase + lsize - 1) {
359 if (base <= lbase && base + size - 1 >= lbase + lsize - 1) {
360 /* New region encloses an existing region */
361 if (type == ltype) {
362 replace = replace == -1 ? i : -2;
363 continue;
364 }
365 else if (types_compatible(type, ltype))
366 continue;
367 }
346 printk(KERN_WARNING 368 printk(KERN_WARNING
347 "mtrr: 0x%lx000,0x%lx000 overlaps existing" 369 "mtrr: 0x%lx000,0x%lx000 overlaps existing"
348 " 0x%lx000,0x%x000\n", base, size, lbase, 370 " 0x%lx000,0x%lx000\n", base, size, lbase,
349 lsize); 371 lsize);
350 goto out; 372 goto out;
351 } 373 }
352 /* New region is enclosed by an existing region */ 374 /* New region is enclosed by an existing region */
353 if (ltype != type) { 375 if (ltype != type) {
354 if (type == MTRR_TYPE_UNCACHABLE) 376 if (types_compatible(type, ltype))
355 continue; 377 continue;
356 printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", 378 printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
357 base, size, mtrr_attrib_to_str(ltype), 379 base, size, mtrr_attrib_to_str(ltype),
@@ -364,10 +386,18 @@ int mtrr_add_page(unsigned long base, unsigned long size,
364 goto out; 386 goto out;
365 } 387 }
366 /* Search for an empty MTRR */ 388 /* Search for an empty MTRR */
367 i = mtrr_if->get_free_region(base, size); 389 i = mtrr_if->get_free_region(base, size, replace);
368 if (i >= 0) { 390 if (i >= 0) {
369 set_mtrr(i, base, size, type); 391 set_mtrr(i, base, size, type);
370 usage_table[i] = 1; 392 if (likely(replace < 0))
393 usage_table[i] = 1;
394 else {
395 usage_table[i] = usage_table[replace] + !!increment;
396 if (unlikely(replace != i)) {
397 set_mtrr(replace, 0, 0, 0);
398 usage_table[replace] = 0;
399 }
400 }
371 } else 401 } else
372 printk(KERN_INFO "mtrr: no more MTRRs available\n"); 402 printk(KERN_INFO "mtrr: no more MTRRs available\n");
373 error = i; 403 error = i;
@@ -455,8 +485,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
455{ 485{
456 int i, max; 486 int i, max;
457 mtrr_type ltype; 487 mtrr_type ltype;
458 unsigned long lbase; 488 unsigned long lbase, lsize;
459 unsigned int lsize;
460 int error = -EINVAL; 489 int error = -EINVAL;
461 490
462 if (!mtrr_if) 491 if (!mtrr_if)
@@ -544,9 +573,11 @@ extern void centaur_init_mtrr(void);
544 573
545static void __init init_ifs(void) 574static void __init init_ifs(void)
546{ 575{
576#ifndef CONFIG_X86_64
547 amd_init_mtrr(); 577 amd_init_mtrr();
548 cyrix_init_mtrr(); 578 cyrix_init_mtrr();
549 centaur_init_mtrr(); 579 centaur_init_mtrr();
580#endif
550} 581}
551 582
552/* The suspend/resume methods are only for CPU without MTRR. CPU using generic 583/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
@@ -555,7 +586,7 @@ static void __init init_ifs(void)
555struct mtrr_value { 586struct mtrr_value {
556 mtrr_type ltype; 587 mtrr_type ltype;
557 unsigned long lbase; 588 unsigned long lbase;
558 unsigned int lsize; 589 unsigned long lsize;
559}; 590};
560 591
561static struct mtrr_value * mtrr_state; 592static struct mtrr_value * mtrr_state;
@@ -565,10 +596,8 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
565 int i; 596 int i;
566 int size = num_var_ranges * sizeof(struct mtrr_value); 597 int size = num_var_ranges * sizeof(struct mtrr_value);
567 598
568 mtrr_state = kmalloc(size,GFP_ATOMIC); 599 mtrr_state = kzalloc(size,GFP_ATOMIC);
569 if (mtrr_state) 600 if (!mtrr_state)
570 memset(mtrr_state,0,size);
571 else
572 return -ENOMEM; 601 return -ENOMEM;
573 602
574 for (i = 0; i < num_var_ranges; i++) { 603 for (i = 0; i < num_var_ranges; i++) {
diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
index 99c9f2682041..d61ea9db6cfe 100644
--- a/arch/i386/kernel/cpu/mtrr/mtrr.h
+++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -43,15 +43,16 @@ struct mtrr_ops {
43 void (*set_all)(void); 43 void (*set_all)(void);
44 44
45 void (*get)(unsigned int reg, unsigned long *base, 45 void (*get)(unsigned int reg, unsigned long *base,
46 unsigned int *size, mtrr_type * type); 46 unsigned long *size, mtrr_type * type);
47 int (*get_free_region) (unsigned long base, unsigned long size); 47 int (*get_free_region)(unsigned long base, unsigned long size,
48 48 int replace_reg);
49 int (*validate_add_page)(unsigned long base, unsigned long size, 49 int (*validate_add_page)(unsigned long base, unsigned long size,
50 unsigned int type); 50 unsigned int type);
51 int (*have_wrcomb)(void); 51 int (*have_wrcomb)(void);
52}; 52};
53 53
54extern int generic_get_free_region(unsigned long base, unsigned long size); 54extern int generic_get_free_region(unsigned long base, unsigned long size,
55 int replace_reg);
55extern int generic_validate_add_page(unsigned long base, unsigned long size, 56extern int generic_validate_add_page(unsigned long base, unsigned long size,
56 unsigned int type); 57 unsigned int type);
57 58
@@ -62,17 +63,17 @@ extern int positive_have_wrcomb(void);
62/* library functions for processor-specific routines */ 63/* library functions for processor-specific routines */
63struct set_mtrr_context { 64struct set_mtrr_context {
64 unsigned long flags; 65 unsigned long flags;
65 unsigned long deftype_lo;
66 unsigned long deftype_hi;
67 unsigned long cr4val; 66 unsigned long cr4val;
68 unsigned long ccr3; 67 u32 deftype_lo;
68 u32 deftype_hi;
69 u32 ccr3;
69}; 70};
70 71
71struct mtrr_var_range { 72struct mtrr_var_range {
72 unsigned long base_lo; 73 u32 base_lo;
73 unsigned long base_hi; 74 u32 base_hi;
74 unsigned long mask_lo; 75 u32 mask_lo;
75 unsigned long mask_hi; 76 u32 mask_hi;
76}; 77};
77 78
78void set_mtrr_done(struct set_mtrr_context *ctxt); 79void set_mtrr_done(struct set_mtrr_context *ctxt);
@@ -92,6 +93,6 @@ extern struct mtrr_ops * mtrr_if;
92extern unsigned int num_var_ranges; 93extern unsigned int num_var_ranges;
93 94
94void mtrr_state_warn(void); 95void mtrr_state_warn(void);
95char *mtrr_attrib_to_str(int x); 96const char *mtrr_attrib_to_str(int x);
96void mtrr_wrmsr(unsigned, unsigned, unsigned); 97void mtrr_wrmsr(unsigned, unsigned, unsigned);
97 98
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 76aac088a323..6624d8583c42 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -152,9 +152,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
152 seq_printf(m, " [%d]", i); 152 seq_printf(m, " [%d]", i);
153 } 153 }
154 154
155 seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n", 155 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
156 c->loops_per_jiffy/(500000/HZ), 156 c->loops_per_jiffy/(500000/HZ),
157 (c->loops_per_jiffy/(5000/HZ)) % 100); 157 (c->loops_per_jiffy/(5000/HZ)) % 100);
158 seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size);
158 159
159 return 0; 160 return 0;
160} 161}
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index fde8bea85cee..51130b39cd2e 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -34,7 +34,6 @@
34#include <linux/major.h> 34#include <linux/major.h>
35#include <linux/fs.h> 35#include <linux/fs.h>
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/fs.h>
38#include <linux/device.h> 37#include <linux/device.h>
39#include <linux/cpu.h> 38#include <linux/cpu.h>
40#include <linux/notifier.h> 39#include <linux/notifier.h>
@@ -117,7 +116,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
117 char __user *tmp = buf; 116 char __user *tmp = buf;
118 u32 data[4]; 117 u32 data[4];
119 u32 reg = *ppos; 118 u32 reg = *ppos;
120 int cpu = iminor(file->f_dentry->d_inode); 119 int cpu = iminor(file->f_path.dentry->d_inode);
121 120
122 if (count % 16) 121 if (count % 16)
123 return -EINVAL; /* Invalid chunk size */ 122 return -EINVAL; /* Invalid chunk size */
@@ -135,7 +134,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
135 134
136static int cpuid_open(struct inode *inode, struct file *file) 135static int cpuid_open(struct inode *inode, struct file *file)
137{ 136{
138 unsigned int cpu = iminor(file->f_dentry->d_inode); 137 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
139 struct cpuinfo_x86 *c = &(cpu_data)[cpu]; 138 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
140 139
141 if (cpu >= NR_CPUS || !cpu_online(cpu)) 140 if (cpu >= NR_CPUS || !cpu_online(cpu))
@@ -156,28 +155,27 @@ static struct file_operations cpuid_fops = {
156 .open = cpuid_open, 155 .open = cpuid_open,
157}; 156};
158 157
159static int cpuid_class_device_create(int i) 158static int cpuid_device_create(int i)
160{ 159{
161 int err = 0; 160 int err = 0;
162 struct class_device *class_err; 161 struct device *dev;
163 162
164 class_err = class_device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), NULL, "cpu%d",i); 163 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), "cpu%d",i);
165 if (IS_ERR(class_err)) 164 if (IS_ERR(dev))
166 err = PTR_ERR(class_err); 165 err = PTR_ERR(dev);
167 return err; 166 return err;
168} 167}
169 168
170#ifdef CONFIG_HOTPLUG_CPU
171static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 169static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
172{ 170{
173 unsigned int cpu = (unsigned long)hcpu; 171 unsigned int cpu = (unsigned long)hcpu;
174 172
175 switch (action) { 173 switch (action) {
176 case CPU_ONLINE: 174 case CPU_ONLINE:
177 cpuid_class_device_create(cpu); 175 cpuid_device_create(cpu);
178 break; 176 break;
179 case CPU_DEAD: 177 case CPU_DEAD:
180 class_device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); 178 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
181 break; 179 break;
182 } 180 }
183 return NOTIFY_OK; 181 return NOTIFY_OK;
@@ -187,7 +185,6 @@ static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
187{ 185{
188 .notifier_call = cpuid_class_cpu_callback, 186 .notifier_call = cpuid_class_cpu_callback,
189}; 187};
190#endif /* !CONFIG_HOTPLUG_CPU */
191 188
192static int __init cpuid_init(void) 189static int __init cpuid_init(void)
193{ 190{
@@ -206,7 +203,7 @@ static int __init cpuid_init(void)
206 goto out_chrdev; 203 goto out_chrdev;
207 } 204 }
208 for_each_online_cpu(i) { 205 for_each_online_cpu(i) {
209 err = cpuid_class_device_create(i); 206 err = cpuid_device_create(i);
210 if (err != 0) 207 if (err != 0)
211 goto out_class; 208 goto out_class;
212 } 209 }
@@ -218,7 +215,7 @@ static int __init cpuid_init(void)
218out_class: 215out_class:
219 i = 0; 216 i = 0;
220 for_each_online_cpu(i) { 217 for_each_online_cpu(i) {
221 class_device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i)); 218 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i));
222 } 219 }
223 class_destroy(cpuid_class); 220 class_destroy(cpuid_class);
224out_chrdev: 221out_chrdev:
@@ -232,7 +229,7 @@ static void __exit cpuid_exit(void)
232 int cpu = 0; 229 int cpu = 0;
233 230
234 for_each_online_cpu(cpu) 231 for_each_online_cpu(cpu)
235 class_device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); 232 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
236 class_destroy(cpuid_class); 233 class_destroy(cpuid_class);
237 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 234 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
238 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); 235 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 144b43288965..a5e0e990ea95 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -31,68 +31,6 @@
31/* This keeps a track of which one is crashing cpu. */ 31/* This keeps a track of which one is crashing cpu. */
32static int crashing_cpu; 32static int crashing_cpu;
33 33
34static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
35 size_t data_len)
36{
37 struct elf_note note;
38
39 note.n_namesz = strlen(name) + 1;
40 note.n_descsz = data_len;
41 note.n_type = type;
42 memcpy(buf, &note, sizeof(note));
43 buf += (sizeof(note) +3)/4;
44 memcpy(buf, name, note.n_namesz);
45 buf += (note.n_namesz + 3)/4;
46 memcpy(buf, data, note.n_descsz);
47 buf += (note.n_descsz + 3)/4;
48
49 return buf;
50}
51
52static void final_note(u32 *buf)
53{
54 struct elf_note note;
55
56 note.n_namesz = 0;
57 note.n_descsz = 0;
58 note.n_type = 0;
59 memcpy(buf, &note, sizeof(note));
60}
61
62static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
63{
64 struct elf_prstatus prstatus;
65 u32 *buf;
66
67 if ((cpu < 0) || (cpu >= NR_CPUS))
68 return;
69
70 /* Using ELF notes here is opportunistic.
71 * I need a well defined structure format
72 * for the data I pass, and I need tags
73 * on the data to indicate what information I have
74 * squirrelled away. ELF notes happen to provide
75 * all of that, so there is no need to invent something new.
76 */
77 buf = (u32*)per_cpu_ptr(crash_notes, cpu);
78 if (!buf)
79 return;
80 memset(&prstatus, 0, sizeof(prstatus));
81 prstatus.pr_pid = current->pid;
82 elf_core_copy_regs(&prstatus.pr_reg, regs);
83 buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
84 sizeof(prstatus));
85 final_note(buf);
86}
87
88static void crash_save_self(struct pt_regs *regs)
89{
90 int cpu;
91
92 cpu = safe_smp_processor_id();
93 crash_save_this_cpu(regs, cpu);
94}
95
96#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 34#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
97static atomic_t waiting_for_crash_ipi; 35static atomic_t waiting_for_crash_ipi;
98 36
@@ -121,7 +59,7 @@ static int crash_nmi_callback(struct notifier_block *self,
121 crash_fixup_ss_esp(&fixed_regs, regs); 59 crash_fixup_ss_esp(&fixed_regs, regs);
122 regs = &fixed_regs; 60 regs = &fixed_regs;
123 } 61 }
124 crash_save_this_cpu(regs, cpu); 62 crash_save_cpu(regs, cpu);
125 disable_local_APIC(); 63 disable_local_APIC();
126 atomic_dec(&waiting_for_crash_ipi); 64 atomic_dec(&waiting_for_crash_ipi);
127 /* Assume hlt works */ 65 /* Assume hlt works */
@@ -195,5 +133,5 @@ void machine_crash_shutdown(struct pt_regs *regs)
195#if defined(CONFIG_X86_IO_APIC) 133#if defined(CONFIG_X86_IO_APIC)
196 disable_IO_APIC(); 134 disable_IO_APIC();
197#endif 135#endif
198 crash_save_self(regs); 136 crash_save_cpu(regs, safe_smp_processor_id());
199} 137}
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
new file mode 100644
index 000000000000..2f7d0a92fd7c
--- /dev/null
+++ b/arch/i386/kernel/e820.c
@@ -0,0 +1,894 @@
1#include <linux/kernel.h>
2#include <linux/types.h>
3#include <linux/init.h>
4#include <linux/bootmem.h>
5#include <linux/ioport.h>
6#include <linux/string.h>
7#include <linux/kexec.h>
8#include <linux/module.h>
9#include <linux/mm.h>
10#include <linux/efi.h>
11#include <linux/pfn.h>
12#include <linux/uaccess.h>
13
14#include <asm/pgtable.h>
15#include <asm/page.h>
16#include <asm/e820.h>
17
18#ifdef CONFIG_EFI
19int efi_enabled = 0;
20EXPORT_SYMBOL(efi_enabled);
21#endif
22
23struct e820map e820;
24struct change_member {
25 struct e820entry *pbios; /* pointer to original bios entry */
26 unsigned long long addr; /* address for this change point */
27};
28static struct change_member change_point_list[2*E820MAX] __initdata;
29static struct change_member *change_point[2*E820MAX] __initdata;
30static struct e820entry *overlap_list[E820MAX] __initdata;
31static struct e820entry new_bios[E820MAX] __initdata;
32/* For PCI or other memory-mapped resources */
33unsigned long pci_mem_start = 0x10000000;
34#ifdef CONFIG_PCI
35EXPORT_SYMBOL(pci_mem_start);
36#endif
37extern int user_defined_memmap;
38struct resource data_resource = {
39 .name = "Kernel data",
40 .start = 0,
41 .end = 0,
42 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
43};
44
45struct resource code_resource = {
46 .name = "Kernel code",
47 .start = 0,
48 .end = 0,
49 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
50};
51
52static struct resource system_rom_resource = {
53 .name = "System ROM",
54 .start = 0xf0000,
55 .end = 0xfffff,
56 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
57};
58
59static struct resource extension_rom_resource = {
60 .name = "Extension ROM",
61 .start = 0xe0000,
62 .end = 0xeffff,
63 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
64};
65
66static struct resource adapter_rom_resources[] = { {
67 .name = "Adapter ROM",
68 .start = 0xc8000,
69 .end = 0,
70 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
71}, {
72 .name = "Adapter ROM",
73 .start = 0,
74 .end = 0,
75 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
76}, {
77 .name = "Adapter ROM",
78 .start = 0,
79 .end = 0,
80 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
81}, {
82 .name = "Adapter ROM",
83 .start = 0,
84 .end = 0,
85 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
86}, {
87 .name = "Adapter ROM",
88 .start = 0,
89 .end = 0,
90 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
91}, {
92 .name = "Adapter ROM",
93 .start = 0,
94 .end = 0,
95 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
96} };
97
98static struct resource video_rom_resource = {
99 .name = "Video ROM",
100 .start = 0xc0000,
101 .end = 0xc7fff,
102 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
103};
104
105static struct resource video_ram_resource = {
106 .name = "Video RAM area",
107 .start = 0xa0000,
108 .end = 0xbffff,
109 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
110};
111
112static struct resource standard_io_resources[] = { {
113 .name = "dma1",
114 .start = 0x0000,
115 .end = 0x001f,
116 .flags = IORESOURCE_BUSY | IORESOURCE_IO
117}, {
118 .name = "pic1",
119 .start = 0x0020,
120 .end = 0x0021,
121 .flags = IORESOURCE_BUSY | IORESOURCE_IO
122}, {
123 .name = "timer0",
124 .start = 0x0040,
125 .end = 0x0043,
126 .flags = IORESOURCE_BUSY | IORESOURCE_IO
127}, {
128 .name = "timer1",
129 .start = 0x0050,
130 .end = 0x0053,
131 .flags = IORESOURCE_BUSY | IORESOURCE_IO
132}, {
133 .name = "keyboard",
134 .start = 0x0060,
135 .end = 0x006f,
136 .flags = IORESOURCE_BUSY | IORESOURCE_IO
137}, {
138 .name = "dma page reg",
139 .start = 0x0080,
140 .end = 0x008f,
141 .flags = IORESOURCE_BUSY | IORESOURCE_IO
142}, {
143 .name = "pic2",
144 .start = 0x00a0,
145 .end = 0x00a1,
146 .flags = IORESOURCE_BUSY | IORESOURCE_IO
147}, {
148 .name = "dma2",
149 .start = 0x00c0,
150 .end = 0x00df,
151 .flags = IORESOURCE_BUSY | IORESOURCE_IO
152}, {
153 .name = "fpu",
154 .start = 0x00f0,
155 .end = 0x00ff,
156 .flags = IORESOURCE_BUSY | IORESOURCE_IO
157} };
158
159static int romsignature(const unsigned char *x)
160{
161 unsigned short sig;
162 int ret = 0;
163 if (probe_kernel_address((const unsigned short *)x, sig) == 0)
164 ret = (sig == 0xaa55);
165 return ret;
166}
167
168static int __init romchecksum(unsigned char *rom, unsigned long length)
169{
170 unsigned char *p, sum = 0;
171
172 for (p = rom; p < rom + length; p++)
173 sum += *p;
174 return sum == 0;
175}
176
177static void __init probe_roms(void)
178{
179 unsigned long start, length, upper;
180 unsigned char *rom;
181 int i;
182
183 /* video rom */
184 upper = adapter_rom_resources[0].start;
185 for (start = video_rom_resource.start; start < upper; start += 2048) {
186 rom = isa_bus_to_virt(start);
187 if (!romsignature(rom))
188 continue;
189
190 video_rom_resource.start = start;
191
192 /* 0 < length <= 0x7f * 512, historically */
193 length = rom[2] * 512;
194
195 /* if checksum okay, trust length byte */
196 if (length && romchecksum(rom, length))
197 video_rom_resource.end = start + length - 1;
198
199 request_resource(&iomem_resource, &video_rom_resource);
200 break;
201 }
202
203 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
204 if (start < upper)
205 start = upper;
206
207 /* system rom */
208 request_resource(&iomem_resource, &system_rom_resource);
209 upper = system_rom_resource.start;
210
211 /* check for extension rom (ignore length byte!) */
212 rom = isa_bus_to_virt(extension_rom_resource.start);
213 if (romsignature(rom)) {
214 length = extension_rom_resource.end - extension_rom_resource.start + 1;
215 if (romchecksum(rom, length)) {
216 request_resource(&iomem_resource, &extension_rom_resource);
217 upper = extension_rom_resource.start;
218 }
219 }
220
221 /* check for adapter roms on 2k boundaries */
222 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
223 rom = isa_bus_to_virt(start);
224 if (!romsignature(rom))
225 continue;
226
227 /* 0 < length <= 0x7f * 512, historically */
228 length = rom[2] * 512;
229
230 /* but accept any length that fits if checksum okay */
231 if (!length || start + length > upper || !romchecksum(rom, length))
232 continue;
233
234 adapter_rom_resources[i].start = start;
235 adapter_rom_resources[i].end = start + length - 1;
236 request_resource(&iomem_resource, &adapter_rom_resources[i]);
237
238 start = adapter_rom_resources[i++].end & ~2047UL;
239 }
240}
241
242/*
243 * Request address space for all standard RAM and ROM resources
244 * and also for regions reported as reserved by the e820.
245 */
246static void __init
247legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
248{
249 int i;
250
251 probe_roms();
252 for (i = 0; i < e820.nr_map; i++) {
253 struct resource *res;
254#ifndef CONFIG_RESOURCES_64BIT
255 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
256 continue;
257#endif
258 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
259 switch (e820.map[i].type) {
260 case E820_RAM: res->name = "System RAM"; break;
261 case E820_ACPI: res->name = "ACPI Tables"; break;
262 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
263 default: res->name = "reserved";
264 }
265 res->start = e820.map[i].addr;
266 res->end = res->start + e820.map[i].size - 1;
267 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
268 if (request_resource(&iomem_resource, res)) {
269 kfree(res);
270 continue;
271 }
272 if (e820.map[i].type == E820_RAM) {
273 /*
274 * We don't know which RAM region contains kernel data,
275 * so we try it repeatedly and let the resource manager
276 * test it.
277 */
278 request_resource(res, code_resource);
279 request_resource(res, data_resource);
280#ifdef CONFIG_KEXEC
281 request_resource(res, &crashk_res);
282#endif
283 }
284 }
285}
286
287/*
288 * Request address space for all standard resources
289 *
290 * This is called just before pcibios_init(), which is also a
291 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
292 */
293static int __init request_standard_resources(void)
294{
295 int i;
296
297 printk("Setting up standard PCI resources\n");
298 if (efi_enabled)
299 efi_initialize_iomem_resources(&code_resource, &data_resource);
300 else
301 legacy_init_iomem_resources(&code_resource, &data_resource);
302
303 /* EFI systems may still have VGA */
304 request_resource(&iomem_resource, &video_ram_resource);
305
306 /* request I/O space for devices used on all i[345]86 PCs */
307 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
308 request_resource(&ioport_resource, &standard_io_resources[i]);
309 return 0;
310}
311
312subsys_initcall(request_standard_resources);
313
314void __init add_memory_region(unsigned long long start,
315 unsigned long long size, int type)
316{
317 int x;
318
319 if (!efi_enabled) {
320 x = e820.nr_map;
321
322 if (x == E820MAX) {
323 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
324 return;
325 }
326
327 e820.map[x].addr = start;
328 e820.map[x].size = size;
329 e820.map[x].type = type;
330 e820.nr_map++;
331 }
332} /* add_memory_region */
333
/*
 * Sanitize the BIOS e820 map.
 *
 * Some e820 responses include overlapping entries.  The following
 * replaces the original e820 map with a new one, removing overlaps.
 *
 * On success the sanitized map is written back over 'biosmap' and
 * *pnr_map is updated with the new entry count; returns 0.  Returns -1
 * (map untouched) when there are fewer than two entries, or when some
 * entry's addr+size wraps past 2^64.
 *
 * NOTE(review): *pnr_map is a 'char *', so the count is limited to 127
 * entries -- presumably matching the boot-protocol field; confirm
 * against the setup code.  Uses the file-scope scratch arrays
 * change_point[], change_point_list[], overlap_list[] and new_bios[]
 * declared earlier in this file, so it is not reentrant (fine for
 * early boot, where it runs single-threaded).
 */
int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
{
	struct change_member *change_tmp;
	unsigned long current_type, last_type;
	unsigned long long last_addr;
	int chgidx, still_changing;
	int overlap_entries;
	int new_bios_entry;
	int old_nr, new_nr, chg_nr;
	int i;

	/*
		Visually we're performing the following (1,2,3,4 = memory types)...

		Sample memory map (w/overlaps):
		   ____22__________________
		   ______________________4_
		   ____1111________________
		   _44_____________________
		   11111111________________
		   ____________________33__
		   ___________44___________
		   __________33333_________
		   ______________22________
		   ___________________2222_
		   _________111111111______
		   _____________________11_
		   _________________4______

		Sanitized equivalent (no overlap):
		   1_______________________
		   _44_____________________
		   ___1____________________
		   ____22__________________
		   ______11________________
		   _________1______________
		   __________3_____________
		   ___________44___________
		   _____________33_________
		   _______________2________
		   ________________1_______
		   _________________4______
		   ___________________2____
		   ____________________33__
		   ______________________4_
	*/
	/* NOTE(review): debug printk without a KERN_ level; left in place */
	printk("sanitize start\n");
	/* if there's only one memory region, don't bother */
	if (*pnr_map < 2) {
		printk("sanitize bail 0\n");
		return -1;
	}

	old_nr = *pnr_map;

	/* bail out if we find any unreasonable addresses in bios map */
	for (i=0; i<old_nr; i++)
		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
			printk("sanitize bail 1\n");
			return -1;
		}

	/* create pointers for initial change-point information (for sorting) */
	for (i=0; i < 2*old_nr; i++)
		change_point[i] = &change_point_list[i];

	/* record all known change-points (starting and ending addresses),
	   omitting those that are for empty memory regions */
	chgidx = 0;
	for (i=0; i < old_nr; i++) {
		if (biosmap[i].size != 0) {
			change_point[chgidx]->addr = biosmap[i].addr;
			change_point[chgidx++]->pbios = &biosmap[i];
			change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
			change_point[chgidx++]->pbios = &biosmap[i];
		}
	}
	chg_nr = chgidx;    	/* true number of change-points */

	/* sort change-point list by memory addresses (low -> high);
	   bubble sort is fine here: chg_nr is at most 2*old_nr */
	still_changing = 1;
	while (still_changing) {
		still_changing = 0;
		for (i=1; i < chg_nr; i++) {
			/* if <current_addr> > <last_addr>, swap */
			/* or, if current=<start_addr> & last=<end_addr>, swap */
			if ((change_point[i]->addr < change_point[i-1]->addr) ||
				((change_point[i]->addr == change_point[i-1]->addr) &&
				 (change_point[i]->addr == change_point[i]->pbios->addr) &&
				 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
			   )
			{
				change_tmp = change_point[i];
				change_point[i] = change_point[i-1];
				change_point[i-1] = change_tmp;
				still_changing=1;
			}
		}
	}

	/* create a new bios memory map, removing overlaps */
	overlap_entries=0;	 /* number of entries in the overlap table */
	new_bios_entry=0;	 /* index for creating new bios map entries */
	last_type = 0;		 /* start with undefined memory type */
	last_addr = 0;		 /* start with 0 as last starting address */
	/* loop through change-points, determining effect on the new bios map */
	for (chgidx=0; chgidx < chg_nr; chgidx++)
	{
		/* keep track of all overlapping bios entries */
		if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
		{
			/* add map entry to overlap list (> 1 entry implies an overlap) */
			overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
		}
		else
		{
			/* remove entry from list (order independent, so swap with last) */
			for (i=0; i<overlap_entries; i++)
			{
				if (overlap_list[i] == change_point[chgidx]->pbios)
					overlap_list[i] = overlap_list[overlap_entries-1];
			}
			overlap_entries--;
		}
		/* if there are overlapping entries, decide which "type" to use */
		/* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
		current_type = 0;
		for (i=0; i<overlap_entries; i++)
			if (overlap_list[i]->type > current_type)
				current_type = overlap_list[i]->type;
		/* continue building up new bios map based on this information */
		if (current_type != last_type) {
			if (last_type != 0) {
				new_bios[new_bios_entry].size =
					change_point[chgidx]->addr - last_addr;
				/* move forward only if the new size was non-zero */
				if (new_bios[new_bios_entry].size != 0)
					if (++new_bios_entry >= E820MAX)
						break; 	/* no more space left for new bios entries */
			}
			if (current_type != 0) {
				new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
				new_bios[new_bios_entry].type = current_type;
				last_addr=change_point[chgidx]->addr;
			}
			last_type = current_type;
		}
	}
	new_nr = new_bios_entry;   	/* retain count for new bios entries */

	/* copy new bios mapping into original location */
	memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
	*pnr_map = new_nr;

	printk("sanitize end\n");
	return 0;
}
498
/*
 * Copy the BIOS e820 map into a safe place.
 *
 * Sanity-check it while we're at it..
 *
 * If we're lucky and live on a modern system, the setup code
 * will have given us a memory map that we can use to properly
 * set up memory.  If we aren't, we'll fake a memory map.
 *
 * We check to see that the memory map contains at least 2 elements
 * before we'll use it, because the detection code in setup.S may
 * not be perfect and most every PC known to man has two memory
 * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
 * thinkpad 560x, for example, does not cooperate with the memory
 * detection code.)
 *
 * Returns 0 on success, -1 if the map was rejected (fewer than two
 * entries, or an entry whose addr+size wraps past 2^64).  Accepted
 * entries are fed through add_memory_region() into the global e820 map.
 */
int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
{
	/* Only one memory region (or negative)? Ignore it */
	if (nr_map < 2)
		return -1;

	do {
		unsigned long long start = biosmap->addr;
		unsigned long long size = biosmap->size;
		unsigned long long end = start + size;
		unsigned long type = biosmap->type;
		/* NOTE(review): debug trace printed for every BIOS entry */
		printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);

		/* Overflow in 64 bits? Ignore the memory map. */
		if (start > end)
			return -1;

		/*
		 * Some BIOSes claim RAM in the 640k - 1M region.
		 * Not right. Fix it up.
		 */
		if (type == E820_RAM) {
			printk("copy_e820_map() type is E820_RAM\n");
			if (start < 0x100000ULL && end > 0xA0000ULL) {
				printk("copy_e820_map() lies in range...\n");
				if (start < 0xA0000ULL) {
					printk("copy_e820_map() start < 0xA0000ULL\n");
					/* keep only the piece below 640k */
					add_memory_region(start, 0xA0000ULL-start, type);
				}
				if (end <= 0x100000ULL) {
					printk("copy_e820_map() end <= 0x100000ULL\n");
					/* 'continue' still evaluates the loop
					 * condition below, so the entry is
					 * advanced and the count decremented */
					continue;
				}
				/* keep the piece above 1MB */
				start = 0x100000ULL;
				size = end - start;
			}
		}
		add_memory_region(start, size, type);
	} while (biosmap++,--nr_map);	/* advance to next entry, count down */
	return 0;
}
556
557/*
558 * Callback for efi_memory_walk.
559 */
560static int __init
561efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
562{
563 unsigned long *max_pfn = arg, pfn;
564
565 if (start < end) {
566 pfn = PFN_UP(end -1);
567 if (pfn > *max_pfn)
568 *max_pfn = pfn;
569 }
570 return 0;
571}
572
573static int __init
574efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
575{
576 memory_present(0, PFN_UP(start), PFN_DOWN(end));
577 return 0;
578}
579
580/*
581 * Find the highest page frame number we have available
582 */
583void __init find_max_pfn(void)
584{
585 int i;
586
587 max_pfn = 0;
588 if (efi_enabled) {
589 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
590 efi_memmap_walk(efi_memory_present_wrapper, NULL);
591 return;
592 }
593
594 for (i = 0; i < e820.nr_map; i++) {
595 unsigned long start, end;
596 /* RAM? */
597 if (e820.map[i].type != E820_RAM)
598 continue;
599 start = PFN_UP(e820.map[i].addr);
600 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
601 if (start >= end)
602 continue;
603 if (end > max_pfn)
604 max_pfn = end;
605 memory_present(0, start, end);
606 }
607}
608
609/*
610 * Free all available memory for boot time allocation. Used
611 * as a callback function by efi_memory_walk()
612 */
613
614static int __init
615free_available_memory(unsigned long start, unsigned long end, void *arg)
616{
617 /* check max_low_pfn */
618 if (start >= (max_low_pfn << PAGE_SHIFT))
619 return 0;
620 if (end >= (max_low_pfn << PAGE_SHIFT))
621 end = max_low_pfn << PAGE_SHIFT;
622 if (start < end)
623 free_bootmem(start, end - start);
624
625 return 0;
626}
627/*
628 * Register fully available low RAM pages with the bootmem allocator.
629 */
630void __init register_bootmem_low_pages(unsigned long max_low_pfn)
631{
632 int i;
633
634 if (efi_enabled) {
635 efi_memmap_walk(free_available_memory, NULL);
636 return;
637 }
638 for (i = 0; i < e820.nr_map; i++) {
639 unsigned long curr_pfn, last_pfn, size;
640 /*
641 * Reserve usable low memory
642 */
643 if (e820.map[i].type != E820_RAM)
644 continue;
645 /*
646 * We are rounding up the start address of usable memory:
647 */
648 curr_pfn = PFN_UP(e820.map[i].addr);
649 if (curr_pfn >= max_low_pfn)
650 continue;
651 /*
652 * ... and at the end of the usable range downwards:
653 */
654 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
655
656 if (last_pfn > max_low_pfn)
657 last_pfn = max_low_pfn;
658
659 /*
660 * .. finally, did all the rounding and playing
661 * around just make the area go away?
662 */
663 if (last_pfn <= curr_pfn)
664 continue;
665
666 size = last_pfn - curr_pfn;
667 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
668 }
669}
670
671void __init register_memory(void)
672{
673 unsigned long gapstart, gapsize, round;
674 unsigned long long last;
675 int i;
676
677 /*
678 * Search for the bigest gap in the low 32 bits of the e820
679 * memory space.
680 */
681 last = 0x100000000ull;
682 gapstart = 0x10000000;
683 gapsize = 0x400000;
684 i = e820.nr_map;
685 while (--i >= 0) {
686 unsigned long long start = e820.map[i].addr;
687 unsigned long long end = start + e820.map[i].size;
688
689 /*
690 * Since "last" is at most 4GB, we know we'll
691 * fit in 32 bits if this condition is true
692 */
693 if (last > end) {
694 unsigned long gap = last - end;
695
696 if (gap > gapsize) {
697 gapsize = gap;
698 gapstart = end;
699 }
700 }
701 if (start < last)
702 last = start;
703 }
704
705 /*
706 * See how much we want to round up: start off with
707 * rounding to the next 1MB area.
708 */
709 round = 0x100000;
710 while ((gapsize >> 4) > round)
711 round += round;
712 /* Fun with two's complement */
713 pci_mem_start = (gapstart + round) & -round;
714
715 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
716 pci_mem_start, gapstart, gapsize);
717}
718
719void __init print_memory_map(char *who)
720{
721 int i;
722
723 for (i = 0; i < e820.nr_map; i++) {
724 printk(" %s: %016Lx - %016Lx ", who,
725 e820.map[i].addr,
726 e820.map[i].addr + e820.map[i].size);
727 switch (e820.map[i].type) {
728 case E820_RAM: printk("(usable)\n");
729 break;
730 case E820_RESERVED:
731 printk("(reserved)\n");
732 break;
733 case E820_ACPI:
734 printk("(ACPI data)\n");
735 break;
736 case E820_NVS:
737 printk("(ACPI NVS)\n");
738 break;
739 default: printk("type %lu\n", e820.map[i].type);
740 break;
741 }
742 }
743}
744
/*
 * Trim the EFI memory map so no available memory extends past 'size'
 * bytes of physical address space.
 *
 * The descriptor array is compacted in place: 'p' is the read cursor,
 * 'p1' the write cursor.  Available regions starting at or beyond the
 * limit are dropped; regions straddling it are clipped.  Regions with
 * the EFI_MEMORY_RUNTIME attribute are always kept, even above the
 * limit, because runtime services need them mapped.  memmap.nr_map and
 * memmap.map_end are rewritten to describe the compacted array.
 *
 * NOTE(review): the read and write cursors alias the same buffer; this
 * is safe only because the write cursor never passes the read cursor.
 */
static __init __always_inline void efi_limit_regions(unsigned long long size)
{
	unsigned long long current_addr = 0;
	efi_memory_desc_t *md, *next_md;
	void *p, *p1;
	int i, j;

	j = 0;			/* number of descriptors kept so far */
	p1 = memmap.map;	/* write cursor (compaction target) */
	for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
		md = p;
		next_md = p1;
		/* physical end of this region */
		current_addr = md->phys_addr +
			PFN_PHYS(md->num_pages);
		if (is_available_memory(md)) {
			/* drop available regions entirely past the limit */
			if (md->phys_addr >= size) continue;
			memcpy(next_md, md, memmap.desc_size);
			/* clip a region straddling the limit */
			if (current_addr >= size) {
				next_md->num_pages -=
					PFN_UP(current_addr-size);
			}
			p1 += memmap.desc_size;
			next_md = p1;
			j++;
		} else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
			   EFI_MEMORY_RUNTIME) {
			/* In order to make runtime services
			 * available we have to include runtime
			 * memory regions in memory map */
			memcpy(next_md, md, memmap.desc_size);
			p1 += memmap.desc_size;
			next_md = p1;
			j++;
		}
	}
	/* publish the compacted map's bounds */
	memmap.nr_map = j;
	memmap.map_end = memmap.map +
		(memmap.nr_map * memmap.desc_size);
}
784
785void __init limit_regions(unsigned long long size)
786{
787 unsigned long long current_addr;
788 int i;
789
790 print_memory_map("limit_regions start");
791 if (efi_enabled) {
792 efi_limit_regions(size);
793 return;
794 }
795 for (i = 0; i < e820.nr_map; i++) {
796 current_addr = e820.map[i].addr + e820.map[i].size;
797 if (current_addr < size)
798 continue;
799
800 if (e820.map[i].type != E820_RAM)
801 continue;
802
803 if (e820.map[i].addr >= size) {
804 /*
805 * This region starts past the end of the
806 * requested size, skip it completely.
807 */
808 e820.nr_map = i;
809 } else {
810 e820.nr_map = i + 1;
811 e820.map[i].size -= current_addr - size;
812 }
813 print_memory_map("limit_regions endfor");
814 return;
815 }
816 print_memory_map("limit_regions endfunc");
817}
818
819 /*
820 * This function checks if the entire range <start,end> is mapped with type.
821 *
822 * Note: this function only works correct if the e820 table is sorted and
823 * not-overlapping, which is the case
824 */
825int __init
826e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
827{
828 u64 start = s;
829 u64 end = e;
830 int i;
831 for (i = 0; i < e820.nr_map; i++) {
832 struct e820entry *ei = &e820.map[i];
833 if (type && ei->type != type)
834 continue;
835 /* is the region (part) in overlap with the current region ?*/
836 if (ei->addr >= end || ei->addr + ei->size <= start)
837 continue;
838 /* if the region is at the beginning of <start,end> we move
839 * start to the end of the region since it's ok until there
840 */
841 if (ei->addr <= start)
842 start = ei->addr + ei->size;
843 /* if start is now at or beyond end, we're done, full
844 * coverage */
845 if (start >= end)
846 return 1; /* we're done */
847 }
848 return 0;
849}
850
851static int __init parse_memmap(char *arg)
852{
853 if (!arg)
854 return -EINVAL;
855
856 if (strcmp(arg, "exactmap") == 0) {
857#ifdef CONFIG_CRASH_DUMP
858 /* If we are doing a crash dump, we
859 * still need to know the real mem
860 * size before original memory map is
861 * reset.
862 */
863 find_max_pfn();
864 saved_max_pfn = max_pfn;
865#endif
866 e820.nr_map = 0;
867 user_defined_memmap = 1;
868 } else {
869 /* If the user specifies memory size, we
870 * limit the BIOS-provided memory map to
871 * that size. exactmap can be used to specify
872 * the exact map. mem=number can be used to
873 * trim the existing memory map.
874 */
875 unsigned long long start_at, mem_size;
876
877 mem_size = memparse(arg, &arg);
878 if (*arg == '@') {
879 start_at = memparse(arg+1, &arg);
880 add_memory_region(start_at, mem_size, E820_RAM);
881 } else if (*arg == '#') {
882 start_at = memparse(arg+1, &arg);
883 add_memory_region(start_at, mem_size, E820_ACPI);
884 } else if (*arg == '$') {
885 start_at = memparse(arg+1, &arg);
886 add_memory_region(start_at, mem_size, E820_RESERVED);
887 } else {
888 limit_regions(mem_size);
889 user_defined_memmap = 1;
890 }
891 }
892 return 0;
893}
894early_param("memmap", parse_memmap);
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 8b40648d0ef0..b92c7f0a358a 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -194,17 +194,24 @@ inline int efi_set_rtc_mmss(unsigned long nowtime)
194 return 0; 194 return 0;
195} 195}
196/* 196/*
197 * This should only be used during kernel init and before runtime 197 * This is used during kernel init before runtime
198 * services have been remapped, therefore, we'll need to call in physical 198 * services have been remapped and also during suspend, therefore,
199 * mode. Note, this call isn't used later, so mark it __init. 199 * we'll need to call both in physical and virtual modes.
200 */ 200 */
201inline unsigned long __init efi_get_time(void) 201inline unsigned long efi_get_time(void)
202{ 202{
203 efi_status_t status; 203 efi_status_t status;
204 efi_time_t eft; 204 efi_time_t eft;
205 efi_time_cap_t cap; 205 efi_time_cap_t cap;
206 206
207 status = phys_efi_get_time(&eft, &cap); 207 if (efi.get_time) {
208 /* if we are in virtual mode use remapped function */
209 status = efi.get_time(&eft, &cap);
210 } else {
211 /* we are in physical mode */
212 status = phys_efi_get_time(&eft, &cap);
213 }
214
208 if (status != EFI_SUCCESS) 215 if (status != EFI_SUCCESS)
209 printk("Oops: efitime: can't read time status: 0x%lx\n",status); 216 printk("Oops: efitime: can't read time status: 0x%lx\n",status);
210 217
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5a63d6fdb70e..de34b7fed3c1 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -30,12 +30,13 @@
30 * 18(%esp) - %eax 30 * 18(%esp) - %eax
31 * 1C(%esp) - %ds 31 * 1C(%esp) - %ds
32 * 20(%esp) - %es 32 * 20(%esp) - %es
33 * 24(%esp) - orig_eax 33 * 24(%esp) - %gs
34 * 28(%esp) - %eip 34 * 28(%esp) - orig_eax
35 * 2C(%esp) - %cs 35 * 2C(%esp) - %eip
36 * 30(%esp) - %eflags 36 * 30(%esp) - %cs
37 * 34(%esp) - %oldesp 37 * 34(%esp) - %eflags
38 * 38(%esp) - %oldss 38 * 38(%esp) - %oldesp
39 * 3C(%esp) - %oldss
39 * 40 *
40 * "current" is in register %ebx during any slow entries. 41 * "current" is in register %ebx during any slow entries.
41 */ 42 */
@@ -48,26 +49,24 @@
48#include <asm/smp.h> 49#include <asm/smp.h>
49#include <asm/page.h> 50#include <asm/page.h>
50#include <asm/desc.h> 51#include <asm/desc.h>
52#include <asm/percpu.h>
51#include <asm/dwarf2.h> 53#include <asm/dwarf2.h>
52#include "irq_vectors.h" 54#include "irq_vectors.h"
53 55
54#define nr_syscalls ((syscall_table_size)/4) 56/*
57 * We use macros for low-level operations which need to be overridden
58 * for paravirtualization. The following will never clobber any registers:
59 * INTERRUPT_RETURN (aka. "iret")
60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
61 * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
62 *
63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
65 * Allowing a register to be clobbered can shrink the paravirt replacement
66 * enough to patch inline, increasing performance.
67 */
55 68
56EBX = 0x00 69#define nr_syscalls ((syscall_table_size)/4)
57ECX = 0x04
58EDX = 0x08
59ESI = 0x0C
60EDI = 0x10
61EBP = 0x14
62EAX = 0x18
63DS = 0x1C
64ES = 0x20
65ORIG_EAX = 0x24
66EIP = 0x28
67CS = 0x2C
68EFLAGS = 0x30
69OLDESP = 0x34
70OLDSS = 0x38
71 70
72CF_MASK = 0x00000001 71CF_MASK = 0x00000001
73TF_MASK = 0x00000100 72TF_MASK = 0x00000100
@@ -76,23 +75,16 @@ DF_MASK = 0x00000400
76NT_MASK = 0x00004000 75NT_MASK = 0x00004000
77VM_MASK = 0x00020000 76VM_MASK = 0x00020000
78 77
79/* These are replaces for paravirtualization */
80#define DISABLE_INTERRUPTS cli
81#define ENABLE_INTERRUPTS sti
82#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
83#define INTERRUPT_RETURN iret
84#define GET_CR0_INTO_EAX movl %cr0, %eax
85
86#ifdef CONFIG_PREEMPT 78#ifdef CONFIG_PREEMPT
87#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF 79#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
88#else 80#else
89#define preempt_stop 81#define preempt_stop(clobbers)
90#define resume_kernel restore_nocheck 82#define resume_kernel restore_nocheck
91#endif 83#endif
92 84
93.macro TRACE_IRQS_IRET 85.macro TRACE_IRQS_IRET
94#ifdef CONFIG_TRACE_IRQFLAGS 86#ifdef CONFIG_TRACE_IRQFLAGS
95 testl $IF_MASK,EFLAGS(%esp) # interrupts off? 87 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
96 jz 1f 88 jz 1f
97 TRACE_IRQS_ON 89 TRACE_IRQS_ON
981: 901:
@@ -107,6 +99,9 @@ VM_MASK = 0x00020000
107 99
108#define SAVE_ALL \ 100#define SAVE_ALL \
109 cld; \ 101 cld; \
102 pushl %gs; \
103 CFI_ADJUST_CFA_OFFSET 4;\
104 /*CFI_REL_OFFSET gs, 0;*/\
110 pushl %es; \ 105 pushl %es; \
111 CFI_ADJUST_CFA_OFFSET 4;\ 106 CFI_ADJUST_CFA_OFFSET 4;\
112 /*CFI_REL_OFFSET es, 0;*/\ 107 /*CFI_REL_OFFSET es, 0;*/\
@@ -136,7 +131,9 @@ VM_MASK = 0x00020000
136 CFI_REL_OFFSET ebx, 0;\ 131 CFI_REL_OFFSET ebx, 0;\
137 movl $(__USER_DS), %edx; \ 132 movl $(__USER_DS), %edx; \
138 movl %edx, %ds; \ 133 movl %edx, %ds; \
139 movl %edx, %es; 134 movl %edx, %es; \
135 movl $(__KERNEL_PDA), %edx; \
136 movl %edx, %gs
140 137
141#define RESTORE_INT_REGS \ 138#define RESTORE_INT_REGS \
142 popl %ebx; \ 139 popl %ebx; \
@@ -169,17 +166,22 @@ VM_MASK = 0x00020000
1692: popl %es; \ 1662: popl %es; \
170 CFI_ADJUST_CFA_OFFSET -4;\ 167 CFI_ADJUST_CFA_OFFSET -4;\
171 /*CFI_RESTORE es;*/\ 168 /*CFI_RESTORE es;*/\
172.section .fixup,"ax"; \ 1693: popl %gs; \
1733: movl $0,(%esp); \ 170 CFI_ADJUST_CFA_OFFSET -4;\
174 jmp 1b; \ 171 /*CFI_RESTORE gs;*/\
172.pushsection .fixup,"ax"; \
1754: movl $0,(%esp); \ 1734: movl $0,(%esp); \
174 jmp 1b; \
1755: movl $0,(%esp); \
176 jmp 2b; \ 176 jmp 2b; \
177.previous; \ 1776: movl $0,(%esp); \
178 jmp 3b; \
178.section __ex_table,"a";\ 179.section __ex_table,"a";\
179 .align 4; \ 180 .align 4; \
180 .long 1b,3b; \ 181 .long 1b,4b; \
181 .long 2b,4b; \ 182 .long 2b,5b; \
182.previous 183 .long 3b,6b; \
184.popsection
183 185
184#define RING0_INT_FRAME \ 186#define RING0_INT_FRAME \
185 CFI_STARTPROC simple;\ 187 CFI_STARTPROC simple;\
@@ -198,18 +200,18 @@ VM_MASK = 0x00020000
198#define RING0_PTREGS_FRAME \ 200#define RING0_PTREGS_FRAME \
199 CFI_STARTPROC simple;\ 201 CFI_STARTPROC simple;\
200 CFI_SIGNAL_FRAME;\ 202 CFI_SIGNAL_FRAME;\
201 CFI_DEF_CFA esp, OLDESP-EBX;\ 203 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
202 /*CFI_OFFSET cs, CS-OLDESP;*/\ 204 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
203 CFI_OFFSET eip, EIP-OLDESP;\ 205 CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
204 /*CFI_OFFSET es, ES-OLDESP;*/\ 206 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
205 /*CFI_OFFSET ds, DS-OLDESP;*/\ 207 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
206 CFI_OFFSET eax, EAX-OLDESP;\ 208 CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
207 CFI_OFFSET ebp, EBP-OLDESP;\ 209 CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
208 CFI_OFFSET edi, EDI-OLDESP;\ 210 CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
209 CFI_OFFSET esi, ESI-OLDESP;\ 211 CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
210 CFI_OFFSET edx, EDX-OLDESP;\ 212 CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
211 CFI_OFFSET ecx, ECX-OLDESP;\ 213 CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
212 CFI_OFFSET ebx, EBX-OLDESP 214 CFI_OFFSET ebx, PT_EBX-PT_OLDESP
213 215
214ENTRY(ret_from_fork) 216ENTRY(ret_from_fork)
215 CFI_STARTPROC 217 CFI_STARTPROC
@@ -237,17 +239,18 @@ ENTRY(ret_from_fork)
237 ALIGN 239 ALIGN
238 RING0_PTREGS_FRAME 240 RING0_PTREGS_FRAME
239ret_from_exception: 241ret_from_exception:
240 preempt_stop 242 preempt_stop(CLBR_ANY)
241ret_from_intr: 243ret_from_intr:
242 GET_THREAD_INFO(%ebp) 244 GET_THREAD_INFO(%ebp)
243check_userspace: 245check_userspace:
244 movl EFLAGS(%esp), %eax # mix EFLAGS and CS 246 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
245 movb CS(%esp), %al 247 movb PT_CS(%esp), %al
246 andl $(VM_MASK | SEGMENT_RPL_MASK), %eax 248 andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
247 cmpl $USER_RPL, %eax 249 cmpl $USER_RPL, %eax
248 jb resume_kernel # not returning to v8086 or userspace 250 jb resume_kernel # not returning to v8086 or userspace
251
249ENTRY(resume_userspace) 252ENTRY(resume_userspace)
250 DISABLE_INTERRUPTS # make sure we don't miss an interrupt 253 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
251 # setting need_resched or sigpending 254 # setting need_resched or sigpending
252 # between sampling and the iret 255 # between sampling and the iret
253 movl TI_flags(%ebp), %ecx 256 movl TI_flags(%ebp), %ecx
@@ -258,14 +261,14 @@ ENTRY(resume_userspace)
258 261
259#ifdef CONFIG_PREEMPT 262#ifdef CONFIG_PREEMPT
260ENTRY(resume_kernel) 263ENTRY(resume_kernel)
261 DISABLE_INTERRUPTS 264 DISABLE_INTERRUPTS(CLBR_ANY)
262 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? 265 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
263 jnz restore_nocheck 266 jnz restore_nocheck
264need_resched: 267need_resched:
265 movl TI_flags(%ebp), %ecx # need_resched set ? 268 movl TI_flags(%ebp), %ecx # need_resched set ?
266 testb $_TIF_NEED_RESCHED, %cl 269 testb $_TIF_NEED_RESCHED, %cl
267 jz restore_all 270 jz restore_all
268 testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? 271 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
269 jz restore_all 272 jz restore_all
270 call preempt_schedule_irq 273 call preempt_schedule_irq
271 jmp need_resched 274 jmp need_resched
@@ -287,7 +290,7 @@ sysenter_past_esp:
287 * No need to follow this irqs on/off section: the syscall 290 * No need to follow this irqs on/off section: the syscall
288 * disabled irqs and here we enable it straight after entry: 291 * disabled irqs and here we enable it straight after entry:
289 */ 292 */
290 ENABLE_INTERRUPTS 293 ENABLE_INTERRUPTS(CLBR_NONE)
291 pushl $(__USER_DS) 294 pushl $(__USER_DS)
292 CFI_ADJUST_CFA_OFFSET 4 295 CFI_ADJUST_CFA_OFFSET 4
293 /*CFI_REL_OFFSET ss, 0*/ 296 /*CFI_REL_OFFSET ss, 0*/
@@ -331,20 +334,27 @@ sysenter_past_esp:
331 cmpl $(nr_syscalls), %eax 334 cmpl $(nr_syscalls), %eax
332 jae syscall_badsys 335 jae syscall_badsys
333 call *sys_call_table(,%eax,4) 336 call *sys_call_table(,%eax,4)
334 movl %eax,EAX(%esp) 337 movl %eax,PT_EAX(%esp)
335 DISABLE_INTERRUPTS 338 DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
336 TRACE_IRQS_OFF 339 TRACE_IRQS_OFF
337 movl TI_flags(%ebp), %ecx 340 movl TI_flags(%ebp), %ecx
338 testw $_TIF_ALLWORK_MASK, %cx 341 testw $_TIF_ALLWORK_MASK, %cx
339 jne syscall_exit_work 342 jne syscall_exit_work
340/* if something modifies registers it must also disable sysexit */ 343/* if something modifies registers it must also disable sysexit */
341 movl EIP(%esp), %edx 344 movl PT_EIP(%esp), %edx
342 movl OLDESP(%esp), %ecx 345 movl PT_OLDESP(%esp), %ecx
343 xorl %ebp,%ebp 346 xorl %ebp,%ebp
344 TRACE_IRQS_ON 347 TRACE_IRQS_ON
3481: mov PT_GS(%esp), %gs
345 ENABLE_INTERRUPTS_SYSEXIT 349 ENABLE_INTERRUPTS_SYSEXIT
346 CFI_ENDPROC 350 CFI_ENDPROC
347 351.pushsection .fixup,"ax"
3522: movl $0,PT_GS(%esp)
353 jmp 1b
354.section __ex_table,"a"
355 .align 4
356 .long 1b,2b
357.popsection
348 358
349 # system call handler stub 359 # system call handler stub
350ENTRY(system_call) 360ENTRY(system_call)
@@ -353,7 +363,7 @@ ENTRY(system_call)
353 CFI_ADJUST_CFA_OFFSET 4 363 CFI_ADJUST_CFA_OFFSET 4
354 SAVE_ALL 364 SAVE_ALL
355 GET_THREAD_INFO(%ebp) 365 GET_THREAD_INFO(%ebp)
356 testl $TF_MASK,EFLAGS(%esp) 366 testl $TF_MASK,PT_EFLAGS(%esp)
357 jz no_singlestep 367 jz no_singlestep
358 orl $_TIF_SINGLESTEP,TI_flags(%ebp) 368 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
359no_singlestep: 369no_singlestep:
@@ -365,9 +375,9 @@ no_singlestep:
365 jae syscall_badsys 375 jae syscall_badsys
366syscall_call: 376syscall_call:
367 call *sys_call_table(,%eax,4) 377 call *sys_call_table(,%eax,4)
368 movl %eax,EAX(%esp) # store the return value 378 movl %eax,PT_EAX(%esp) # store the return value
369syscall_exit: 379syscall_exit:
370 DISABLE_INTERRUPTS # make sure we don't miss an interrupt 380 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
371 # setting need_resched or sigpending 381 # setting need_resched or sigpending
372 # between sampling and the iret 382 # between sampling and the iret
373 TRACE_IRQS_OFF 383 TRACE_IRQS_OFF
@@ -376,12 +386,12 @@ syscall_exit:
376 jne syscall_exit_work 386 jne syscall_exit_work
377 387
378restore_all: 388restore_all:
379 movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS 389 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
380 # Warning: OLDSS(%esp) contains the wrong/random values if we 390 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
381 # are returning to the kernel. 391 # are returning to the kernel.
382 # See comments in process.c:copy_thread() for details. 392 # See comments in process.c:copy_thread() for details.
383 movb OLDSS(%esp), %ah 393 movb PT_OLDSS(%esp), %ah
384 movb CS(%esp), %al 394 movb PT_CS(%esp), %al
385 andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax 395 andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
386 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax 396 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
387 CFI_REMEMBER_STATE 397 CFI_REMEMBER_STATE
@@ -390,13 +400,13 @@ restore_nocheck:
390 TRACE_IRQS_IRET 400 TRACE_IRQS_IRET
391restore_nocheck_notrace: 401restore_nocheck_notrace:
392 RESTORE_REGS 402 RESTORE_REGS
393 addl $4, %esp 403 addl $4, %esp # skip orig_eax/error_code
394 CFI_ADJUST_CFA_OFFSET -4 404 CFI_ADJUST_CFA_OFFSET -4
3951: INTERRUPT_RETURN 4051: INTERRUPT_RETURN
396.section .fixup,"ax" 406.section .fixup,"ax"
397iret_exc: 407iret_exc:
398 TRACE_IRQS_ON 408 TRACE_IRQS_ON
399 ENABLE_INTERRUPTS 409 ENABLE_INTERRUPTS(CLBR_NONE)
400 pushl $0 # no error code 410 pushl $0 # no error code
401 pushl $do_iret_error 411 pushl $do_iret_error
402 jmp error_code 412 jmp error_code
@@ -408,33 +418,42 @@ iret_exc:
408 418
409 CFI_RESTORE_STATE 419 CFI_RESTORE_STATE
410ldt_ss: 420ldt_ss:
411 larl OLDSS(%esp), %eax 421 larl PT_OLDSS(%esp), %eax
412 jnz restore_nocheck 422 jnz restore_nocheck
413 testl $0x00400000, %eax # returning to 32bit stack? 423 testl $0x00400000, %eax # returning to 32bit stack?
414 jnz restore_nocheck # allright, normal return 424 jnz restore_nocheck # allright, normal return
425
426#ifdef CONFIG_PARAVIRT
427 /*
428 * The kernel can't run on a non-flat stack if paravirt mode
429 * is active. Rather than try to fixup the high bits of
430 * ESP, bypass this code entirely. This may break DOSemu
431 * and/or Wine support in a paravirt VM, although the option
432 * is still available to implement the setting of the high
433 * 16-bits in the INTERRUPT_RETURN paravirt-op.
434 */
435 cmpl $0, paravirt_ops+PARAVIRT_enabled
436 jne restore_nocheck
437#endif
438
415 /* If returning to userspace with 16bit stack, 439 /* If returning to userspace with 16bit stack,
416 * try to fix the higher word of ESP, as the CPU 440 * try to fix the higher word of ESP, as the CPU
417 * won't restore it. 441 * won't restore it.
418 * This is an "official" bug of all the x86-compatible 442 * This is an "official" bug of all the x86-compatible
419 * CPUs, which we can try to work around to make 443 * CPUs, which we can try to work around to make
420 * dosemu and wine happy. */ 444 * dosemu and wine happy. */
421 subl $8, %esp # reserve space for switch16 pointer 445 movl PT_OLDESP(%esp), %eax
422 CFI_ADJUST_CFA_OFFSET 8 446 movl %esp, %edx
423 DISABLE_INTERRUPTS 447 call patch_espfix_desc
448 pushl $__ESPFIX_SS
449 CFI_ADJUST_CFA_OFFSET 4
450 pushl %eax
451 CFI_ADJUST_CFA_OFFSET 4
452 DISABLE_INTERRUPTS(CLBR_EAX)
424 TRACE_IRQS_OFF 453 TRACE_IRQS_OFF
425 movl %esp, %eax 454 lss (%esp), %esp
426 /* Set up the 16bit stack frame with switch32 pointer on top, 455 CFI_ADJUST_CFA_OFFSET -8
427 * and a switch16 pointer on top of the current frame. */ 456 jmp restore_nocheck
428 call setup_x86_bogus_stack
429 CFI_ADJUST_CFA_OFFSET -8 # frame has moved
430 TRACE_IRQS_IRET
431 RESTORE_REGS
432 lss 20+4(%esp), %esp # switch to 16bit stack
4331: INTERRUPT_RETURN
434.section __ex_table,"a"
435 .align 4
436 .long 1b,iret_exc
437.previous
438 CFI_ENDPROC 457 CFI_ENDPROC
439 458
440 # perform work that needs to be done immediately before resumption 459 # perform work that needs to be done immediately before resumption
@@ -445,7 +464,7 @@ work_pending:
445 jz work_notifysig 464 jz work_notifysig
446work_resched: 465work_resched:
447 call schedule 466 call schedule
448 DISABLE_INTERRUPTS # make sure we don't miss an interrupt 467 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
449 # setting need_resched or sigpending 468 # setting need_resched or sigpending
450 # between sampling and the iret 469 # between sampling and the iret
451 TRACE_IRQS_OFF 470 TRACE_IRQS_OFF
@@ -458,7 +477,8 @@ work_resched:
458 477
459work_notifysig: # deal with pending signals and 478work_notifysig: # deal with pending signals and
460 # notify-resume requests 479 # notify-resume requests
461 testl $VM_MASK, EFLAGS(%esp) 480#ifdef CONFIG_VM86
481 testl $VM_MASK, PT_EFLAGS(%esp)
462 movl %esp, %eax 482 movl %esp, %eax
463 jne work_notifysig_v86 # returning to kernel-space or 483 jne work_notifysig_v86 # returning to kernel-space or
464 # vm86-space 484 # vm86-space
@@ -468,29 +488,30 @@ work_notifysig: # deal with pending signals and
468 488
469 ALIGN 489 ALIGN
470work_notifysig_v86: 490work_notifysig_v86:
471#ifdef CONFIG_VM86
472 pushl %ecx # save ti_flags for do_notify_resume 491 pushl %ecx # save ti_flags for do_notify_resume
473 CFI_ADJUST_CFA_OFFSET 4 492 CFI_ADJUST_CFA_OFFSET 4
474 call save_v86_state # %eax contains pt_regs pointer 493 call save_v86_state # %eax contains pt_regs pointer
475 popl %ecx 494 popl %ecx
476 CFI_ADJUST_CFA_OFFSET -4 495 CFI_ADJUST_CFA_OFFSET -4
477 movl %eax, %esp 496 movl %eax, %esp
497#else
498 movl %esp, %eax
499#endif
478 xorl %edx, %edx 500 xorl %edx, %edx
479 call do_notify_resume 501 call do_notify_resume
480 jmp resume_userspace_sig 502 jmp resume_userspace_sig
481#endif
482 503
483 # perform syscall exit tracing 504 # perform syscall exit tracing
484 ALIGN 505 ALIGN
485syscall_trace_entry: 506syscall_trace_entry:
486 movl $-ENOSYS,EAX(%esp) 507 movl $-ENOSYS,PT_EAX(%esp)
487 movl %esp, %eax 508 movl %esp, %eax
488 xorl %edx,%edx 509 xorl %edx,%edx
489 call do_syscall_trace 510 call do_syscall_trace
490 cmpl $0, %eax 511 cmpl $0, %eax
491 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, 512 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
492 # so must skip actual syscall 513 # so must skip actual syscall
493 movl ORIG_EAX(%esp), %eax 514 movl PT_ORIG_EAX(%esp), %eax
494 cmpl $(nr_syscalls), %eax 515 cmpl $(nr_syscalls), %eax
495 jnae syscall_call 516 jnae syscall_call
496 jmp syscall_exit 517 jmp syscall_exit
@@ -501,7 +522,7 @@ syscall_exit_work:
501 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl 522 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
502 jz work_pending 523 jz work_pending
503 TRACE_IRQS_ON 524 TRACE_IRQS_ON
504 ENABLE_INTERRUPTS # could let do_syscall_trace() call 525 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
505 # schedule() instead 526 # schedule() instead
506 movl %esp, %eax 527 movl %esp, %eax
507 movl $1, %edx 528 movl $1, %edx
@@ -515,39 +536,38 @@ syscall_fault:
515 CFI_ADJUST_CFA_OFFSET 4 536 CFI_ADJUST_CFA_OFFSET 4
516 SAVE_ALL 537 SAVE_ALL
517 GET_THREAD_INFO(%ebp) 538 GET_THREAD_INFO(%ebp)
518 movl $-EFAULT,EAX(%esp) 539 movl $-EFAULT,PT_EAX(%esp)
519 jmp resume_userspace 540 jmp resume_userspace
520 541
521syscall_badsys: 542syscall_badsys:
522 movl $-ENOSYS,EAX(%esp) 543 movl $-ENOSYS,PT_EAX(%esp)
523 jmp resume_userspace 544 jmp resume_userspace
524 CFI_ENDPROC 545 CFI_ENDPROC
525 546
526#define FIXUP_ESPFIX_STACK \ 547#define FIXUP_ESPFIX_STACK \
527 movl %esp, %eax; \ 548 /* since we are on a wrong stack, we cant make it a C code :( */ \
528 /* switch to 32bit stack using the pointer on top of 16bit stack */ \ 549 movl %gs:PDA_cpu, %ebx; \
529 lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \ 550 PER_CPU(cpu_gdt_descr, %ebx); \
530 /* copy data from 16bit stack to 32bit stack */ \ 551 movl GDS_address(%ebx), %ebx; \
531 call fixup_x86_bogus_stack; \ 552 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
532 /* put ESP to the proper location */ \ 553 addl %esp, %eax; \
533 movl %eax, %esp; 554 pushl $__KERNEL_DS; \
534#define UNWIND_ESPFIX_STACK \ 555 CFI_ADJUST_CFA_OFFSET 4; \
535 pushl %eax; \ 556 pushl %eax; \
536 CFI_ADJUST_CFA_OFFSET 4; \ 557 CFI_ADJUST_CFA_OFFSET 4; \
558 lss (%esp), %esp; \
559 CFI_ADJUST_CFA_OFFSET -8;
560#define UNWIND_ESPFIX_STACK \
537 movl %ss, %eax; \ 561 movl %ss, %eax; \
538 /* see if on 16bit stack */ \ 562 /* see if on espfix stack */ \
539 cmpw $__ESPFIX_SS, %ax; \ 563 cmpw $__ESPFIX_SS, %ax; \
540 je 28f; \ 564 jne 27f; \
54127: popl %eax; \ 565 movl $__KERNEL_DS, %eax; \
542 CFI_ADJUST_CFA_OFFSET -4; \
543.section .fixup,"ax"; \
54428: movl $__KERNEL_DS, %eax; \
545 movl %eax, %ds; \ 566 movl %eax, %ds; \
546 movl %eax, %es; \ 567 movl %eax, %es; \
547 /* switch to 32bit stack */ \ 568 /* switch to normal stack */ \
548 FIXUP_ESPFIX_STACK; \ 569 FIXUP_ESPFIX_STACK; \
549 jmp 27b; \ 57027:;
550.previous
551 571
552/* 572/*
553 * Build the entry stubs and pointer table with 573 * Build the entry stubs and pointer table with
@@ -608,13 +628,16 @@ KPROBE_ENTRY(page_fault)
608 CFI_ADJUST_CFA_OFFSET 4 628 CFI_ADJUST_CFA_OFFSET 4
609 ALIGN 629 ALIGN
610error_code: 630error_code:
631 /* the function address is in %gs's slot on the stack */
632 pushl %es
633 CFI_ADJUST_CFA_OFFSET 4
634 /*CFI_REL_OFFSET es, 0*/
611 pushl %ds 635 pushl %ds
612 CFI_ADJUST_CFA_OFFSET 4 636 CFI_ADJUST_CFA_OFFSET 4
613 /*CFI_REL_OFFSET ds, 0*/ 637 /*CFI_REL_OFFSET ds, 0*/
614 pushl %eax 638 pushl %eax
615 CFI_ADJUST_CFA_OFFSET 4 639 CFI_ADJUST_CFA_OFFSET 4
616 CFI_REL_OFFSET eax, 0 640 CFI_REL_OFFSET eax, 0
617 xorl %eax, %eax
618 pushl %ebp 641 pushl %ebp
619 CFI_ADJUST_CFA_OFFSET 4 642 CFI_ADJUST_CFA_OFFSET 4
620 CFI_REL_OFFSET ebp, 0 643 CFI_REL_OFFSET ebp, 0
@@ -627,7 +650,6 @@ error_code:
627 pushl %edx 650 pushl %edx
628 CFI_ADJUST_CFA_OFFSET 4 651 CFI_ADJUST_CFA_OFFSET 4
629 CFI_REL_OFFSET edx, 0 652 CFI_REL_OFFSET edx, 0
630 decl %eax # eax = -1
631 pushl %ecx 653 pushl %ecx
632 CFI_ADJUST_CFA_OFFSET 4 654 CFI_ADJUST_CFA_OFFSET 4
633 CFI_REL_OFFSET ecx, 0 655 CFI_REL_OFFSET ecx, 0
@@ -635,18 +657,20 @@ error_code:
635 CFI_ADJUST_CFA_OFFSET 4 657 CFI_ADJUST_CFA_OFFSET 4
636 CFI_REL_OFFSET ebx, 0 658 CFI_REL_OFFSET ebx, 0
637 cld 659 cld
638 pushl %es 660 pushl %gs
639 CFI_ADJUST_CFA_OFFSET 4 661 CFI_ADJUST_CFA_OFFSET 4
640 /*CFI_REL_OFFSET es, 0*/ 662 /*CFI_REL_OFFSET gs, 0*/
663 movl $(__KERNEL_PDA), %ecx
664 movl %ecx, %gs
641 UNWIND_ESPFIX_STACK 665 UNWIND_ESPFIX_STACK
642 popl %ecx 666 popl %ecx
643 CFI_ADJUST_CFA_OFFSET -4 667 CFI_ADJUST_CFA_OFFSET -4
644 /*CFI_REGISTER es, ecx*/ 668 /*CFI_REGISTER es, ecx*/
645 movl ES(%esp), %edi # get the function address 669 movl PT_GS(%esp), %edi # get the function address
646 movl ORIG_EAX(%esp), %edx # get the error code 670 movl PT_ORIG_EAX(%esp), %edx # get the error code
647 movl %eax, ORIG_EAX(%esp) 671 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
648 movl %ecx, ES(%esp) 672 mov %ecx, PT_GS(%esp)
649 /*CFI_REL_OFFSET es, ES*/ 673 /*CFI_REL_OFFSET gs, ES*/
650 movl $(__USER_DS), %ecx 674 movl $(__USER_DS), %ecx
651 movl %ecx, %ds 675 movl %ecx, %ds
652 movl %ecx, %es 676 movl %ecx, %es
@@ -682,7 +706,7 @@ ENTRY(device_not_available)
682 GET_CR0_INTO_EAX 706 GET_CR0_INTO_EAX
683 testl $0x4, %eax # EM (math emulation bit) 707 testl $0x4, %eax # EM (math emulation bit)
684 jne device_not_available_emulate 708 jne device_not_available_emulate
685 preempt_stop 709 preempt_stop(CLBR_ANY)
686 call math_state_restore 710 call math_state_restore
687 jmp ret_from_exception 711 jmp ret_from_exception
688device_not_available_emulate: 712device_not_available_emulate:
@@ -754,7 +778,7 @@ KPROBE_ENTRY(nmi)
754 cmpw $__ESPFIX_SS, %ax 778 cmpw $__ESPFIX_SS, %ax
755 popl %eax 779 popl %eax
756 CFI_ADJUST_CFA_OFFSET -4 780 CFI_ADJUST_CFA_OFFSET -4
757 je nmi_16bit_stack 781 je nmi_espfix_stack
758 cmpl $sysenter_entry,(%esp) 782 cmpl $sysenter_entry,(%esp)
759 je nmi_stack_fixup 783 je nmi_stack_fixup
760 pushl %eax 784 pushl %eax
@@ -797,7 +821,7 @@ nmi_debug_stack_check:
797 FIX_STACK(24,nmi_stack_correct, 1) 821 FIX_STACK(24,nmi_stack_correct, 1)
798 jmp nmi_stack_correct 822 jmp nmi_stack_correct
799 823
800nmi_16bit_stack: 824nmi_espfix_stack:
801 /* We have a RING0_INT_FRAME here. 825 /* We have a RING0_INT_FRAME here.
802 * 826 *
803 * create the pointer to lss back 827 * create the pointer to lss back
@@ -806,7 +830,6 @@ nmi_16bit_stack:
806 CFI_ADJUST_CFA_OFFSET 4 830 CFI_ADJUST_CFA_OFFSET 4
807 pushl %esp 831 pushl %esp
808 CFI_ADJUST_CFA_OFFSET 4 832 CFI_ADJUST_CFA_OFFSET 4
809 movzwl %sp, %esp
810 addw $4, (%esp) 833 addw $4, (%esp)
811 /* copy the iret frame of 12 bytes */ 834 /* copy the iret frame of 12 bytes */
812 .rept 3 835 .rept 3
@@ -817,11 +840,11 @@ nmi_16bit_stack:
817 CFI_ADJUST_CFA_OFFSET 4 840 CFI_ADJUST_CFA_OFFSET 4
818 SAVE_ALL 841 SAVE_ALL
819 FIXUP_ESPFIX_STACK # %eax == %esp 842 FIXUP_ESPFIX_STACK # %eax == %esp
820 CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
821 xorl %edx,%edx # zero error code 843 xorl %edx,%edx # zero error code
822 call do_nmi 844 call do_nmi
823 RESTORE_REGS 845 RESTORE_REGS
824 lss 12+4(%esp), %esp # back to 16bit stack 846 lss 12+4(%esp), %esp # back to espfix stack
847 CFI_ADJUST_CFA_OFFSET -24
8251: INTERRUPT_RETURN 8481: INTERRUPT_RETURN
826 CFI_ENDPROC 849 CFI_ENDPROC
827.section __ex_table,"a" 850.section __ex_table,"a"
@@ -830,6 +853,19 @@ nmi_16bit_stack:
830.previous 853.previous
831KPROBE_END(nmi) 854KPROBE_END(nmi)
832 855
856#ifdef CONFIG_PARAVIRT
857ENTRY(native_iret)
8581: iret
859.section __ex_table,"a"
860 .align 4
861 .long 1b,iret_exc
862.previous
863
864ENTRY(native_irq_enable_sysexit)
865 sti
866 sysexit
867#endif
868
833KPROBE_ENTRY(int3) 869KPROBE_ENTRY(int3)
834 RING0_INT_FRAME 870 RING0_INT_FRAME
835 pushl $-1 # mark this as an int 871 pushl $-1 # mark this as an int
@@ -949,26 +985,27 @@ ENTRY(arch_unwind_init_running)
949 movl 4(%esp), %edx 985 movl 4(%esp), %edx
950 movl (%esp), %ecx 986 movl (%esp), %ecx
951 leal 4(%esp), %eax 987 leal 4(%esp), %eax
952 movl %ebx, EBX(%edx) 988 movl %ebx, PT_EBX(%edx)
953 xorl %ebx, %ebx 989 xorl %ebx, %ebx
954 movl %ebx, ECX(%edx) 990 movl %ebx, PT_ECX(%edx)
955 movl %ebx, EDX(%edx) 991 movl %ebx, PT_EDX(%edx)
956 movl %esi, ESI(%edx) 992 movl %esi, PT_ESI(%edx)
957 movl %edi, EDI(%edx) 993 movl %edi, PT_EDI(%edx)
958 movl %ebp, EBP(%edx) 994 movl %ebp, PT_EBP(%edx)
959 movl %ebx, EAX(%edx) 995 movl %ebx, PT_EAX(%edx)
960 movl $__USER_DS, DS(%edx) 996 movl $__USER_DS, PT_DS(%edx)
961 movl $__USER_DS, ES(%edx) 997 movl $__USER_DS, PT_ES(%edx)
962 movl %ebx, ORIG_EAX(%edx) 998 movl $0, PT_GS(%edx)
963 movl %ecx, EIP(%edx) 999 movl %ebx, PT_ORIG_EAX(%edx)
1000 movl %ecx, PT_EIP(%edx)
964 movl 12(%esp), %ecx 1001 movl 12(%esp), %ecx
965 movl $__KERNEL_CS, CS(%edx) 1002 movl $__KERNEL_CS, PT_CS(%edx)
966 movl %ebx, EFLAGS(%edx) 1003 movl %ebx, PT_EFLAGS(%edx)
967 movl %eax, OLDESP(%edx) 1004 movl %eax, PT_OLDESP(%edx)
968 movl 8(%esp), %eax 1005 movl 8(%esp), %eax
969 movl %ecx, 8(%esp) 1006 movl %ecx, 8(%esp)
970 movl EBX(%edx), %ebx 1007 movl PT_EBX(%edx), %ebx
971 movl $__KERNEL_DS, OLDSS(%edx) 1008 movl $__KERNEL_DS, PT_OLDSS(%edx)
972 jmpl *%eax 1009 jmpl *%eax
973 CFI_ENDPROC 1010 CFI_ENDPROC
974ENDPROC(arch_unwind_init_running) 1011ENDPROC(arch_unwind_init_running)
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index be9d883c62ce..edef5084ce17 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -55,6 +55,12 @@
55 */ 55 */
56ENTRY(startup_32) 56ENTRY(startup_32)
57 57
58#ifdef CONFIG_PARAVIRT
59 movl %cs, %eax
60 testl $0x3, %eax
61 jnz startup_paravirt
62#endif
63
58/* 64/*
59 * Set segments to known values. 65 * Set segments to known values.
60 */ 66 */
@@ -302,6 +308,7 @@ is386: movl $2,%ecx # set MP
302 movl %eax,%cr0 308 movl %eax,%cr0
303 309
304 call check_x87 310 call check_x87
311 call setup_pda
305 lgdt cpu_gdt_descr 312 lgdt cpu_gdt_descr
306 lidt idt_descr 313 lidt idt_descr
307 ljmp $(__KERNEL_CS),$1f 314 ljmp $(__KERNEL_CS),$1f
@@ -312,12 +319,15 @@ is386: movl $2,%ecx # set MP
312 movl %eax,%ds 319 movl %eax,%ds
313 movl %eax,%es 320 movl %eax,%es
314 321
315 xorl %eax,%eax # Clear FS/GS and LDT 322 xorl %eax,%eax # Clear FS and LDT
316 movl %eax,%fs 323 movl %eax,%fs
317 movl %eax,%gs
318 lldt %ax 324 lldt %ax
325
326 movl $(__KERNEL_PDA),%eax
327 mov %eax,%gs
328
319 cld # gcc2 wants the direction flag cleared at all times 329 cld # gcc2 wants the direction flag cleared at all times
320 pushl %eax # fake return address 330 pushl $0 # fake return address for unwinder
321#ifdef CONFIG_SMP 331#ifdef CONFIG_SMP
322 movb ready, %cl 332 movb ready, %cl
323 movb $1, ready 333 movb $1, ready
@@ -346,6 +356,23 @@ check_x87:
346 ret 356 ret
347 357
348/* 358/*
359 * Point the GDT at this CPU's PDA. On boot this will be
360 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
361 * that CPU's GDT and PDA.
362 */
363setup_pda:
364 /* get the PDA pointer */
365 movl start_pda, %eax
366
367 /* slot the PDA address into the GDT */
368 mov cpu_gdt_descr+2, %ecx
369 mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
370 shr $16, %eax
371 mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
372 mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
373 ret
374
375/*
349 * setup_idt 376 * setup_idt
350 * 377 *
351 * sets up a idt with 256 entries pointing to 378 * sets up a idt with 256 entries pointing to
@@ -465,6 +492,33 @@ ignore_int:
465#endif 492#endif
466 iret 493 iret
467 494
495#ifdef CONFIG_PARAVIRT
496startup_paravirt:
497 cld
498 movl $(init_thread_union+THREAD_SIZE),%esp
499
500 /* We take pains to preserve all the regs. */
501 pushl %edx
502 pushl %ecx
503 pushl %eax
504
505 /* paravirt.o is last in link, and that probe fn never returns */
506 pushl $__start_paravirtprobe
5071:
508 movl 0(%esp), %eax
509 pushl (%eax)
510 movl 8(%esp), %eax
511 call *(%esp)
512 popl %eax
513
514 movl 4(%esp), %eax
515 movl 8(%esp), %ecx
516 movl 12(%esp), %edx
517
518 addl $4, (%esp)
519 jmp 1b
520#endif
521
468/* 522/*
469 * Real beginning of normal "text" segment 523 * Real beginning of normal "text" segment
470 */ 524 */
@@ -484,6 +538,8 @@ ENTRY(empty_zero_page)
484 * This starts the data section. 538 * This starts the data section.
485 */ 539 */
486.data 540.data
541ENTRY(start_pda)
542 .long boot_pda
487 543
488ENTRY(stack_start) 544ENTRY(stack_start)
489 .long init_thread_union+THREAD_SIZE 545 .long init_thread_union+THREAD_SIZE
@@ -525,7 +581,7 @@ idt_descr:
525 581
526# boot GDT descriptor (later on used by CPU#0): 582# boot GDT descriptor (later on used by CPU#0):
527 .word 0 # 32 bit align gdt_desc.address 583 .word 0 # 32 bit align gdt_desc.address
528cpu_gdt_descr: 584ENTRY(cpu_gdt_descr)
529 .word GDT_ENTRIES*8-1 585 .word GDT_ENTRIES*8-1
530 .long cpu_gdt_table 586 .long cpu_gdt_table
531 587
@@ -584,8 +640,8 @@ ENTRY(cpu_gdt_table)
584 .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */ 640 .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */
585 .quad 0x004092000000ffff /* 0xc8 APM DS data */ 641 .quad 0x004092000000ffff /* 0xc8 APM DS data */
586 642
587 .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */ 643 .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */
588 .quad 0x0000000000000000 /* 0xd8 - unused */ 644 .quad 0x00cf92000000ffff /* 0xd8 - PDA */
589 .quad 0x0000000000000000 /* 0xe0 - unused */ 645 .quad 0x0000000000000000 /* 0xe0 - unused */
590 .quad 0x0000000000000000 /* 0xe8 - unused */ 646 .quad 0x0000000000000000 /* 0xe8 - unused */
591 .quad 0x0000000000000000 /* 0xf0 - unused */ 647 .quad 0x0000000000000000 /* 0xf0 - unused */
diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
index 17647a530b2f..45a8685bb60b 100644
--- a/arch/i386/kernel/hpet.c
+++ b/arch/i386/kernel/hpet.c
@@ -34,6 +34,7 @@ static int __init init_hpet_clocksource(void)
34 unsigned long hpet_period; 34 unsigned long hpet_period;
35 void __iomem* hpet_base; 35 void __iomem* hpet_base;
36 u64 tmp; 36 u64 tmp;
37 int err;
37 38
38 if (!is_hpet_enabled()) 39 if (!is_hpet_enabled())
39 return -ENODEV; 40 return -ENODEV;
@@ -61,7 +62,11 @@ static int __init init_hpet_clocksource(void)
61 do_div(tmp, FSEC_PER_NSEC); 62 do_div(tmp, FSEC_PER_NSEC);
62 clocksource_hpet.mult = (u32)tmp; 63 clocksource_hpet.mult = (u32)tmp;
63 64
64 return clocksource_register(&clocksource_hpet); 65 err = clocksource_register(&clocksource_hpet);
66 if (err)
67 iounmap(hpet_base);
68
69 return err;
65} 70}
66 71
67module_init(init_hpet_clocksource); 72module_init(init_hpet_clocksource);
diff --git a/arch/i386/kernel/i8253.c b/arch/i386/kernel/i8253.c
index 477b24daff53..9a0060b92e32 100644
--- a/arch/i386/kernel/i8253.c
+++ b/arch/i386/kernel/i8253.c
@@ -109,7 +109,7 @@ static struct clocksource clocksource_pit = {
109 109
110static int __init init_pit_clocksource(void) 110static int __init init_pit_clocksource(void)
111{ 111{
112 if (num_possible_cpus() > 4) /* PIT does not scale! */ 112 if (num_possible_cpus() > 1) /* PIT does not scale! */
113 return 0; 113 return 0;
114 114
115 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); 115 clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index d53eafb6daa7..c8d45821c788 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -113,7 +113,8 @@ void make_8259A_irq(unsigned int irq)
113{ 113{
114 disable_irq_nosync(irq); 114 disable_irq_nosync(irq);
115 io_apic_irqs &= ~(1<<irq); 115 io_apic_irqs &= ~(1<<irq);
116 set_irq_chip_and_handler(irq, &i8259A_chip, handle_level_irq); 116 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
117 "XT");
117 enable_irq(irq); 118 enable_irq(irq);
118} 119}
119 120
@@ -369,8 +370,8 @@ void __init init_ISA_irqs (void)
369 /* 370 /*
370 * 16 old-style INTA-cycle interrupts: 371 * 16 old-style INTA-cycle interrupts:
371 */ 372 */
372 set_irq_chip_and_handler(i, &i8259A_chip, 373 set_irq_chip_and_handler_name(i, &i8259A_chip,
373 handle_level_irq); 374 handle_level_irq, "XT");
374 } else { 375 } else {
375 /* 376 /*
376 * 'high' PCI IRQs filled in on demand 377 * 'high' PCI IRQs filled in on demand
@@ -380,7 +381,10 @@ void __init init_ISA_irqs (void)
380 } 381 }
381} 382}
382 383
383void __init init_IRQ(void) 384/* Overridden in paravirt.c */
385void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
386
387void __init native_init_IRQ(void)
384{ 388{
385 int i; 389 int i;
386 390
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 27bceaf5ce40..2424cc9c7b3d 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -34,6 +34,7 @@
34#include <linux/pci.h> 34#include <linux/pci.h>
35#include <linux/msi.h> 35#include <linux/msi.h>
36#include <linux/htirq.h> 36#include <linux/htirq.h>
37#include <linux/freezer.h>
37 38
38#include <asm/io.h> 39#include <asm/io.h>
39#include <asm/smp.h> 40#include <asm/smp.h>
@@ -91,6 +92,46 @@ static struct irq_pin_list {
91 int apic, pin, next; 92 int apic, pin, next;
92} irq_2_pin[PIN_MAP_SIZE]; 93} irq_2_pin[PIN_MAP_SIZE];
93 94
95struct io_apic {
96 unsigned int index;
97 unsigned int unused[3];
98 unsigned int data;
99};
100
101static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
102{
103 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
104 + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK);
105}
106
107static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
108{
109 struct io_apic __iomem *io_apic = io_apic_base(apic);
110 writel(reg, &io_apic->index);
111 return readl(&io_apic->data);
112}
113
114static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
115{
116 struct io_apic __iomem *io_apic = io_apic_base(apic);
117 writel(reg, &io_apic->index);
118 writel(value, &io_apic->data);
119}
120
121/*
122 * Re-write a value: to be used for read-modify-write
123 * cycles where the read already set up the index register.
124 *
125 * Older SiS APIC requires we rewrite the index register
126 */
127static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
128{
129 volatile struct io_apic *io_apic = io_apic_base(apic);
130 if (sis_apic_bug)
131 writel(reg, &io_apic->index);
132 writel(value, &io_apic->data);
133}
134
94union entry_union { 135union entry_union {
95 struct { u32 w1, w2; }; 136 struct { u32 w1, w2; };
96 struct IO_APIC_route_entry entry; 137 struct IO_APIC_route_entry entry;
@@ -107,11 +148,39 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
107 return eu.entry; 148 return eu.entry;
108} 149}
109 150
110static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 151/*
152 * When we write a new IO APIC routing entry, we need to write the high
153 * word first! If the mask bit in the low word is clear, we will enable
154 * the interrupt, and we need to make sure the entry is fully populated
155 * before that happens.
156 */
157static void
158__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
111{ 159{
112 unsigned long flags;
113 union entry_union eu; 160 union entry_union eu;
114 eu.entry = e; 161 eu.entry = e;
162 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
163 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
164}
165
166static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
167{
168 unsigned long flags;
169 spin_lock_irqsave(&ioapic_lock, flags);
170 __ioapic_write_entry(apic, pin, e);
171 spin_unlock_irqrestore(&ioapic_lock, flags);
172}
173
174/*
175 * When we mask an IO APIC routing entry, we need to write the low
176 * word first, in order to set the mask bit before we change the
177 * high bits!
178 */
179static void ioapic_mask_entry(int apic, int pin)
180{
181 unsigned long flags;
182 union entry_union eu = { .entry.mask = 1 };
183
115 spin_lock_irqsave(&ioapic_lock, flags); 184 spin_lock_irqsave(&ioapic_lock, flags);
116 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 185 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
117 io_apic_write(apic, 0x11 + 2*pin, eu.w2); 186 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
@@ -234,9 +303,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
234 /* 303 /*
235 * Disable it in the IO-APIC irq-routing table: 304 * Disable it in the IO-APIC irq-routing table:
236 */ 305 */
237 memset(&entry, 0, sizeof(entry)); 306 ioapic_mask_entry(apic, pin);
238 entry.mask = 1;
239 ioapic_write_entry(apic, pin, entry);
240} 307}
241 308
242static void clear_IO_APIC (void) 309static void clear_IO_APIC (void)
@@ -776,8 +843,7 @@ static int __init find_isa_irq_pin(int irq, int type)
776 843
777 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || 844 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
778 mp_bus_id_to_type[lbus] == MP_BUS_EISA || 845 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
779 mp_bus_id_to_type[lbus] == MP_BUS_MCA || 846 mp_bus_id_to_type[lbus] == MP_BUS_MCA
780 mp_bus_id_to_type[lbus] == MP_BUS_NEC98
781 ) && 847 ) &&
782 (mp_irqs[i].mpc_irqtype == type) && 848 (mp_irqs[i].mpc_irqtype == type) &&
783 (mp_irqs[i].mpc_srcbusirq == irq)) 849 (mp_irqs[i].mpc_srcbusirq == irq))
@@ -796,8 +862,7 @@ static int __init find_isa_irq_apic(int irq, int type)
796 862
797 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || 863 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
798 mp_bus_id_to_type[lbus] == MP_BUS_EISA || 864 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
799 mp_bus_id_to_type[lbus] == MP_BUS_MCA || 865 mp_bus_id_to_type[lbus] == MP_BUS_MCA
800 mp_bus_id_to_type[lbus] == MP_BUS_NEC98
801 ) && 866 ) &&
802 (mp_irqs[i].mpc_irqtype == type) && 867 (mp_irqs[i].mpc_irqtype == type) &&
803 (mp_irqs[i].mpc_srcbusirq == irq)) 868 (mp_irqs[i].mpc_srcbusirq == irq))
@@ -927,12 +992,6 @@ static int EISA_ELCR(unsigned int irq)
927#define default_MCA_trigger(idx) (1) 992#define default_MCA_trigger(idx) (1)
928#define default_MCA_polarity(idx) (0) 993#define default_MCA_polarity(idx) (0)
929 994
930/* NEC98 interrupts are always polarity zero edge triggered,
931 * when listed as conforming in the MP table. */
932
933#define default_NEC98_trigger(idx) (0)
934#define default_NEC98_polarity(idx) (0)
935
936static int __init MPBIOS_polarity(int idx) 995static int __init MPBIOS_polarity(int idx)
937{ 996{
938 int bus = mp_irqs[idx].mpc_srcbus; 997 int bus = mp_irqs[idx].mpc_srcbus;
@@ -967,11 +1026,6 @@ static int __init MPBIOS_polarity(int idx)
967 polarity = default_MCA_polarity(idx); 1026 polarity = default_MCA_polarity(idx);
968 break; 1027 break;
969 } 1028 }
970 case MP_BUS_NEC98: /* NEC 98 pin */
971 {
972 polarity = default_NEC98_polarity(idx);
973 break;
974 }
975 default: 1029 default:
976 { 1030 {
977 printk(KERN_WARNING "broken BIOS!!\n"); 1031 printk(KERN_WARNING "broken BIOS!!\n");
@@ -1041,11 +1095,6 @@ static int MPBIOS_trigger(int idx)
1041 trigger = default_MCA_trigger(idx); 1095 trigger = default_MCA_trigger(idx);
1042 break; 1096 break;
1043 } 1097 }
1044 case MP_BUS_NEC98: /* NEC 98 pin */
1045 {
1046 trigger = default_NEC98_trigger(idx);
1047 break;
1048 }
1049 default: 1098 default:
1050 { 1099 {
1051 printk(KERN_WARNING "broken BIOS!!\n"); 1100 printk(KERN_WARNING "broken BIOS!!\n");
@@ -1107,7 +1156,6 @@ static int pin_2_irq(int idx, int apic, int pin)
1107 case MP_BUS_ISA: /* ISA pin */ 1156 case MP_BUS_ISA: /* ISA pin */
1108 case MP_BUS_EISA: 1157 case MP_BUS_EISA:
1109 case MP_BUS_MCA: 1158 case MP_BUS_MCA:
1110 case MP_BUS_NEC98:
1111 { 1159 {
1112 irq = mp_irqs[idx].mpc_srcbusirq; 1160 irq = mp_irqs[idx].mpc_srcbusirq;
1113 break; 1161 break;
@@ -1175,7 +1223,7 @@ static inline int IO_APIC_irq_trigger(int irq)
1175} 1223}
1176 1224
1177/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ 1225/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
1178u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; 1226static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
1179 1227
1180static int __assign_irq_vector(int irq) 1228static int __assign_irq_vector(int irq)
1181{ 1229{
@@ -1225,11 +1273,13 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1225{ 1273{
1226 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1274 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1227 trigger == IOAPIC_LEVEL) 1275 trigger == IOAPIC_LEVEL)
1228 set_irq_chip_and_handler(irq, &ioapic_chip, 1276 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1229 handle_fasteoi_irq); 1277 handle_fasteoi_irq, "fasteoi");
1230 else 1278 else {
1231 set_irq_chip_and_handler(irq, &ioapic_chip, 1279 irq_desc[irq].status |= IRQ_DELAYED_DISABLE;
1232 handle_edge_irq); 1280 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1281 handle_edge_irq, "edge");
1282 }
1233 set_intr_gate(vector, interrupt[irq]); 1283 set_intr_gate(vector, interrupt[irq]);
1234} 1284}
1235 1285
@@ -1298,8 +1348,8 @@ static void __init setup_IO_APIC_irqs(void)
1298 if (!apic && (irq < 16)) 1348 if (!apic && (irq < 16))
1299 disable_8259A_irq(irq); 1349 disable_8259A_irq(irq);
1300 } 1350 }
1301 ioapic_write_entry(apic, pin, entry);
1302 spin_lock_irqsave(&ioapic_lock, flags); 1351 spin_lock_irqsave(&ioapic_lock, flags);
1352 __ioapic_write_entry(apic, pin, entry);
1303 set_native_irq_info(irq, TARGET_CPUS); 1353 set_native_irq_info(irq, TARGET_CPUS);
1304 spin_unlock_irqrestore(&ioapic_lock, flags); 1354 spin_unlock_irqrestore(&ioapic_lock, flags);
1305 } 1355 }
@@ -1864,6 +1914,15 @@ static void __init setup_ioapic_ids_from_mpc(void)
1864static void __init setup_ioapic_ids_from_mpc(void) { } 1914static void __init setup_ioapic_ids_from_mpc(void) { }
1865#endif 1915#endif
1866 1916
1917static int no_timer_check __initdata;
1918
1919static int __init notimercheck(char *s)
1920{
1921 no_timer_check = 1;
1922 return 1;
1923}
1924__setup("no_timer_check", notimercheck);
1925
1867/* 1926/*
1868 * There is a nasty bug in some older SMP boards, their mptable lies 1927 * There is a nasty bug in some older SMP boards, their mptable lies
1869 * about the timer IRQ. We do the following to work around the situation: 1928 * about the timer IRQ. We do the following to work around the situation:
@@ -1872,10 +1931,13 @@ static void __init setup_ioapic_ids_from_mpc(void) { }
1872 * - if this function detects that timer IRQs are defunct, then we fall 1931 * - if this function detects that timer IRQs are defunct, then we fall
1873 * back to ISA timer IRQs 1932 * back to ISA timer IRQs
1874 */ 1933 */
1875static int __init timer_irq_works(void) 1934int __init timer_irq_works(void)
1876{ 1935{
1877 unsigned long t1 = jiffies; 1936 unsigned long t1 = jiffies;
1878 1937
1938 if (no_timer_check)
1939 return 1;
1940
1879 local_irq_enable(); 1941 local_irq_enable();
1880 /* Let ten ticks pass... */ 1942 /* Let ten ticks pass... */
1881 mdelay((10 * 1000) / HZ); 1943 mdelay((10 * 1000) / HZ);
@@ -2099,9 +2161,15 @@ static inline void unlock_ExtINT_logic(void)
2099 unsigned char save_control, save_freq_select; 2161 unsigned char save_control, save_freq_select;
2100 2162
2101 pin = find_isa_irq_pin(8, mp_INT); 2163 pin = find_isa_irq_pin(8, mp_INT);
2164 if (pin == -1) {
2165 WARN_ON_ONCE(1);
2166 return;
2167 }
2102 apic = find_isa_irq_apic(8, mp_INT); 2168 apic = find_isa_irq_apic(8, mp_INT);
2103 if (pin == -1) 2169 if (apic == -1) {
2170 WARN_ON_ONCE(1);
2104 return; 2171 return;
2172 }
2105 2173
2106 entry0 = ioapic_read_entry(apic, pin); 2174 entry0 = ioapic_read_entry(apic, pin);
2107 clear_IO_APIC_pin(apic, pin); 2175 clear_IO_APIC_pin(apic, pin);
@@ -2146,7 +2214,7 @@ int timer_uses_ioapic_pin_0;
2146 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast 2214 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2147 * fanatically on his truly buggy board. 2215 * fanatically on his truly buggy board.
2148 */ 2216 */
2149static inline void check_timer(void) 2217static inline void __init check_timer(void)
2150{ 2218{
2151 int apic1, pin1, apic2, pin2; 2219 int apic1, pin1, apic2, pin2;
2152 int vector; 2220 int vector;
@@ -2235,7 +2303,8 @@ static inline void check_timer(void)
2235 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 2303 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
2236 2304
2237 disable_8259A_irq(0); 2305 disable_8259A_irq(0);
2238 set_irq_chip_and_handler(0, &lapic_chip, handle_fasteoi_irq); 2306 set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
2307 "fasteio");
2239 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ 2308 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2240 enable_8259A_irq(0); 2309 enable_8259A_irq(0);
2241 2310
@@ -2416,7 +2485,7 @@ device_initcall(ioapic_init_sysfs);
2416int create_irq(void) 2485int create_irq(void)
2417{ 2486{
2418 /* Allocate an unused irq */ 2487 /* Allocate an unused irq */
2419 int irq, new, vector; 2488 int irq, new, vector = 0;
2420 unsigned long flags; 2489 unsigned long flags;
2421 2490
2422 irq = -ENOSPC; 2491 irq = -ENOSPC;
@@ -2541,7 +2610,8 @@ int arch_setup_msi_irq(unsigned int irq, struct pci_dev *dev)
2541 2610
2542 write_msi_msg(irq, &msg); 2611 write_msi_msg(irq, &msg);
2543 2612
2544 set_irq_chip_and_handler(irq, &msi_chip, handle_edge_irq); 2613 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
2614 "edge");
2545 2615
2546 return 0; 2616 return 0;
2547} 2617}
@@ -2562,18 +2632,16 @@ void arch_teardown_msi_irq(unsigned int irq)
2562 2632
2563static void target_ht_irq(unsigned int irq, unsigned int dest) 2633static void target_ht_irq(unsigned int irq, unsigned int dest)
2564{ 2634{
2565 u32 low, high; 2635 struct ht_irq_msg msg;
2566 low = read_ht_irq_low(irq); 2636 fetch_ht_irq_msg(irq, &msg);
2567 high = read_ht_irq_high(irq);
2568 2637
2569 low &= ~(HT_IRQ_LOW_DEST_ID_MASK); 2638 msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
2570 high &= ~(HT_IRQ_HIGH_DEST_ID_MASK); 2639 msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
2571 2640
2572 low |= HT_IRQ_LOW_DEST_ID(dest); 2641 msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
2573 high |= HT_IRQ_HIGH_DEST_ID(dest); 2642 msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
2574 2643
2575 write_ht_irq_low(irq, low); 2644 write_ht_irq_msg(irq, &msg);
2576 write_ht_irq_high(irq, high);
2577} 2645}
2578 2646
2579static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) 2647static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
@@ -2611,7 +2679,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2611 2679
2612 vector = assign_irq_vector(irq); 2680 vector = assign_irq_vector(irq);
2613 if (vector >= 0) { 2681 if (vector >= 0) {
2614 u32 low, high; 2682 struct ht_irq_msg msg;
2615 unsigned dest; 2683 unsigned dest;
2616 cpumask_t tmp; 2684 cpumask_t tmp;
2617 2685
@@ -2619,9 +2687,10 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2619 cpu_set(vector >> 8, tmp); 2687 cpu_set(vector >> 8, tmp);
2620 dest = cpu_mask_to_apicid(tmp); 2688 dest = cpu_mask_to_apicid(tmp);
2621 2689
2622 high = HT_IRQ_HIGH_DEST_ID(dest); 2690 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
2623 2691
2624 low = HT_IRQ_LOW_BASE | 2692 msg.address_lo =
2693 HT_IRQ_LOW_BASE |
2625 HT_IRQ_LOW_DEST_ID(dest) | 2694 HT_IRQ_LOW_DEST_ID(dest) |
2626 HT_IRQ_LOW_VECTOR(vector) | 2695 HT_IRQ_LOW_VECTOR(vector) |
2627 ((INT_DEST_MODE == 0) ? 2696 ((INT_DEST_MODE == 0) ?
@@ -2633,10 +2702,10 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2633 HT_IRQ_LOW_MT_ARBITRATED) | 2702 HT_IRQ_LOW_MT_ARBITRATED) |
2634 HT_IRQ_LOW_IRQ_MASKED; 2703 HT_IRQ_LOW_IRQ_MASKED;
2635 2704
2636 write_ht_irq_low(irq, low); 2705 write_ht_irq_msg(irq, &msg);
2637 write_ht_irq_high(irq, high);
2638 2706
2639 set_irq_chip_and_handler(irq, &ht_irq_chip, handle_edge_irq); 2707 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2708 handle_edge_irq, "edge");
2640 } 2709 }
2641 return vector; 2710 return vector;
2642} 2711}
@@ -2793,8 +2862,8 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
2793 if (!ioapic && (irq < 16)) 2862 if (!ioapic && (irq < 16))
2794 disable_8259A_irq(irq); 2863 disable_8259A_irq(irq);
2795 2864
2796 ioapic_write_entry(ioapic, pin, entry);
2797 spin_lock_irqsave(&ioapic_lock, flags); 2865 spin_lock_irqsave(&ioapic_lock, flags);
2866 __ioapic_write_entry(ioapic, pin, entry);
2798 set_native_irq_info(irq, TARGET_CPUS); 2867 set_native_irq_info(irq, TARGET_CPUS);
2799 spin_unlock_irqrestore(&ioapic_lock, flags); 2868 spin_unlock_irqrestore(&ioapic_lock, flags);
2800 2869
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 8cfc7dbec7b9..3201d421090a 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -258,7 +258,7 @@ int show_interrupts(struct seq_file *p, void *v)
258 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); 258 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
259#endif 259#endif
260 seq_printf(p, " %8s", irq_desc[i].chip->name); 260 seq_printf(p, " %8s", irq_desc[i].chip->name);
261 seq_printf(p, "-%s", handle_irq_name(irq_desc[i].handle_irq)); 261 seq_printf(p, "-%-8s", irq_desc[i].name);
262 seq_printf(p, " %s", action->name); 262 seq_printf(p, " %s", action->name);
263 263
264 for (action=action->next; action; action = action->next) 264 for (action=action->next; action; action = action->next)
diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index d98e44b16fe2..af1d53344993 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -184,7 +184,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
184void __kprobes arch_remove_kprobe(struct kprobe *p) 184void __kprobes arch_remove_kprobe(struct kprobe *p)
185{ 185{
186 mutex_lock(&kprobe_mutex); 186 mutex_lock(&kprobe_mutex);
187 free_insn_slot(p->ainsn.insn); 187 free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
188 mutex_unlock(&kprobe_mutex); 188 mutex_unlock(&kprobe_mutex);
189} 189}
190 190
@@ -333,7 +333,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
333 return 1; 333 return 1;
334 334
335ss_probe: 335ss_probe:
336#ifndef CONFIG_PREEMPT 336#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
337 if (p->ainsn.boostable == 1 && !p->post_handler){ 337 if (p->ainsn.boostable == 1 && !p->post_handler){
338 /* Boost up -- we can execute copied instructions directly */ 338 /* Boost up -- we can execute copied instructions directly */
339 reset_current_kprobe(); 339 reset_current_kprobe();
@@ -361,8 +361,11 @@ no_kprobe:
361 asm volatile ( ".global kretprobe_trampoline\n" 361 asm volatile ( ".global kretprobe_trampoline\n"
362 "kretprobe_trampoline: \n" 362 "kretprobe_trampoline: \n"
363 " pushf\n" 363 " pushf\n"
364 /* skip cs, eip, orig_eax, es, ds */ 364 /* skip cs, eip, orig_eax */
365 " subl $20, %esp\n" 365 " subl $12, %esp\n"
366 " pushl %gs\n"
367 " pushl %ds\n"
368 " pushl %es\n"
366 " pushl %eax\n" 369 " pushl %eax\n"
367 " pushl %ebp\n" 370 " pushl %ebp\n"
368 " pushl %edi\n" 371 " pushl %edi\n"
@@ -373,10 +376,10 @@ no_kprobe:
373 " movl %esp, %eax\n" 376 " movl %esp, %eax\n"
374 " call trampoline_handler\n" 377 " call trampoline_handler\n"
375 /* move eflags to cs */ 378 /* move eflags to cs */
376 " movl 48(%esp), %edx\n" 379 " movl 52(%esp), %edx\n"
377 " movl %edx, 44(%esp)\n" 380 " movl %edx, 48(%esp)\n"
378 /* save true return address on eflags */ 381 /* save true return address on eflags */
379 " movl %eax, 48(%esp)\n" 382 " movl %eax, 52(%esp)\n"
380 " popl %ebx\n" 383 " popl %ebx\n"
381 " popl %ecx\n" 384 " popl %ecx\n"
382 " popl %edx\n" 385 " popl %edx\n"
@@ -384,8 +387,8 @@ no_kprobe:
384 " popl %edi\n" 387 " popl %edi\n"
385 " popl %ebp\n" 388 " popl %ebp\n"
386 " popl %eax\n" 389 " popl %eax\n"
387 /* skip eip, orig_eax, es, ds */ 390 /* skip eip, orig_eax, es, ds, gs */
388 " addl $16, %esp\n" 391 " addl $20, %esp\n"
389 " popf\n" 392 " popf\n"
390 " ret\n"); 393 " ret\n");
391} 394}
@@ -404,6 +407,10 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
404 INIT_HLIST_HEAD(&empty_rp); 407 INIT_HLIST_HEAD(&empty_rp);
405 spin_lock_irqsave(&kretprobe_lock, flags); 408 spin_lock_irqsave(&kretprobe_lock, flags);
406 head = kretprobe_inst_table_head(current); 409 head = kretprobe_inst_table_head(current);
410 /* fixup registers */
411 regs->xcs = __KERNEL_CS;
412 regs->eip = trampoline_address;
413 regs->orig_eax = 0xffffffff;
407 414
408 /* 415 /*
409 * It is possible to have multiple instances associated with a given 416 * It is possible to have multiple instances associated with a given
@@ -425,6 +432,7 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
425 432
426 if (ri->rp && ri->rp->handler){ 433 if (ri->rp && ri->rp->handler){
427 __get_cpu_var(current_kprobe) = &ri->rp->kp; 434 __get_cpu_var(current_kprobe) = &ri->rp->kp;
435 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
428 ri->rp->handler(ri, regs); 436 ri->rp->handler(ri, regs);
429 __get_cpu_var(current_kprobe) = NULL; 437 __get_cpu_var(current_kprobe) = NULL;
430 } 438 }
diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c
index 445211eb2d57..b410e5fb034f 100644
--- a/arch/i386/kernel/ldt.c
+++ b/arch/i386/kernel/ldt.c
@@ -160,16 +160,14 @@ static int read_default_ldt(void __user * ptr, unsigned long bytecount)
160{ 160{
161 int err; 161 int err;
162 unsigned long size; 162 unsigned long size;
163 void *address;
164 163
165 err = 0; 164 err = 0;
166 address = &default_ldt[0];
167 size = 5*sizeof(struct desc_struct); 165 size = 5*sizeof(struct desc_struct);
168 if (size > bytecount) 166 if (size > bytecount)
169 size = bytecount; 167 size = bytecount;
170 168
171 err = size; 169 err = size;
172 if (copy_to_user(ptr, address, size)) 170 if (clear_user(ptr, size))
173 err = -EFAULT; 171 err = -EFAULT;
174 172
175 return err; 173 return err;
diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c
index eb57a851789d..b83672b89527 100644
--- a/arch/i386/kernel/mca.c
+++ b/arch/i386/kernel/mca.c
@@ -283,10 +283,9 @@ static int __init mca_init(void)
283 bus->f.mca_transform_memory = mca_dummy_transform_memory; 283 bus->f.mca_transform_memory = mca_dummy_transform_memory;
284 284
285 /* get the motherboard device */ 285 /* get the motherboard device */
286 mca_dev = kmalloc(sizeof(struct mca_device), GFP_KERNEL); 286 mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
287 if(unlikely(!mca_dev)) 287 if(unlikely(!mca_dev))
288 goto out_nomem; 288 goto out_nomem;
289 memset(mca_dev, 0, sizeof(struct mca_device));
290 289
291 /* 290 /*
292 * We do not expect many MCA interrupts during initialization, 291 * We do not expect many MCA interrupts during initialization,
@@ -310,11 +309,9 @@ static int __init mca_init(void)
310 mca_dev->slot = MCA_MOTHERBOARD; 309 mca_dev->slot = MCA_MOTHERBOARD;
311 mca_register_device(MCA_PRIMARY_BUS, mca_dev); 310 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
312 311
313 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); 312 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
314 if(unlikely(!mca_dev)) 313 if(unlikely(!mca_dev))
315 goto out_unlock_nomem; 314 goto out_unlock_nomem;
316 memset(mca_dev, 0, sizeof(struct mca_device));
317
318 315
319 /* Put motherboard into video setup mode, read integrated video 316 /* Put motherboard into video setup mode, read integrated video
320 * POS registers, and turn motherboard setup off. 317 * POS registers, and turn motherboard setup off.
@@ -349,10 +346,9 @@ static int __init mca_init(void)
349 } 346 }
350 if(which_scsi) { 347 if(which_scsi) {
351 /* found a scsi card */ 348 /* found a scsi card */
352 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); 349 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
353 if(unlikely(!mca_dev)) 350 if(unlikely(!mca_dev))
354 goto out_unlock_nomem; 351 goto out_unlock_nomem;
355 memset(mca_dev, 0, sizeof(struct mca_device));
356 352
357 for(j = 0; j < 8; j++) 353 for(j = 0; j < 8; j++)
358 mca_dev->pos[j] = pos[j]; 354 mca_dev->pos[j] = pos[j];
@@ -378,10 +374,9 @@ static int __init mca_init(void)
378 if(!mca_read_and_store_pos(pos)) 374 if(!mca_read_and_store_pos(pos))
379 continue; 375 continue;
380 376
381 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); 377 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
382 if(unlikely(!mca_dev)) 378 if(unlikely(!mca_dev))
383 goto out_unlock_nomem; 379 goto out_unlock_nomem;
384 memset(mca_dev, 0, sizeof(struct mca_device));
385 380
386 for(j=0; j<8; j++) 381 for(j=0; j<8; j++)
387 mca_dev->pos[j]=pos[j]; 382 mca_dev->pos[j]=pos[j];
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index c4d0291b519f..972346604f9d 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -577,7 +577,7 @@ static void microcode_init_cpu(int cpu)
577 set_cpus_allowed(current, cpumask_of_cpu(cpu)); 577 set_cpus_allowed(current, cpumask_of_cpu(cpu));
578 mutex_lock(&microcode_mutex); 578 mutex_lock(&microcode_mutex);
579 collect_cpu_info(cpu); 579 collect_cpu_info(cpu);
580 if (uci->valid) 580 if (uci->valid && system_state == SYSTEM_RUNNING)
581 cpu_request_microcode(cpu); 581 cpu_request_microcode(cpu);
582 mutex_unlock(&microcode_mutex); 582 mutex_unlock(&microcode_mutex);
583 set_cpus_allowed(current, old); 583 set_cpus_allowed(current, old);
@@ -703,7 +703,6 @@ static struct sysdev_driver mc_sysdev_driver = {
703 .resume = mc_sysdev_resume, 703 .resume = mc_sysdev_resume,
704}; 704};
705 705
706#ifdef CONFIG_HOTPLUG_CPU
707static __cpuinit int 706static __cpuinit int
708mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) 707mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
709{ 708{
@@ -726,7 +725,6 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
726static struct notifier_block mc_cpu_notifier = { 725static struct notifier_block mc_cpu_notifier = {
727 .notifier_call = mc_cpu_callback, 726 .notifier_call = mc_cpu_callback,
728}; 727};
729#endif
730 728
731static int __init microcode_init (void) 729static int __init microcode_init (void)
732{ 730{
diff --git a/arch/i386/kernel/module.c b/arch/i386/kernel/module.c
index 470cf97e7cd3..3db0a5442eb1 100644
--- a/arch/i386/kernel/module.c
+++ b/arch/i386/kernel/module.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/string.h> 22#include <linux/string.h>
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/bug.h>
24 25
25#if 0 26#if 0
26#define DEBUGP printk 27#define DEBUGP printk
@@ -108,7 +109,8 @@ int module_finalize(const Elf_Ehdr *hdr,
108 const Elf_Shdr *sechdrs, 109 const Elf_Shdr *sechdrs,
109 struct module *me) 110 struct module *me)
110{ 111{
111 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; 112 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
113 *para = NULL;
112 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
113 115
114 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -118,6 +120,8 @@ int module_finalize(const Elf_Ehdr *hdr,
118 alt = s; 120 alt = s;
119 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 121 if (!strcmp(".smp_locks", secstrings + s->sh_name))
120 locks= s; 122 locks= s;
123 if (!strcmp(".parainstructions", secstrings + s->sh_name))
124 para = s;
121 } 125 }
122 126
123 if (alt) { 127 if (alt) {
@@ -132,10 +136,17 @@ int module_finalize(const Elf_Ehdr *hdr,
132 lseg, lseg + locks->sh_size, 136 lseg, lseg + locks->sh_size,
133 tseg, tseg + text->sh_size); 137 tseg, tseg + text->sh_size);
134 } 138 }
135 return 0; 139
140 if (para) {
141 void *pseg = (void *)para->sh_addr;
142 apply_paravirt(pseg, pseg + para->sh_size);
143 }
144
145 return module_bug_finalize(hdr, sechdrs, me);
136} 146}
137 147
138void module_arch_cleanup(struct module *mod) 148void module_arch_cleanup(struct module *mod)
139{ 149{
140 alternatives_smp_module_del(mod); 150 alternatives_smp_module_del(mod);
151 module_bug_cleanup(mod);
141} 152}
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 442aaf8c77eb..2ce67228dff8 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -249,8 +249,6 @@ static void __init MP_bus_info (struct mpc_config_bus *m)
249 mp_current_pci_id++; 249 mp_current_pci_id++;
250 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { 250 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
251 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; 251 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
252 } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
253 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
254 } else { 252 } else {
255 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 253 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
256 } 254 }
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index d535cdbbfd25..4a472a17d1c6 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -172,7 +172,7 @@ static ssize_t msr_read(struct file *file, char __user * buf,
172 u32 __user *tmp = (u32 __user *) buf; 172 u32 __user *tmp = (u32 __user *) buf;
173 u32 data[2]; 173 u32 data[2];
174 u32 reg = *ppos; 174 u32 reg = *ppos;
175 int cpu = iminor(file->f_dentry->d_inode); 175 int cpu = iminor(file->f_path.dentry->d_inode);
176 int err; 176 int err;
177 177
178 if (count % 8) 178 if (count % 8)
@@ -195,15 +195,14 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
195{ 195{
196 const u32 __user *tmp = (const u32 __user *)buf; 196 const u32 __user *tmp = (const u32 __user *)buf;
197 u32 data[2]; 197 u32 data[2];
198 size_t rv;
199 u32 reg = *ppos; 198 u32 reg = *ppos;
200 int cpu = iminor(file->f_dentry->d_inode); 199 int cpu = iminor(file->f_path.dentry->d_inode);
201 int err; 200 int err;
202 201
203 if (count % 8) 202 if (count % 8)
204 return -EINVAL; /* Invalid chunk size */ 203 return -EINVAL; /* Invalid chunk size */
205 204
206 for (rv = 0; count; count -= 8) { 205 for (; count; count -= 8) {
207 if (copy_from_user(&data, tmp, 8)) 206 if (copy_from_user(&data, tmp, 8))
208 return -EFAULT; 207 return -EFAULT;
209 err = do_wrmsr(cpu, reg, data[0], data[1]); 208 err = do_wrmsr(cpu, reg, data[0], data[1]);
@@ -217,7 +216,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
217 216
218static int msr_open(struct inode *inode, struct file *file) 217static int msr_open(struct inode *inode, struct file *file)
219{ 218{
220 unsigned int cpu = iminor(file->f_dentry->d_inode); 219 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
221 struct cpuinfo_x86 *c = &(cpu_data)[cpu]; 220 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
222 221
223 if (cpu >= NR_CPUS || !cpu_online(cpu)) 222 if (cpu >= NR_CPUS || !cpu_online(cpu))
@@ -239,18 +238,17 @@ static struct file_operations msr_fops = {
239 .open = msr_open, 238 .open = msr_open,
240}; 239};
241 240
242static int msr_class_device_create(int i) 241static int msr_device_create(int i)
243{ 242{
244 int err = 0; 243 int err = 0;
245 struct class_device *class_err; 244 struct device *dev;
246 245
247 class_err = class_device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), NULL, "msr%d",i); 246 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), "msr%d",i);
248 if (IS_ERR(class_err)) 247 if (IS_ERR(dev))
249 err = PTR_ERR(class_err); 248 err = PTR_ERR(dev);
250 return err; 249 return err;
251} 250}
252 251
253#ifdef CONFIG_HOTPLUG_CPU
254static int msr_class_cpu_callback(struct notifier_block *nfb, 252static int msr_class_cpu_callback(struct notifier_block *nfb,
255 unsigned long action, void *hcpu) 253 unsigned long action, void *hcpu)
256{ 254{
@@ -258,10 +256,10 @@ static int msr_class_cpu_callback(struct notifier_block *nfb,
258 256
259 switch (action) { 257 switch (action) {
260 case CPU_ONLINE: 258 case CPU_ONLINE:
261 msr_class_device_create(cpu); 259 msr_device_create(cpu);
262 break; 260 break;
263 case CPU_DEAD: 261 case CPU_DEAD:
264 class_device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); 262 device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
265 break; 263 break;
266 } 264 }
267 return NOTIFY_OK; 265 return NOTIFY_OK;
@@ -271,7 +269,6 @@ static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
271{ 269{
272 .notifier_call = msr_class_cpu_callback, 270 .notifier_call = msr_class_cpu_callback,
273}; 271};
274#endif
275 272
276static int __init msr_init(void) 273static int __init msr_init(void)
277{ 274{
@@ -290,7 +287,7 @@ static int __init msr_init(void)
290 goto out_chrdev; 287 goto out_chrdev;
291 } 288 }
292 for_each_online_cpu(i) { 289 for_each_online_cpu(i) {
293 err = msr_class_device_create(i); 290 err = msr_device_create(i);
294 if (err != 0) 291 if (err != 0)
295 goto out_class; 292 goto out_class;
296 } 293 }
@@ -302,7 +299,7 @@ static int __init msr_init(void)
302out_class: 299out_class:
303 i = 0; 300 i = 0;
304 for_each_online_cpu(i) 301 for_each_online_cpu(i)
305 class_device_destroy(msr_class, MKDEV(MSR_MAJOR, i)); 302 device_destroy(msr_class, MKDEV(MSR_MAJOR, i));
306 class_destroy(msr_class); 303 class_destroy(msr_class);
307out_chrdev: 304out_chrdev:
308 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 305 unregister_chrdev(MSR_MAJOR, "cpu/msr");
@@ -314,7 +311,7 @@ static void __exit msr_exit(void)
314{ 311{
315 int cpu = 0; 312 int cpu = 0;
316 for_each_online_cpu(cpu) 313 for_each_online_cpu(cpu)
317 class_device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); 314 device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
318 class_destroy(msr_class); 315 class_destroy(msr_class);
319 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 316 unregister_chrdev(MSR_MAJOR, "cpu/msr");
320 unregister_hotcpu_notifier(&msr_class_cpu_notifier); 317 unregister_hotcpu_notifier(&msr_class_cpu_notifier);
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 3e8e3adb0489..a5e34d655965 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -22,6 +22,7 @@
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/dmi.h> 23#include <linux/dmi.h>
24#include <linux/kprobes.h> 24#include <linux/kprobes.h>
25#include <linux/cpumask.h>
25 26
26#include <asm/smp.h> 27#include <asm/smp.h>
27#include <asm/nmi.h> 28#include <asm/nmi.h>
@@ -42,6 +43,8 @@ int nmi_watchdog_enabled;
42static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner); 43static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
43static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]); 44static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
44 45
46static cpumask_t backtrace_mask = CPU_MASK_NONE;
47
45/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's 48/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
46 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) 49 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
47 */ 50 */
@@ -192,6 +195,8 @@ static __cpuinit inline int nmi_known_cpu(void)
192 return 0; 195 return 0;
193} 196}
194 197
198static int endflag __initdata = 0;
199
195#ifdef CONFIG_SMP 200#ifdef CONFIG_SMP
196/* The performance counters used by NMI_LOCAL_APIC don't trigger when 201/* The performance counters used by NMI_LOCAL_APIC don't trigger when
197 * the CPU is idle. To make sure the NMI watchdog really ticks on all 202 * the CPU is idle. To make sure the NMI watchdog really ticks on all
@@ -199,7 +204,6 @@ static __cpuinit inline int nmi_known_cpu(void)
199 */ 204 */
200static __init void nmi_cpu_busy(void *data) 205static __init void nmi_cpu_busy(void *data)
201{ 206{
202 volatile int *endflag = data;
203 local_irq_enable_in_hardirq(); 207 local_irq_enable_in_hardirq();
204 /* Intentionally don't use cpu_relax here. This is 208 /* Intentionally don't use cpu_relax here. This is
205 to make sure that the performance counter really ticks, 209 to make sure that the performance counter really ticks,
@@ -207,23 +211,22 @@ static __init void nmi_cpu_busy(void *data)
207 pause instruction. On a real HT machine this is fine because 211 pause instruction. On a real HT machine this is fine because
208 all other CPUs are busy with "useless" delay loops and don't 212 all other CPUs are busy with "useless" delay loops and don't
209 care if they get somewhat less cycles. */ 213 care if they get somewhat less cycles. */
210 while (*endflag == 0) 214 while (endflag == 0)
211 barrier(); 215 mb();
212} 216}
213#endif 217#endif
214 218
215static int __init check_nmi_watchdog(void) 219static int __init check_nmi_watchdog(void)
216{ 220{
217 volatile int endflag = 0;
218 unsigned int *prev_nmi_count; 221 unsigned int *prev_nmi_count;
219 int cpu; 222 int cpu;
220 223
221 /* Enable NMI watchdog for newer systems. 224 /* Enable NMI watchdog for newer systems.
222 Actually it should be safe for most systems before 2004 too except 225 Probably safe on most older systems too, but let's be careful.
223 for some IBM systems that corrupt registers when NMI happens 226 IBM ThinkPads use INT10 inside SMM and that allows early NMI inside SMM
224 during SMM. Unfortunately we don't have more exact information 227 which hangs the system. Disable watchdog for all thinkpads */
225 on these and use this coarse check. */ 228 if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004 &&
226 if (nmi_watchdog == NMI_DEFAULT && dmi_get_year(DMI_BIOS_DATE) >= 2004) 229 !dmi_name_in_vendors("ThinkPad"))
227 nmi_watchdog = NMI_LOCAL_APIC; 230 nmi_watchdog = NMI_LOCAL_APIC;
228 231
229 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT)) 232 if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT))
@@ -867,14 +870,16 @@ static unsigned int
867 870
868void touch_nmi_watchdog (void) 871void touch_nmi_watchdog (void)
869{ 872{
870 int i; 873 if (nmi_watchdog > 0) {
874 unsigned cpu;
871 875
872 /* 876 /*
873 * Just reset the alert counters, (other CPUs might be 877 * Just reset the alert counters, (other CPUs might be
874 * spinning on locks we hold): 878 * spinning on locks we hold):
875 */ 879 */
876 for_each_possible_cpu(i) 880 for_each_present_cpu (cpu)
877 alert_counter[i] = 0; 881 alert_counter[cpu] = 0;
882 }
878 883
879 /* 884 /*
880 * Tickle the softlockup detector too: 885 * Tickle the softlockup detector too:
@@ -907,6 +912,16 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
907 touched = 1; 912 touched = 1;
908 } 913 }
909 914
915 if (cpu_isset(cpu, backtrace_mask)) {
916 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
917
918 spin_lock(&lock);
919 printk("NMI backtrace for cpu %d\n", cpu);
920 dump_stack();
921 spin_unlock(&lock);
922 cpu_clear(cpu, backtrace_mask);
923 }
924
910 sum = per_cpu(irq_stat, cpu).apic_timer_irqs; 925 sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
911 926
912 /* if the apic timer isn't firing, this cpu isn't doing much */ 927 /* if the apic timer isn't firing, this cpu isn't doing much */
@@ -1033,6 +1048,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
1033 1048
1034#endif 1049#endif
1035 1050
1051void __trigger_all_cpu_backtrace(void)
1052{
1053 int i;
1054
1055 backtrace_mask = cpu_online_map;
1056 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
1057 for (i = 0; i < 10 * 1000; i++) {
1058 if (cpus_empty(backtrace_mask))
1059 break;
1060 mdelay(1);
1061 }
1062}
1063
1036EXPORT_SYMBOL(nmi_active); 1064EXPORT_SYMBOL(nmi_active);
1037EXPORT_SYMBOL(nmi_watchdog); 1065EXPORT_SYMBOL(nmi_watchdog);
1038EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); 1066EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
new file mode 100644
index 000000000000..3dceab5828f1
--- /dev/null
+++ b/arch/i386/kernel/paravirt.c
@@ -0,0 +1,569 @@
1/* Paravirtualization interfaces
2 Copyright (C) 2006 Rusty Russell IBM Corporation
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18#include <linux/errno.h>
19#include <linux/module.h>
20#include <linux/efi.h>
21#include <linux/bcd.h>
22#include <linux/start_kernel.h>
23
24#include <asm/bug.h>
25#include <asm/paravirt.h>
26#include <asm/desc.h>
27#include <asm/setup.h>
28#include <asm/arch_hooks.h>
29#include <asm/time.h>
30#include <asm/irq.h>
31#include <asm/delay.h>
32#include <asm/fixmap.h>
33#include <asm/apic.h>
34#include <asm/tlbflush.h>
35
36/* nop stub */
37static void native_nop(void)
38{
39}
40
41static void __init default_banner(void)
42{
43 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
44 paravirt_ops.name);
45}
46
47char *memory_setup(void)
48{
49 return paravirt_ops.memory_setup();
50}
51
52/* Simple instruction patching code. */
53#define DEF_NATIVE(name, code) \
54 extern const char start_##name[], end_##name[]; \
55 asm("start_" #name ": " code "; end_" #name ":")
56DEF_NATIVE(cli, "cli");
57DEF_NATIVE(sti, "sti");
58DEF_NATIVE(popf, "push %eax; popf");
59DEF_NATIVE(pushf, "pushf; pop %eax");
60DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli");
61DEF_NATIVE(iret, "iret");
62DEF_NATIVE(sti_sysexit, "sti; sysexit");
63
64static const struct native_insns
65{
66 const char *start, *end;
67} native_insns[] = {
68 [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
69 [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
70 [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
71 [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
72 [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
73 [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
74 [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit },
75};
76
77static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
78{
79 unsigned int insn_len;
80
81 /* Don't touch it if we don't have a replacement */
82 if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start)
83 return len;
84
85 insn_len = native_insns[type].end - native_insns[type].start;
86
87 /* Similarly if we can't fit replacement. */
88 if (len < insn_len)
89 return len;
90
91 memcpy(insns, native_insns[type].start, insn_len);
92 return insn_len;
93}
94
95static fastcall unsigned long native_get_debugreg(int regno)
96{
97 unsigned long val = 0; /* Damn you, gcc! */
98
99 switch (regno) {
100 case 0:
101 asm("movl %%db0, %0" :"=r" (val)); break;
102 case 1:
103 asm("movl %%db1, %0" :"=r" (val)); break;
104 case 2:
105 asm("movl %%db2, %0" :"=r" (val)); break;
106 case 3:
107 asm("movl %%db3, %0" :"=r" (val)); break;
108 case 6:
109 asm("movl %%db6, %0" :"=r" (val)); break;
110 case 7:
111 asm("movl %%db7, %0" :"=r" (val)); break;
112 default:
113 BUG();
114 }
115 return val;
116}
117
118static fastcall void native_set_debugreg(int regno, unsigned long value)
119{
120 switch (regno) {
121 case 0:
122 asm("movl %0,%%db0" : /* no output */ :"r" (value));
123 break;
124 case 1:
125 asm("movl %0,%%db1" : /* no output */ :"r" (value));
126 break;
127 case 2:
128 asm("movl %0,%%db2" : /* no output */ :"r" (value));
129 break;
130 case 3:
131 asm("movl %0,%%db3" : /* no output */ :"r" (value));
132 break;
133 case 6:
134 asm("movl %0,%%db6" : /* no output */ :"r" (value));
135 break;
136 case 7:
137 asm("movl %0,%%db7" : /* no output */ :"r" (value));
138 break;
139 default:
140 BUG();
141 }
142}
143
144void init_IRQ(void)
145{
146 paravirt_ops.init_IRQ();
147}
148
149static fastcall void native_clts(void)
150{
151 asm volatile ("clts");
152}
153
154static fastcall unsigned long native_read_cr0(void)
155{
156 unsigned long val;
157 asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
158 return val;
159}
160
161static fastcall void native_write_cr0(unsigned long val)
162{
163 asm volatile("movl %0,%%cr0": :"r" (val));
164}
165
166static fastcall unsigned long native_read_cr2(void)
167{
168 unsigned long val;
169 asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
170 return val;
171}
172
173static fastcall void native_write_cr2(unsigned long val)
174{
175 asm volatile("movl %0,%%cr2": :"r" (val));
176}
177
178static fastcall unsigned long native_read_cr3(void)
179{
180 unsigned long val;
181 asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
182 return val;
183}
184
185static fastcall void native_write_cr3(unsigned long val)
186{
187 asm volatile("movl %0,%%cr3": :"r" (val));
188}
189
190static fastcall unsigned long native_read_cr4(void)
191{
192 unsigned long val;
193 asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
194 return val;
195}
196
197static fastcall unsigned long native_read_cr4_safe(void)
198{
199 unsigned long val;
200 /* This could fault if %cr4 does not exist */
201 asm("1: movl %%cr4, %0 \n"
202 "2: \n"
203 ".section __ex_table,\"a\" \n"
204 ".long 1b,2b \n"
205 ".previous \n"
206 : "=r" (val): "0" (0));
207 return val;
208}
209
210static fastcall void native_write_cr4(unsigned long val)
211{
212 asm volatile("movl %0,%%cr4": :"r" (val));
213}
214
215static fastcall unsigned long native_save_fl(void)
216{
217 unsigned long f;
218 asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
219 return f;
220}
221
222static fastcall void native_restore_fl(unsigned long f)
223{
224 asm volatile("pushl %0 ; popfl": /* no output */
225 :"g" (f)
226 :"memory", "cc");
227}
228
229static fastcall void native_irq_disable(void)
230{
231 asm volatile("cli": : :"memory");
232}
233
234static fastcall void native_irq_enable(void)
235{
236 asm volatile("sti": : :"memory");
237}
238
239static fastcall void native_safe_halt(void)
240{
241 asm volatile("sti; hlt": : :"memory");
242}
243
244static fastcall void native_halt(void)
245{
246 asm volatile("hlt": : :"memory");
247}
248
249static fastcall void native_wbinvd(void)
250{
251 asm volatile("wbinvd": : :"memory");
252}
253
254static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
255{
256 unsigned long long val;
257
258 asm volatile("2: rdmsr ; xorl %0,%0\n"
259 "1:\n\t"
260 ".section .fixup,\"ax\"\n\t"
261 "3: movl %3,%0 ; jmp 1b\n\t"
262 ".previous\n\t"
263 ".section __ex_table,\"a\"\n"
264 " .align 4\n\t"
265 " .long 2b,3b\n\t"
266 ".previous"
267 : "=r" (*err), "=A" (val)
268 : "c" (msr), "i" (-EFAULT));
269
270 return val;
271}
272
273static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
274{
275 int err;
276 asm volatile("2: wrmsr ; xorl %0,%0\n"
277 "1:\n\t"
278 ".section .fixup,\"ax\"\n\t"
279 "3: movl %4,%0 ; jmp 1b\n\t"
280 ".previous\n\t"
281 ".section __ex_table,\"a\"\n"
282 " .align 4\n\t"
283 " .long 2b,3b\n\t"
284 ".previous"
285 : "=a" (err)
286 : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
287 "i" (-EFAULT));
288 return err;
289}
290
291static fastcall unsigned long long native_read_tsc(void)
292{
293 unsigned long long val;
294 asm volatile("rdtsc" : "=A" (val));
295 return val;
296}
297
298static fastcall unsigned long long native_read_pmc(void)
299{
300 unsigned long long val;
301 asm volatile("rdpmc" : "=A" (val));
302 return val;
303}
304
305static fastcall void native_load_tr_desc(void)
306{
307 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
308}
309
310static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr)
311{
312 asm volatile("lgdt %0"::"m" (*dtr));
313}
314
315static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr)
316{
317 asm volatile("lidt %0"::"m" (*dtr));
318}
319
320static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
321{
322 asm ("sgdt %0":"=m" (*dtr));
323}
324
325static fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
326{
327 asm ("sidt %0":"=m" (*dtr));
328}
329
330static fastcall unsigned long native_store_tr(void)
331{
332 unsigned long tr;
333 asm ("str %0":"=r" (tr));
334 return tr;
335}
336
337static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu)
338{
339#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
340 C(0); C(1); C(2);
341#undef C
342}
343
344static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
345{
346 u32 *lp = (u32 *)((char *)dt + entry*8);
347 lp[0] = entry_low;
348 lp[1] = entry_high;
349}
350
351static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
352{
353 native_write_dt_entry(dt, entrynum, low, high);
354}
355
356static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
357{
358 native_write_dt_entry(dt, entrynum, low, high);
359}
360
361static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
362{
363 native_write_dt_entry(dt, entrynum, low, high);
364}
365
366static fastcall void native_load_esp0(struct tss_struct *tss,
367 struct thread_struct *thread)
368{
369 tss->esp0 = thread->esp0;
370
371 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
372 if (unlikely(tss->ss1 != thread->sysenter_cs)) {
373 tss->ss1 = thread->sysenter_cs;
374 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
375 }
376}
377
378static fastcall void native_io_delay(void)
379{
380 asm volatile("outb %al,$0x80");
381}
382
383static fastcall void native_flush_tlb(void)
384{
385 __native_flush_tlb();
386}
387
388/*
389 * Global pages have to be flushed a bit differently. Not a real
390 * performance problem because this does not happen often.
391 */
392static fastcall void native_flush_tlb_global(void)
393{
394 __native_flush_tlb_global();
395}
396
397static fastcall void native_flush_tlb_single(u32 addr)
398{
399 __native_flush_tlb_single(addr);
400}
401
402#ifndef CONFIG_X86_PAE
403static fastcall void native_set_pte(pte_t *ptep, pte_t pteval)
404{
405 *ptep = pteval;
406}
407
408static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
409{
410 *ptep = pteval;
411}
412
413static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
414{
415 *pmdp = pmdval;
416}
417
418#else /* CONFIG_X86_PAE */
419
420static fastcall void native_set_pte(pte_t *ptep, pte_t pte)
421{
422 ptep->pte_high = pte.pte_high;
423 smp_wmb();
424 ptep->pte_low = pte.pte_low;
425}
426
427static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
428{
429 ptep->pte_high = pte.pte_high;
430 smp_wmb();
431 ptep->pte_low = pte.pte_low;
432}
433
434static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
435{
436 ptep->pte_low = 0;
437 smp_wmb();
438 ptep->pte_high = pte.pte_high;
439 smp_wmb();
440 ptep->pte_low = pte.pte_low;
441}
442
443static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
444{
445 set_64bit((unsigned long long *)ptep,pte_val(pteval));
446}
447
448static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
449{
450 set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
451}
452
453static fastcall void native_set_pud(pud_t *pudp, pud_t pudval)
454{
455 *pudp = pudval;
456}
457
458static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
459{
460 ptep->pte_low = 0;
461 smp_wmb();
462 ptep->pte_high = 0;
463}
464
465static fastcall void native_pmd_clear(pmd_t *pmd)
466{
467 u32 *tmp = (u32 *)pmd;
468 *tmp = 0;
469 smp_wmb();
470 *(tmp + 1) = 0;
471}
472#endif /* CONFIG_X86_PAE */
473
474/* These are in entry.S */
475extern fastcall void native_iret(void);
476extern fastcall void native_irq_enable_sysexit(void);
477
478static int __init print_banner(void)
479{
480 paravirt_ops.banner();
481 return 0;
482}
483core_initcall(print_banner);
484
485/* We simply declare start_kernel to be the paravirt probe of last resort. */
486paravirt_probe(start_kernel);
487
488struct paravirt_ops paravirt_ops = {
489 .name = "bare hardware",
490 .paravirt_enabled = 0,
491 .kernel_rpl = 0,
492
493 .patch = native_patch,
494 .banner = default_banner,
495 .arch_setup = native_nop,
496 .memory_setup = machine_specific_memory_setup,
497 .get_wallclock = native_get_wallclock,
498 .set_wallclock = native_set_wallclock,
499 .time_init = time_init_hook,
500 .init_IRQ = native_init_IRQ,
501
502 .cpuid = native_cpuid,
503 .get_debugreg = native_get_debugreg,
504 .set_debugreg = native_set_debugreg,
505 .clts = native_clts,
506 .read_cr0 = native_read_cr0,
507 .write_cr0 = native_write_cr0,
508 .read_cr2 = native_read_cr2,
509 .write_cr2 = native_write_cr2,
510 .read_cr3 = native_read_cr3,
511 .write_cr3 = native_write_cr3,
512 .read_cr4 = native_read_cr4,
513 .read_cr4_safe = native_read_cr4_safe,
514 .write_cr4 = native_write_cr4,
515 .save_fl = native_save_fl,
516 .restore_fl = native_restore_fl,
517 .irq_disable = native_irq_disable,
518 .irq_enable = native_irq_enable,
519 .safe_halt = native_safe_halt,
520 .halt = native_halt,
521 .wbinvd = native_wbinvd,
522 .read_msr = native_read_msr,
523 .write_msr = native_write_msr,
524 .read_tsc = native_read_tsc,
525 .read_pmc = native_read_pmc,
526 .load_tr_desc = native_load_tr_desc,
527 .set_ldt = native_set_ldt,
528 .load_gdt = native_load_gdt,
529 .load_idt = native_load_idt,
530 .store_gdt = native_store_gdt,
531 .store_idt = native_store_idt,
532 .store_tr = native_store_tr,
533 .load_tls = native_load_tls,
534 .write_ldt_entry = native_write_ldt_entry,
535 .write_gdt_entry = native_write_gdt_entry,
536 .write_idt_entry = native_write_idt_entry,
537 .load_esp0 = native_load_esp0,
538
539 .set_iopl_mask = native_set_iopl_mask,
540 .io_delay = native_io_delay,
541 .const_udelay = __const_udelay,
542
543#ifdef CONFIG_X86_LOCAL_APIC
544 .apic_write = native_apic_write,
545 .apic_write_atomic = native_apic_write_atomic,
546 .apic_read = native_apic_read,
547#endif
548
549 .flush_tlb_user = native_flush_tlb,
550 .flush_tlb_kernel = native_flush_tlb_global,
551 .flush_tlb_single = native_flush_tlb_single,
552
553 .set_pte = native_set_pte,
554 .set_pte_at = native_set_pte_at,
555 .set_pmd = native_set_pmd,
556 .pte_update = (void *)native_nop,
557 .pte_update_defer = (void *)native_nop,
558#ifdef CONFIG_X86_PAE
559 .set_pte_atomic = native_set_pte_atomic,
560 .set_pte_present = native_set_pte_present,
561 .set_pud = native_set_pud,
562 .pte_clear = native_pte_clear,
563 .pmd_clear = native_pmd_clear,
564#endif
565
566 .irq_enable_sysexit = native_irq_enable_sysexit,
567 .iret = native_iret,
568};
569EXPORT_SYMBOL(paravirt_ops);
diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c
index 25fe66853934..41af692c1584 100644
--- a/arch/i386/kernel/pci-dma.c
+++ b/arch/i386/kernel/pci-dma.c
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(dma_free_coherent);
75int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, 75int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
76 dma_addr_t device_addr, size_t size, int flags) 76 dma_addr_t device_addr, size_t size, int flags)
77{ 77{
78 void __iomem *mem_base; 78 void __iomem *mem_base = NULL;
79 int pages = size >> PAGE_SHIFT; 79 int pages = size >> PAGE_SHIFT;
80 int bitmap_size = (pages + 31)/32; 80 int bitmap_size = (pages + 31)/32;
81 81
@@ -92,14 +92,12 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
92 if (!mem_base) 92 if (!mem_base)
93 goto out; 93 goto out;
94 94
95 dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); 95 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
96 if (!dev->dma_mem) 96 if (!dev->dma_mem)
97 goto out; 97 goto out;
98 memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); 98 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
99 dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
100 if (!dev->dma_mem->bitmap) 99 if (!dev->dma_mem->bitmap)
101 goto free1_out; 100 goto free1_out;
102 memset(dev->dma_mem->bitmap, 0, bitmap_size);
103 101
104 dev->dma_mem->virt_base = mem_base; 102 dev->dma_mem->virt_base = mem_base;
105 dev->dma_mem->device_base = device_addr; 103 dev->dma_mem->device_base = device_addr;
@@ -114,6 +112,8 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
114 free1_out: 112 free1_out:
115 kfree(dev->dma_mem->bitmap); 113 kfree(dev->dma_mem->bitmap);
116 out: 114 out:
115 if (mem_base)
116 iounmap(mem_base);
117 return 0; 117 return 0;
118} 118}
119EXPORT_SYMBOL(dma_declare_coherent_memory); 119EXPORT_SYMBOL(dma_declare_coherent_memory);
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 57d375900afb..99308510a17c 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -56,6 +56,7 @@
56 56
57#include <asm/tlbflush.h> 57#include <asm/tlbflush.h>
58#include <asm/cpu.h> 58#include <asm/cpu.h>
59#include <asm/pda.h>
59 60
60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
61 62
@@ -99,22 +100,18 @@ EXPORT_SYMBOL(enable_hlt);
99 */ 100 */
100void default_idle(void) 101void default_idle(void)
101{ 102{
102 local_irq_enable();
103
104 if (!hlt_counter && boot_cpu_data.hlt_works_ok) { 103 if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
105 current_thread_info()->status &= ~TS_POLLING; 104 current_thread_info()->status &= ~TS_POLLING;
106 smp_mb__after_clear_bit(); 105 smp_mb__after_clear_bit();
107 while (!need_resched()) { 106 local_irq_disable();
108 local_irq_disable(); 107 if (!need_resched())
109 if (!need_resched()) 108 safe_halt(); /* enables interrupts racelessly */
110 safe_halt(); 109 else
111 else 110 local_irq_enable();
112 local_irq_enable();
113 }
114 current_thread_info()->status |= TS_POLLING; 111 current_thread_info()->status |= TS_POLLING;
115 } else { 112 } else {
116 while (!need_resched()) 113 /* loop is done by the caller */
117 cpu_relax(); 114 cpu_relax();
118 } 115 }
119} 116}
120#ifdef CONFIG_APM_MODULE 117#ifdef CONFIG_APM_MODULE
@@ -128,14 +125,7 @@ EXPORT_SYMBOL(default_idle);
128 */ 125 */
129static void poll_idle (void) 126static void poll_idle (void)
130{ 127{
131 local_irq_enable(); 128 cpu_relax();
132
133 asm volatile(
134 "2:"
135 "testl %0, %1;"
136 "rep; nop;"
137 "je 2b;"
138 : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
139} 129}
140 130
141#ifdef CONFIG_HOTPLUG_CPU 131#ifdef CONFIG_HOTPLUG_CPU
@@ -205,7 +195,7 @@ void cpu_idle(void)
205void cpu_idle_wait(void) 195void cpu_idle_wait(void)
206{ 196{
207 unsigned int cpu, this_cpu = get_cpu(); 197 unsigned int cpu, this_cpu = get_cpu();
208 cpumask_t map; 198 cpumask_t map, tmp = current->cpus_allowed;
209 199
210 set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); 200 set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
211 put_cpu(); 201 put_cpu();
@@ -227,6 +217,8 @@ void cpu_idle_wait(void)
227 } 217 }
228 cpus_and(map, map, cpu_online_map); 218 cpus_and(map, map, cpu_online_map);
229 } while (!cpus_empty(map)); 219 } while (!cpus_empty(map));
220
221 set_cpus_allowed(current, tmp);
230} 222}
231EXPORT_SYMBOL_GPL(cpu_idle_wait); 223EXPORT_SYMBOL_GPL(cpu_idle_wait);
232 224
@@ -254,8 +246,7 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
254static void mwait_idle(void) 246static void mwait_idle(void)
255{ 247{
256 local_irq_enable(); 248 local_irq_enable();
257 while (!need_resched()) 249 mwait_idle_with_hints(0, 0);
258 mwait_idle_with_hints(0, 0);
259} 250}
260 251
261void __devinit select_idle_routine(const struct cpuinfo_x86 *c) 252void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
@@ -312,8 +303,8 @@ void show_regs(struct pt_regs * regs)
312 regs->eax,regs->ebx,regs->ecx,regs->edx); 303 regs->eax,regs->ebx,regs->ecx,regs->edx);
313 printk("ESI: %08lx EDI: %08lx EBP: %08lx", 304 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
314 regs->esi, regs->edi, regs->ebp); 305 regs->esi, regs->edi, regs->ebp);
315 printk(" DS: %04x ES: %04x\n", 306 printk(" DS: %04x ES: %04x GS: %04x\n",
316 0xffff & regs->xds,0xffff & regs->xes); 307 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs);
317 308
318 cr0 = read_cr0(); 309 cr0 = read_cr0();
319 cr2 = read_cr2(); 310 cr2 = read_cr2();
@@ -336,7 +327,6 @@ extern void kernel_thread_helper(void);
336int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) 327int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
337{ 328{
338 struct pt_regs regs; 329 struct pt_regs regs;
339 int err;
340 330
341 memset(&regs, 0, sizeof(regs)); 331 memset(&regs, 0, sizeof(regs));
342 332
@@ -345,16 +335,14 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
345 335
346 regs.xds = __USER_DS; 336 regs.xds = __USER_DS;
347 regs.xes = __USER_DS; 337 regs.xes = __USER_DS;
338 regs.xgs = __KERNEL_PDA;
348 regs.orig_eax = -1; 339 regs.orig_eax = -1;
349 regs.eip = (unsigned long) kernel_thread_helper; 340 regs.eip = (unsigned long) kernel_thread_helper;
350 regs.xcs = __KERNEL_CS | get_kernel_rpl(); 341 regs.xcs = __KERNEL_CS | get_kernel_rpl();
351 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; 342 regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
352 343
353 /* Ok, create the new process.. */ 344 /* Ok, create the new process.. */
354 err = do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); 345 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
355 if (err == 0) /* terminate kernel stack */
356 task_pt_regs(current)->eip = 0;
357 return err;
358} 346}
359EXPORT_SYMBOL(kernel_thread); 347EXPORT_SYMBOL(kernel_thread);
360 348
@@ -433,7 +421,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
433 p->thread.eip = (unsigned long) ret_from_fork; 421 p->thread.eip = (unsigned long) ret_from_fork;
434 422
435 savesegment(fs,p->thread.fs); 423 savesegment(fs,p->thread.fs);
436 savesegment(gs,p->thread.gs);
437 424
438 tsk = current; 425 tsk = current;
439 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 426 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -510,7 +497,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
510 dump->regs.ds = regs->xds; 497 dump->regs.ds = regs->xds;
511 dump->regs.es = regs->xes; 498 dump->regs.es = regs->xes;
512 savesegment(fs,dump->regs.fs); 499 savesegment(fs,dump->regs.fs);
513 savesegment(gs,dump->regs.gs); 500 dump->regs.gs = regs->xgs;
514 dump->regs.orig_eax = regs->orig_eax; 501 dump->regs.orig_eax = regs->orig_eax;
515 dump->regs.eip = regs->eip; 502 dump->regs.eip = regs->eip;
516 dump->regs.cs = regs->xcs; 503 dump->regs.cs = regs->xcs;
@@ -650,22 +637,27 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
650 637
651 __unlazy_fpu(prev_p); 638 __unlazy_fpu(prev_p);
652 639
640
641 /* we're going to use this soon, after a few expensive things */
642 if (next_p->fpu_counter > 5)
643 prefetch(&next->i387.fxsave);
644
653 /* 645 /*
654 * Reload esp0. 646 * Reload esp0.
655 */ 647 */
656 load_esp0(tss, next); 648 load_esp0(tss, next);
657 649
658 /* 650 /*
659 * Save away %fs and %gs. No need to save %es and %ds, as 651 * Save away %fs. No need to save %gs, as it was saved on the
660 * those are always kernel segments while inside the kernel. 652 * stack on entry. No need to save %es and %ds, as those are
661 * Doing this before setting the new TLS descriptors avoids 653 * always kernel segments while inside the kernel. Doing this
662 * the situation where we temporarily have non-reloadable 654 * before setting the new TLS descriptors avoids the situation
663 * segments in %fs and %gs. This could be an issue if the 655 * where we temporarily have non-reloadable segments in %fs
664 * NMI handler ever used %fs or %gs (it does not today), or 656 * and %gs. This could be an issue if the NMI handler ever
665 * if the kernel is running inside of a hypervisor layer. 657 * used %fs or %gs (it does not today), or if the kernel is
658 * running inside of a hypervisor layer.
666 */ 659 */
667 savesegment(fs, prev->fs); 660 savesegment(fs, prev->fs);
668 savesegment(gs, prev->gs);
669 661
670 /* 662 /*
671 * Load the per-thread Thread-Local Storage descriptor. 663 * Load the per-thread Thread-Local Storage descriptor.
@@ -673,22 +665,14 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
673 load_TLS(next, cpu); 665 load_TLS(next, cpu);
674 666
675 /* 667 /*
676 * Restore %fs and %gs if needed. 668 * Restore %fs if needed.
677 * 669 *
678 * Glibc normally makes %fs be zero, and %gs is one of 670 * Glibc normally makes %fs be zero.
679 * the TLS segments.
680 */ 671 */
681 if (unlikely(prev->fs | next->fs)) 672 if (unlikely(prev->fs | next->fs))
682 loadsegment(fs, next->fs); 673 loadsegment(fs, next->fs);
683 674
684 if (prev->gs | next->gs) 675 write_pda(pcurrent, next_p);
685 loadsegment(gs, next->gs);
686
687 /*
688 * Restore IOPL if needed.
689 */
690 if (unlikely(prev->iopl != next->iopl))
691 set_iopl_mask(next->iopl);
692 676
693 /* 677 /*
694 * Now maybe handle debug registers and/or IO bitmaps 678 * Now maybe handle debug registers and/or IO bitmaps
@@ -699,6 +683,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
699 683
700 disable_tsc(prev_p, next_p); 684 disable_tsc(prev_p, next_p);
701 685
686 /* If the task has used fpu the last 5 timeslices, just do a full
687 * restore of the math state immediately to avoid the trap; the
688 * chances of needing FPU soon are obviously high now
689 */
690 if (next_p->fpu_counter > 5)
691 math_state_restore();
692
702 return prev_p; 693 return prev_p;
703} 694}
704 695
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 775f50e9395b..f3f94ac5736a 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -94,13 +94,9 @@ static int putreg(struct task_struct *child,
94 return -EIO; 94 return -EIO;
95 child->thread.fs = value; 95 child->thread.fs = value;
96 return 0; 96 return 0;
97 case GS:
98 if (value && (value & 3) != 3)
99 return -EIO;
100 child->thread.gs = value;
101 return 0;
102 case DS: 97 case DS:
103 case ES: 98 case ES:
99 case GS:
104 if (value && (value & 3) != 3) 100 if (value && (value & 3) != 3)
105 return -EIO; 101 return -EIO;
106 value &= 0xffff; 102 value &= 0xffff;
@@ -116,8 +112,8 @@ static int putreg(struct task_struct *child,
116 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; 112 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
117 break; 113 break;
118 } 114 }
119 if (regno > GS*4) 115 if (regno > ES*4)
120 regno -= 2*4; 116 regno -= 1*4;
121 put_stack_long(child, regno - sizeof(struct pt_regs), value); 117 put_stack_long(child, regno - sizeof(struct pt_regs), value);
122 return 0; 118 return 0;
123} 119}
@@ -131,18 +127,16 @@ static unsigned long getreg(struct task_struct *child,
131 case FS: 127 case FS:
132 retval = child->thread.fs; 128 retval = child->thread.fs;
133 break; 129 break;
134 case GS:
135 retval = child->thread.gs;
136 break;
137 case DS: 130 case DS:
138 case ES: 131 case ES:
132 case GS:
139 case SS: 133 case SS:
140 case CS: 134 case CS:
141 retval = 0xffff; 135 retval = 0xffff;
142 /* fall through */ 136 /* fall through */
143 default: 137 default:
144 if (regno > GS*4) 138 if (regno > ES*4)
145 regno -= 2*4; 139 regno -= 1*4;
146 regno = regno - sizeof(struct pt_regs); 140 regno = regno - sizeof(struct pt_regs);
147 retval &= get_stack_long(child, regno); 141 retval &= get_stack_long(child, regno);
148 } 142 }
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
index 9f6ab1789bb0..34874c398b44 100644
--- a/arch/i386/kernel/quirks.c
+++ b/arch/i386/kernel/quirks.c
@@ -3,10 +3,12 @@
3 */ 3 */
4#include <linux/pci.h> 4#include <linux/pci.h>
5#include <linux/irq.h> 5#include <linux/irq.h>
6#include <asm/pci-direct.h>
7#include <asm/genapic.h>
8#include <asm/cpu.h>
6 9
7#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) 10#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
8 11static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
9static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
10{ 12{
11 u8 config, rev; 13 u8 config, rev;
12 u32 word; 14 u32 word;
@@ -14,14 +16,12 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
14 /* BIOS may enable hardware IRQ balancing for 16 /* BIOS may enable hardware IRQ balancing for
15 * E7520/E7320/E7525(revision ID 0x9 and below) 17 * E7520/E7320/E7525(revision ID 0x9 and below)
16 * based platforms. 18 * based platforms.
17 * Disable SW irqbalance/affinity on those platforms. 19 * For those platforms, make sure that the genapic is set to 'flat'
18 */ 20 */
19 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 21 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
20 if (rev > 0x9) 22 if (rev > 0x9)
21 return; 23 return;
22 24
23 printk(KERN_INFO "Intel E7520/7320/7525 detected.");
24
25 /* enable access to config space*/ 25 /* enable access to config space*/
26 pci_read_config_byte(dev, 0xf4, &config); 26 pci_read_config_byte(dev, 0xf4, &config);
27 pci_write_config_byte(dev, 0xf4, config|0x2); 27 pci_write_config_byte(dev, 0xf4, config|0x2);
@@ -30,6 +30,44 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); 30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
31 31
32 if (!(word & (1 << 13))) { 32 if (!(word & (1 << 13))) {
33#ifdef CONFIG_X86_64
34 if (genapic != &apic_flat)
35 panic("APIC mode must be flat on this system\n");
36#elif defined(CONFIG_X86_GENERICARCH)
37 if (genapic != &apic_default)
38 panic("APIC mode must be default(flat) on this system. Use apic=default\n");
39#endif
40 }
41
42 /* put back the original value for config space*/
43 if (!(config & 0x2))
44 pci_write_config_byte(dev, 0xf4, config);
45}
46
47void __init quirk_intel_irqbalance(void)
48{
49 u8 config, rev;
50 u32 word;
51
52 /* BIOS may enable hardware IRQ balancing for
53 * E7520/E7320/E7525(revision ID 0x9 and below)
54 * based platforms.
55 * Disable SW irqbalance/affinity on those platforms.
56 */
57 rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
58 if (rev > 0x9)
59 return;
60
61 printk(KERN_INFO "Intel E7520/7320/7525 detected.");
62
63 /* enable access to config space */
64 config = read_pci_config_byte(0, 0, 0, 0xf4);
65 write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
66
67 /* read xTPR register */
68 word = read_pci_config_16(0, 0, 0x40, 0x4c);
69
70 if (!(word & (1 << 13))) {
33 printk(KERN_INFO "Disabling irq balancing and affinity\n"); 71 printk(KERN_INFO "Disabling irq balancing and affinity\n");
34#ifdef CONFIG_IRQBALANCE 72#ifdef CONFIG_IRQBALANCE
35 irqbalance_disable(""); 73 irqbalance_disable("");
@@ -38,13 +76,24 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
38#ifdef CONFIG_PROC_FS 76#ifdef CONFIG_PROC_FS
39 no_irq_affinity = 1; 77 no_irq_affinity = 1;
40#endif 78#endif
79#ifdef CONFIG_HOTPLUG_CPU
80 printk(KERN_INFO "Disabling cpu hotplug control\n");
81 enable_cpu_hotplug = 0;
82#endif
83#ifdef CONFIG_X86_64
84 /* force the genapic selection to flat mode so that
85 * interrupts can be redirected to more than one CPU.
86 */
87 genapic_force = &apic_flat;
88#endif
41 } 89 }
42 90
43 /* put back the original value for config space*/ 91 /* put back the original value for config space */
44 if (!(config & 0x2)) 92 if (!(config & 0x2))
45 pci_write_config_byte(dev, 0xf4, config); 93 write_pci_config_byte(0, 0, 0, 0xf4, config);
46} 94}
47DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); 95DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance);
48DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); 96DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance);
49DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); 97DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance);
98
50#endif 99#endif
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 84278e0093a2..3514b4153f7f 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -12,6 +12,7 @@
12#include <linux/dmi.h> 12#include <linux/dmi.h>
13#include <linux/ctype.h> 13#include <linux/ctype.h>
14#include <linux/pm.h> 14#include <linux/pm.h>
15#include <linux/reboot.h>
15#include <asm/uaccess.h> 16#include <asm/uaccess.h>
16#include <asm/apic.h> 17#include <asm/apic.h>
17#include <asm/desc.h> 18#include <asm/desc.h>
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 519e63c3c130..79df6e612dbd 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -63,9 +63,6 @@
63#include <setup_arch.h> 63#include <setup_arch.h>
64#include <bios_ebda.h> 64#include <bios_ebda.h>
65 65
66/* Forward Declaration. */
67void __init find_max_pfn(void);
68
69/* This value is set up by the early boot code to point to the value 66/* This value is set up by the early boot code to point to the value
70 immediately after the boot time page tables. It contains a *physical* 67 immediately after the boot time page tables. It contains a *physical*
71 address, and must not be in the .bss segment! */ 68 address, and must not be in the .bss segment! */
@@ -76,11 +73,8 @@ int disable_pse __devinitdata = 0;
76/* 73/*
77 * Machine setup.. 74 * Machine setup..
78 */ 75 */
79 76extern struct resource code_resource;
80#ifdef CONFIG_EFI 77extern struct resource data_resource;
81int efi_enabled = 0;
82EXPORT_SYMBOL(efi_enabled);
83#endif
84 78
85/* cpu data as detected by the assembly code in head.S */ 79/* cpu data as detected by the assembly code in head.S */
86struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; 80struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
@@ -99,12 +93,6 @@ unsigned int machine_submodel_id;
99unsigned int BIOS_revision; 93unsigned int BIOS_revision;
100unsigned int mca_pentium_flag; 94unsigned int mca_pentium_flag;
101 95
102/* For PCI or other memory-mapped resources */
103unsigned long pci_mem_start = 0x10000000;
104#ifdef CONFIG_PCI
105EXPORT_SYMBOL(pci_mem_start);
106#endif
107
108/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 96/* Boot loader ID as an integer, for the benefit of proc_dointvec */
109int bootloader_type; 97int bootloader_type;
110 98
@@ -134,7 +122,6 @@ struct ist_info ist_info;
134 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) 122 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
135EXPORT_SYMBOL(ist_info); 123EXPORT_SYMBOL(ist_info);
136#endif 124#endif
137struct e820map e820;
138 125
139extern void early_cpu_init(void); 126extern void early_cpu_init(void);
140extern int root_mountflags; 127extern int root_mountflags;
@@ -149,516 +136,6 @@ static char command_line[COMMAND_LINE_SIZE];
149 136
150unsigned char __initdata boot_params[PARAM_SIZE]; 137unsigned char __initdata boot_params[PARAM_SIZE];
151 138
152static struct resource data_resource = {
153 .name = "Kernel data",
154 .start = 0,
155 .end = 0,
156 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
157};
158
159static struct resource code_resource = {
160 .name = "Kernel code",
161 .start = 0,
162 .end = 0,
163 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
164};
165
166static struct resource system_rom_resource = {
167 .name = "System ROM",
168 .start = 0xf0000,
169 .end = 0xfffff,
170 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
171};
172
173static struct resource extension_rom_resource = {
174 .name = "Extension ROM",
175 .start = 0xe0000,
176 .end = 0xeffff,
177 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
178};
179
180static struct resource adapter_rom_resources[] = { {
181 .name = "Adapter ROM",
182 .start = 0xc8000,
183 .end = 0,
184 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
185}, {
186 .name = "Adapter ROM",
187 .start = 0,
188 .end = 0,
189 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
190}, {
191 .name = "Adapter ROM",
192 .start = 0,
193 .end = 0,
194 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
195}, {
196 .name = "Adapter ROM",
197 .start = 0,
198 .end = 0,
199 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
200}, {
201 .name = "Adapter ROM",
202 .start = 0,
203 .end = 0,
204 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
205}, {
206 .name = "Adapter ROM",
207 .start = 0,
208 .end = 0,
209 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
210} };
211
212static struct resource video_rom_resource = {
213 .name = "Video ROM",
214 .start = 0xc0000,
215 .end = 0xc7fff,
216 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
217};
218
219static struct resource video_ram_resource = {
220 .name = "Video RAM area",
221 .start = 0xa0000,
222 .end = 0xbffff,
223 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
224};
225
226static struct resource standard_io_resources[] = { {
227 .name = "dma1",
228 .start = 0x0000,
229 .end = 0x001f,
230 .flags = IORESOURCE_BUSY | IORESOURCE_IO
231}, {
232 .name = "pic1",
233 .start = 0x0020,
234 .end = 0x0021,
235 .flags = IORESOURCE_BUSY | IORESOURCE_IO
236}, {
237 .name = "timer0",
238 .start = 0x0040,
239 .end = 0x0043,
240 .flags = IORESOURCE_BUSY | IORESOURCE_IO
241}, {
242 .name = "timer1",
243 .start = 0x0050,
244 .end = 0x0053,
245 .flags = IORESOURCE_BUSY | IORESOURCE_IO
246}, {
247 .name = "keyboard",
248 .start = 0x0060,
249 .end = 0x006f,
250 .flags = IORESOURCE_BUSY | IORESOURCE_IO
251}, {
252 .name = "dma page reg",
253 .start = 0x0080,
254 .end = 0x008f,
255 .flags = IORESOURCE_BUSY | IORESOURCE_IO
256}, {
257 .name = "pic2",
258 .start = 0x00a0,
259 .end = 0x00a1,
260 .flags = IORESOURCE_BUSY | IORESOURCE_IO
261}, {
262 .name = "dma2",
263 .start = 0x00c0,
264 .end = 0x00df,
265 .flags = IORESOURCE_BUSY | IORESOURCE_IO
266}, {
267 .name = "fpu",
268 .start = 0x00f0,
269 .end = 0x00ff,
270 .flags = IORESOURCE_BUSY | IORESOURCE_IO
271} };
272
273#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
274
275static int __init romchecksum(unsigned char *rom, unsigned long length)
276{
277 unsigned char *p, sum = 0;
278
279 for (p = rom; p < rom + length; p++)
280 sum += *p;
281 return sum == 0;
282}
283
/*
 * Scan the legacy ISA ROM windows for the video BIOS, system BIOS,
 * extension ROM and adapter ROMs, and register each image found with
 * the iomem resource tree so the ranges are reserved.  Search bounds
 * come from the *_rom_resource globals defined earlier in this file.
 */
static void __init probe_roms(void)
{
	unsigned long start, length, upper;
	unsigned char *rom;
	int i;

	/* video rom: signatures appear on 2k boundaries within its window */
	upper = adapter_rom_resources[0].start;
	for (start = video_rom_resource.start; start < upper; start += 2048) {
		rom = isa_bus_to_virt(start);
		if (!romsignature(rom))
			continue;

		video_rom_resource.start = start;

		/* 0 < length <= 0x7f * 512, historically */
		length = rom[2] * 512;

		/* if checksum okay, trust length byte */
		if (length && romchecksum(rom, length))
			video_rom_resource.end = start + length - 1;

		request_resource(&iomem_resource, &video_rom_resource);
		break;
	}

	/* resume the adapter scan just past the video ROM, 2k-aligned */
	start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
	if (start < upper)
		start = upper;

	/* system rom */
	request_resource(&iomem_resource, &system_rom_resource);
	upper = system_rom_resource.start;

	/* check for extension rom (ignore length byte!) */
	rom = isa_bus_to_virt(extension_rom_resource.start);
	if (romsignature(rom)) {
		length = extension_rom_resource.end - extension_rom_resource.start + 1;
		if (romchecksum(rom, length)) {
			request_resource(&iomem_resource, &extension_rom_resource);
			upper = extension_rom_resource.start;
		}
	}

	/* check for adapter roms on 2k boundaries */
	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
		rom = isa_bus_to_virt(start);
		if (!romsignature(rom))
			continue;

		/* 0 < length <= 0x7f * 512, historically */
		length = rom[2] * 512;

		/* but accept any length that fits if checksum okay */
		if (!length || start + length > upper || !romchecksum(rom, length))
			continue;

		adapter_rom_resources[i].start = start;
		adapter_rom_resources[i].end = start + length - 1;
		request_resource(&iomem_resource, &adapter_rom_resources[i]);

		/* note: i only advances when a ROM was accepted */
		start = adapter_rom_resources[i++].end & ~2047UL;
	}
}
348
/*
 * Trim the firmware memory map (EFI memmap when efi_enabled, otherwise
 * the e820 map) so that no conventional RAM extends beyond @size bytes.
 * Used when the user limits memory on the command line.
 */
static void __init limit_regions(unsigned long long size)
{
	unsigned long long current_addr = 0;
	int i;

	if (efi_enabled) {
		efi_memory_desc_t *md;
		void *p;

		for (p = memmap.map, i = 0; p < memmap.map_end;
			p += memmap.desc_size, i++) {
			md = p;
			/* end of this descriptor; num_pages is in 4k units */
			current_addr = md->phys_addr + (md->num_pages << 12);
			if (md->type == EFI_CONVENTIONAL_MEMORY) {
				if (current_addr >= size) {
					/* shrink this descriptor to the limit
					 * and drop everything after it */
					md->num_pages -=
						(((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
					memmap.nr_map = i + 1;
					return;
				}
			}
		}
	}
	for (i = 0; i < e820.nr_map; i++) {
		current_addr = e820.map[i].addr + e820.map[i].size;
		if (current_addr < size)
			continue;

		/* only RAM regions are candidates for trimming */
		if (e820.map[i].type != E820_RAM)
			continue;

		if (e820.map[i].addr >= size) {
			/*
			 * This region starts past the end of the
			 * requested size, skip it completely.
			 */
			e820.nr_map = i;
		} else {
			/* region straddles the limit: keep it, shortened */
			e820.nr_map = i + 1;
			e820.map[i].size -= current_addr - size;
		}
		return;
	}
}
393
394void __init add_memory_region(unsigned long long start,
395 unsigned long long size, int type)
396{
397 int x;
398
399 if (!efi_enabled) {
400 x = e820.nr_map;
401
402 if (x == E820MAX) {
403 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
404 return;
405 }
406
407 e820.map[x].addr = start;
408 e820.map[x].size = size;
409 e820.map[x].type = type;
410 e820.nr_map++;
411 }
412} /* add_memory_region */
413
414#define E820_DEBUG 1
415
416static void __init print_memory_map(char *who)
417{
418 int i;
419
420 for (i = 0; i < e820.nr_map; i++) {
421 printk(" %s: %016Lx - %016Lx ", who,
422 e820.map[i].addr,
423 e820.map[i].addr + e820.map[i].size);
424 switch (e820.map[i].type) {
425 case E820_RAM: printk("(usable)\n");
426 break;
427 case E820_RESERVED:
428 printk("(reserved)\n");
429 break;
430 case E820_ACPI:
431 printk("(ACPI data)\n");
432 break;
433 case E820_NVS:
434 printk("(ACPI NVS)\n");
435 break;
436 default: printk("type %lu\n", e820.map[i].type);
437 break;
438 }
439 }
440}
441
/*
 * Sanitize the BIOS e820 map.
 *
 * Some e820 responses include overlapping entries.  The following
 * replaces the original e820 map with a new one, removing overlaps.
 */

/* One change-point: the start or end address of a bios map entry. */
struct change_member {
	struct e820entry *pbios; /* pointer to original bios entry */
	unsigned long long addr; /* address for this change point */
};
/* Scratch state for sanitize_e820_map(); __initdata so it is discarded
 * after boot.  Each bios entry contributes two change-points. */
static struct change_member change_point_list[2*E820MAX] __initdata;
static struct change_member *change_point[2*E820MAX] __initdata;
static struct e820entry *overlap_list[E820MAX] __initdata;
static struct e820entry new_bios[E820MAX] __initdata;
457
/*
 * Rebuild @biosmap in place as a sorted, non-overlapping map.  Where
 * entries overlapped, the highest (most restrictive) type wins.
 * @pnr_map holds the entry count on input and output (a byte, as
 * delivered by the boot-time setup code).  Returns 0 on success, -1 if
 * the map is too small to need work or contains an entry whose
 * start+size wraps around.
 */
int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
{
	struct change_member *change_tmp;
	unsigned long current_type, last_type;
	unsigned long long last_addr;
	int chgidx, still_changing;
	int overlap_entries;
	int new_bios_entry;
	int old_nr, new_nr, chg_nr;
	int i;

	/*
		Visually we're performing the following (1,2,3,4 = memory types)...

		Sample memory map (w/overlaps):
		   ____22__________________
		   ______________________4_
		   ____1111________________
		   _44_____________________
		   11111111________________
		   ____________________33__
		   ___________44___________
		   __________33333_________
		   ______________22________
		   ___________________2222_
		   _________111111111______
		   _____________________11_
		   _________________4______

		Sanitized equivalent (no overlap):
		   1_______________________
		   _44_____________________
		   ___1____________________
		   ____22__________________
		   ______11________________
		   _________1______________
		   __________3_____________
		   ___________44___________
		   _____________33_________
		   _______________2________
		   ________________1_______
		   _________________4______
		   ___________________2____
		   ____________________33__
		   ______________________4_
	*/

	/* if there's only one memory region, don't bother */
	if (*pnr_map < 2)
		return -1;

	old_nr = *pnr_map;

	/* bail out if we find any unreasonable addresses in bios map */
	for (i=0; i<old_nr; i++)
		if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
			return -1;

	/* create pointers for initial change-point information (for sorting) */
	for (i=0; i < 2*old_nr; i++)
		change_point[i] = &change_point_list[i];

	/* record all known change-points (starting and ending addresses),
	   omitting those that are for empty memory regions */
	chgidx = 0;
	for (i=0; i < old_nr; i++)	{
		if (biosmap[i].size != 0) {
			change_point[chgidx]->addr = biosmap[i].addr;
			change_point[chgidx++]->pbios = &biosmap[i];
			change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
			change_point[chgidx++]->pbios = &biosmap[i];
		}
	}
	chg_nr = chgidx;    	/* true number of change-points */

	/* sort change-point list by memory addresses (low -> high)
	 * -- simple bubble sort; the list is small and nearly sorted */
	still_changing = 1;
	while (still_changing)	{
		still_changing = 0;
		for (i=1; i < chg_nr; i++)  {
			/* if <current_addr> > <last_addr>, swap */
			/* or, if current=<start_addr> & last=<end_addr>, swap
			 * (region starts sort before region ends at the same
			 * address, so back-to-back regions stay distinct) */
			if ((change_point[i]->addr < change_point[i-1]->addr) ||
				((change_point[i]->addr == change_point[i-1]->addr) &&
				 (change_point[i]->addr == change_point[i]->pbios->addr) &&
				 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
			   )
			{
				change_tmp = change_point[i];
				change_point[i] = change_point[i-1];
				change_point[i-1] = change_tmp;
				still_changing=1;
			}
		}
	}

	/* create a new bios memory map, removing overlaps */
	overlap_entries=0;	 /* number of entries in the overlap table */
	new_bios_entry=0;	 /* index for creating new bios map entries */
	last_type = 0;		 /* start with undefined memory type */
	last_addr = 0;		 /* start with 0 as last starting address */
	/* loop through change-points, determining effect on the new bios map */
	for (chgidx=0; chgidx < chg_nr; chgidx++)
	{
		/* keep track of all overlapping bios entries */
		if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
		{
			/* add map entry to overlap list (> 1 entry implies an overlap) */
			overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
		}
		else
		{
			/* remove entry from list (order independent, so swap with last) */
			for (i=0; i<overlap_entries; i++)
			{
				if (overlap_list[i] == change_point[chgidx]->pbios)
					overlap_list[i] = overlap_list[overlap_entries-1];
			}
			overlap_entries--;
		}
		/* if there are overlapping entries, decide which "type" to use */
		/* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
		current_type = 0;
		for (i=0; i<overlap_entries; i++)
			if (overlap_list[i]->type > current_type)
				current_type = overlap_list[i]->type;
		/* continue building up new bios map based on this information */
		if (current_type != last_type)	{
			if (last_type != 0)	 {
				new_bios[new_bios_entry].size =
					change_point[chgidx]->addr - last_addr;
				/* move forward only if the new size was non-zero */
				if (new_bios[new_bios_entry].size != 0)
					if (++new_bios_entry >= E820MAX)
						break; 	/* no more space left for new bios entries */
			}
			if (current_type != 0)	{
				new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
				new_bios[new_bios_entry].type = current_type;
				last_addr=change_point[chgidx]->addr;
			}
			last_type = current_type;
		}
	}
	new_nr = new_bios_entry;   /* retain count for new bios entries */

	/* copy new bios mapping into original location */
	memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
	*pnr_map = new_nr;

	return 0;
}
610
/*
 * Copy the BIOS e820 map into a safe place.
 *
 * Sanity-check it while we're at it..
 *
 * If we're lucky and live on a modern system, the setup code
 * will have given us a memory map that we can use to properly
 * set up memory.  If we aren't, we'll fake a memory map.
 *
 * We check to see that the memory map contains at least 2 elements
 * before we'll use it, because the detection code in setup.S may
 * not be perfect and most every PC known to man has two memory
 * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
 * thinkpad 560x, for example, does not cooperate with the memory
 * detection code.)
 */
int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
{
	/* Only one memory region (or negative)? Ignore it */
	if (nr_map < 2)
		return -1;

	do {
		unsigned long long start = biosmap->addr;
		unsigned long long size = biosmap->size;
		unsigned long long end = start + size;
		unsigned long type = biosmap->type;

		/* Overflow in 64 bits? Ignore the memory map. */
		if (start > end)
			return -1;

		/*
		 * Some BIOSes claim RAM in the 640k - 1M region.
		 * Not right. Fix it up: keep any part below 640k as
		 * RAM, discard the 640k-1M hole, keep anything above
		 * 1M.
		 */
		if (type == E820_RAM) {
			if (start < 0x100000ULL && end > 0xA0000ULL) {
				if (start < 0xA0000ULL)
					add_memory_region(start, 0xA0000ULL-start, type);
				if (end <= 0x100000ULL)
					continue;
				start = 0x100000ULL;
				size = end - start;
			}
		}
		add_memory_region(start, size, type);
		/* note: "continue" above still executes this loop
		 * condition, so biosmap/nr_map always advance */
	} while (biosmap++,--nr_map);
	return 0;
}
661
662#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 139#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
663struct edd edd; 140struct edd edd;
664#ifdef CONFIG_EDD_MODULE 141#ifdef CONFIG_EDD_MODULE
@@ -682,7 +159,7 @@ static inline void copy_edd(void)
682} 159}
683#endif 160#endif
684 161
685static int __initdata user_defined_memmap = 0; 162int __initdata user_defined_memmap = 0;
686 163
687/* 164/*
688 * "mem=nopentium" disables the 4MB page tables. 165 * "mem=nopentium" disables the 4MB page tables.
@@ -719,51 +196,6 @@ static int __init parse_mem(char *arg)
719} 196}
720early_param("mem", parse_mem); 197early_param("mem", parse_mem);
721 198
/*
 * Parse the "memmap=" boot option:
 *   memmap=exactmap   clear the map; later memmap= options define it exactly
 *   memmap=nn@ss      add a RAM region of nn bytes at address ss
 *   memmap=nn#ss      add an ACPI-data region
 *   memmap=nn$ss      add a reserved region
 *   memmap=nn         clamp the BIOS-provided map to nn bytes
 */
static int __init parse_memmap(char *arg)
{
	if (!arg)
		return -EINVAL;

	if (strcmp(arg, "exactmap") == 0) {
#ifdef CONFIG_CRASH_DUMP
		/* If we are doing a crash dump, we
		 * still need to know the real mem
		 * size before original memory map is
		 * reset.
		 */
		find_max_pfn();
		saved_max_pfn = max_pfn;
#endif
		e820.nr_map = 0;
		user_defined_memmap = 1;
	} else {
		/* If the user specifies memory size, we
		 * limit the BIOS-provided memory map to
		 * that size. exactmap can be used to specify
		 * the exact map. mem=number can be used to
		 * trim the existing memory map.
		 */
		unsigned long long start_at, mem_size;

		mem_size = memparse(arg, &arg);
		if (*arg == '@') {
			start_at = memparse(arg+1, &arg);
			add_memory_region(start_at, mem_size, E820_RAM);
		} else if (*arg == '#') {
			start_at = memparse(arg+1, &arg);
			add_memory_region(start_at, mem_size, E820_ACPI);
		} else if (*arg == '$') {
			start_at = memparse(arg+1, &arg);
			add_memory_region(start_at, mem_size, E820_RESERVED);
		} else {
			/* bare size: trim the firmware map to mem_size */
			limit_regions(mem_size);
			user_defined_memmap = 1;
		}
	}
	return 0;
}
early_param("memmap", parse_memmap);
766
767#ifdef CONFIG_PROC_VMCORE 199#ifdef CONFIG_PROC_VMCORE
768/* elfcorehdr= specifies the location of elf core header 200/* elfcorehdr= specifies the location of elf core header
769 * stored by the crashed kernel. 201 * stored by the crashed kernel.
@@ -828,90 +260,6 @@ static int __init parse_reservetop(char *arg)
828early_param("reservetop", parse_reservetop); 260early_param("reservetop", parse_reservetop);
829 261
830/* 262/*
831 * Callback for efi_memory_walk.
832 */
/*
 * Callback for efi_memory_walk: raise *(unsigned long *)arg to the
 * highest PFN covered by any walked range.  Always returns 0 so the
 * walk continues over every descriptor.
 */
static int __init
efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
{
	unsigned long *max_pfn = arg, pfn;

	if (start < end) {
		pfn = PFN_UP(end -1);
		if (pfn > *max_pfn)
			*max_pfn = pfn;
	}
	return 0;
}
845
/*
 * Callback for efi_memory_walk: report each walked range to the sparse
 * memory model (node 0 only on i386).
 */
static int __init
efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
{
	memory_present(0, start, end);
	return 0;
}
852
 /*
  * This function checks if the entire range <start,end> is mapped with
  * type.  A @type of 0 matches entries of any type.
  *
  * Note: this function only works correctly if the e820 table is sorted
  * and not-overlapping, which is the case
  */
int __init
e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
{
	u64 start = s;
	u64 end = e;
	int i;
	for (i = 0; i < e820.nr_map; i++) {
		struct e820entry *ei = &e820.map[i];
		if (type && ei->type != type)
			continue;
		/* is the region (part) in overlap with the current region ?*/
		if (ei->addr >= end || ei->addr + ei->size <= start)
			continue;
		/* if the region is at the beginning of <start,end> we move
		 * start to the end of the region since it's ok until there
		 */
		if (ei->addr <= start)
			start = ei->addr + ei->size;
		/* if start is now at or beyond end, we're done, full
		 * coverage */
		if (start >= end)
			return 1; /* we're done */
	}
	return 0;
}
884
/*
 * Find the highest page frame number we have available and store it in
 * the global max_pfn.  Also feeds each usable range to memory_present()
 * for the sparse memory model.  Uses the EFI memmap when EFI is
 * enabled, the e820 map otherwise.
 */
void __init find_max_pfn(void)
{
	int i;

	max_pfn = 0;
	if (efi_enabled) {
		efi_memmap_walk(efi_find_max_pfn, &max_pfn);
		efi_memmap_walk(efi_memory_present_wrapper, NULL);
		return;
	}

	for (i = 0; i < e820.nr_map; i++) {
		unsigned long start, end;
		/* RAM? */
		if (e820.map[i].type != E820_RAM)
			continue;
		/* round partial pages inward: only whole pages count */
		start = PFN_UP(e820.map[i].addr);
		end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
		if (start >= end)
			continue;
		if (end > max_pfn)
			max_pfn = end;
		memory_present(0, start, end);
	}
}
913
914/*
915 * Determine low and high memory ranges: 263 * Determine low and high memory ranges:
916 */ 264 */
917unsigned long __init find_max_low_pfn(void) 265unsigned long __init find_max_low_pfn(void)
@@ -971,68 +319,6 @@ unsigned long __init find_max_low_pfn(void)
971} 319}
972 320
973/* 321/*
974 * Free all available memory for boot time allocation. Used
975 * as a callback function by efi_memory_walk()
976 */
977
/* Callback for efi_memory_walk: hand the part of [start,end) that lies
 * below max_low_pfn to the bootmem allocator. */
static int __init
free_available_memory(unsigned long start, unsigned long end, void *arg)
{
	/* check max_low_pfn */
	if (start >= (max_low_pfn << PAGE_SHIFT))
		return 0;
	if (end >= (max_low_pfn << PAGE_SHIFT))
		end = max_low_pfn << PAGE_SHIFT;
	if (start < end)
		free_bootmem(start, end - start);

	return 0;
}
/*
 * Register fully available low RAM pages with the bootmem allocator.
 * On EFI systems this walks the EFI memmap; otherwise it walks the
 * e820 map, counting only whole pages of E820_RAM below @max_low_pfn.
 */
static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
{
	int i;

	if (efi_enabled) {
		efi_memmap_walk(free_available_memory, NULL);
		return;
	}
	for (i = 0; i < e820.nr_map; i++) {
		unsigned long curr_pfn, last_pfn, size;
		/*
		 * Reserve usable low memory
		 */
		if (e820.map[i].type != E820_RAM)
			continue;
		/*
		 * We are rounding up the start address of usable memory:
		 */
		curr_pfn = PFN_UP(e820.map[i].addr);
		if (curr_pfn >= max_low_pfn)
			continue;
		/*
		 * ... and at the end of the usable range downwards:
		 */
		last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);

		if (last_pfn > max_low_pfn)
			last_pfn = max_low_pfn;

		/*
		 * .. finally, did all the rounding and playing
		 * around just make the area go away?
		 */
		if (last_pfn <= curr_pfn)
			continue;

		size = last_pfn - curr_pfn;
		free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
	}
}
1034
1035/*
1036 * workaround for Dell systems that neglect to reserve EBDA 322 * workaround for Dell systems that neglect to reserve EBDA
1037 */ 323 */
1038static void __init reserve_ebda_region(void) 324static void __init reserve_ebda_region(void)
@@ -1118,8 +404,8 @@ void __init setup_bootmem_allocator(void)
1118 * the (very unlikely) case of us accidentally initializing the 404 * the (very unlikely) case of us accidentally initializing the
1119 * bootmem allocator with an invalid RAM area. 405 * bootmem allocator with an invalid RAM area.
1120 */ 406 */
1121 reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) + 407 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
1122 bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START)); 408 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
1123 409
1124 /* 410 /*
1125 * reserve physical page 0 - it's a special BIOS page on many boxes, 411 * reserve physical page 0 - it's a special BIOS page on many boxes,
@@ -1162,8 +448,7 @@ void __init setup_bootmem_allocator(void)
1162 if (LOADER_TYPE && INITRD_START) { 448 if (LOADER_TYPE && INITRD_START) {
1163 if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { 449 if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
1164 reserve_bootmem(INITRD_START, INITRD_SIZE); 450 reserve_bootmem(INITRD_START, INITRD_SIZE);
1165 initrd_start = 451 initrd_start = INITRD_START + PAGE_OFFSET;
1166 INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
1167 initrd_end = initrd_start+INITRD_SIZE; 452 initrd_end = initrd_start+INITRD_SIZE;
1168 } 453 }
1169 else { 454 else {
@@ -1200,126 +485,6 @@ void __init remapped_pgdat_init(void)
1200 } 485 }
1201} 486}
1202 487
/*
 * Request address space for all standard RAM and ROM resources
 * and also for regions reported as reserved by the e820.
 * Allocates one struct resource per e820 entry (GFP_ATOMIC: called
 * early via subsys_initcall) and nests the kernel code/data resources
 * inside whichever RAM region contains them.
 */
static void __init
legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
{
	int i;

	probe_roms();
	for (i = 0; i < e820.nr_map; i++) {
		struct resource *res;
#ifndef CONFIG_RESOURCES_64BIT
		/* resource_size_t is 32-bit: skip regions beyond 4GB */
		if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
			continue;
#endif
		res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
		switch (e820.map[i].type) {
		case E820_RAM:	res->name = "System RAM"; break;
		case E820_ACPI:	res->name = "ACPI Tables"; break;
		case E820_NVS:	res->name = "ACPI Non-volatile Storage"; break;
		default:	res->name = "reserved";
		}
		res->start = e820.map[i].addr;
		res->end = res->start + e820.map[i].size - 1;
		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
		if (request_resource(&iomem_resource, res)) {
			kfree(res);
			continue;
		}
		if (e820.map[i].type == E820_RAM) {
			/*
			 *  We don't know which RAM region contains kernel data,
			 *  so we try it repeatedly and let the resource manager
			 *  test it.
			 */
			request_resource(res, code_resource);
			request_resource(res, data_resource);
#ifdef CONFIG_KEXEC
			request_resource(res, &crashk_res);
#endif
		}
	}
}
1247
/*
 * Request address space for all standard resources
 *
 * This is called just before pcibios_init(), which is also a
 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
 */
static int __init request_standard_resources(void)
{
	int i;

	printk("Setting up standard PCI resources\n");
	/* memory resources: EFI memmap or legacy e820-based setup */
	if (efi_enabled)
		efi_initialize_iomem_resources(&code_resource, &data_resource);
	else
		legacy_init_iomem_resources(&code_resource, &data_resource);

	/* EFI systems may still have VGA */
	request_resource(&iomem_resource, &video_ram_resource);

	/* request I/O space for devices used on all i[345]86 PCs */
	for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
		request_resource(&ioport_resource, &standard_io_resources[i]);
	return 0;
}

subsys_initcall(request_standard_resources);
1274
/*
 * Choose where PCI resource allocation should start: just past the
 * biggest gap found in the low 32-bit e820 address space, rounded up,
 * stored in the global pci_mem_start.
 */
static void __init register_memory(void)
{
	unsigned long gapstart, gapsize, round;
	unsigned long long last;
	int	      i;

	/*
	 * Search for the biggest gap in the low 32 bits of the e820
	 * memory space.
	 */
	last = 0x100000000ull;
	gapstart = 0x10000000;		/* fallback: 256MB */
	gapsize = 0x400000;		/* only gaps > 4MB are interesting */
	i = e820.nr_map;
	while (--i >= 0) {
		unsigned long long start = e820.map[i].addr;
		unsigned long long end = start + e820.map[i].size;

		/*
		 * Since "last" is at most 4GB, we know we'll
		 * fit in 32 bits if this condition is true
		 */
		if (last > end) {
			unsigned long gap = last - end;

			if (gap > gapsize) {
				gapsize = gap;
				gapstart = end;
			}
		}
		if (start < last)
			last = start;
	}

	/*
	 * See how much we want to round up: start off with
	 * rounding to the next 1MB area.
	 */
	round = 0x100000;
	while ((gapsize >> 4) > round)
		round += round;
	/* Fun with two's complement */
	pci_mem_start = (gapstart + round) & -round;

	printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
		pci_mem_start, gapstart, gapsize);
}
1322
1323#ifdef CONFIG_MCA 488#ifdef CONFIG_MCA
1324static void set_mca_bus(int x) 489static void set_mca_bus(int x)
1325{ 490{
@@ -1329,6 +494,12 @@ static void set_mca_bus(int x)
1329static void set_mca_bus(int x) { } 494static void set_mca_bus(int x) { }
1330#endif 495#endif
1331 496
497/* Overridden in paravirt.c if CONFIG_PARAVIRT */
498char * __attribute__((weak)) memory_setup(void)
499{
500 return machine_specific_memory_setup();
501}
502
1332/* 503/*
1333 * Determine if we were loaded by an EFI loader. If so, then we have also been 504 * Determine if we were loaded by an EFI loader. If so, then we have also been
1334 * passed the efi memmap, systab, etc., so we should use these data structures 505 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -1381,7 +552,7 @@ void __init setup_arch(char **cmdline_p)
1381 efi_init(); 552 efi_init();
1382 else { 553 else {
1383 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 554 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1384 print_memory_map(machine_specific_memory_setup()); 555 print_memory_map(memory_setup());
1385 } 556 }
1386 557
1387 copy_edd(); 558 copy_edd();
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 43002cfb40c4..65d7620eaa09 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -128,7 +128,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ 128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) 129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
130 130
131 GET_SEG(gs); 131 COPY_SEG(gs);
132 GET_SEG(fs); 132 GET_SEG(fs);
133 COPY_SEG(es); 133 COPY_SEG(es);
134 COPY_SEG(ds); 134 COPY_SEG(ds);
@@ -244,9 +244,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
244{ 244{
245 int tmp, err = 0; 245 int tmp, err = 0;
246 246
247 tmp = 0; 247 err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs);
248 savesegment(gs, tmp);
249 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
250 savesegment(fs, tmp); 248 savesegment(fs, tmp);
251 err |= __put_user(tmp, (unsigned int __user *)&sc->fs); 249 err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
252 250
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 31e5c6573aae..5285aff8367f 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -321,7 +321,6 @@ static inline void leave_mm (unsigned long cpu)
321 321
322fastcall void smp_invalidate_interrupt(struct pt_regs *regs) 322fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
323{ 323{
324 struct pt_regs *old_regs = set_irq_regs(regs);
325 unsigned long cpu; 324 unsigned long cpu;
326 325
327 cpu = get_cpu(); 326 cpu = get_cpu();
@@ -352,7 +351,6 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
352 smp_mb__after_clear_bit(); 351 smp_mb__after_clear_bit();
353out: 352out:
354 put_cpu_no_resched(); 353 put_cpu_no_resched();
355 set_irq_regs(old_regs);
356} 354}
357 355
358static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, 356static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
@@ -607,14 +605,11 @@ void smp_send_stop(void)
607 */ 605 */
608fastcall void smp_reschedule_interrupt(struct pt_regs *regs) 606fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
609{ 607{
610 struct pt_regs *old_regs = set_irq_regs(regs);
611 ack_APIC_irq(); 608 ack_APIC_irq();
612 set_irq_regs(old_regs);
613} 609}
614 610
615fastcall void smp_call_function_interrupt(struct pt_regs *regs) 611fastcall void smp_call_function_interrupt(struct pt_regs *regs)
616{ 612{
617 struct pt_regs *old_regs = set_irq_regs(regs);
618 void (*func) (void *info) = call_data->func; 613 void (*func) (void *info) = call_data->func;
619 void *info = call_data->info; 614 void *info = call_data->info;
620 int wait = call_data->wait; 615 int wait = call_data->wait;
@@ -637,7 +632,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
637 mb(); 632 mb();
638 atomic_inc(&call_data->finished); 633 atomic_inc(&call_data->finished);
639 } 634 }
640 set_irq_regs(old_regs);
641} 635}
642 636
643/* 637/*
@@ -699,6 +693,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
699 put_cpu(); 693 put_cpu();
700 return -EBUSY; 694 return -EBUSY;
701 } 695 }
696
697 /* Can deadlock when called with interrupts disabled */
698 WARN_ON(irqs_disabled());
699
702 spin_lock_bh(&call_lock); 700 spin_lock_bh(&call_lock);
703 __smp_call_function_single(cpu, func, info, nonatomic, wait); 701 __smp_call_function_single(cpu, func, info, nonatomic, wait);
704 spin_unlock_bh(&call_lock); 702 spin_unlock_bh(&call_lock);
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 4bb8b77cd65b..b0f84e5778ad 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -33,6 +33,11 @@
33 * Dave Jones : Report invalid combinations of Athlon CPUs. 33 * Dave Jones : Report invalid combinations of Athlon CPUs.
34* Rusty Russell : Hacked into shape for new "hotplug" boot process. */ 34* Rusty Russell : Hacked into shape for new "hotplug" boot process. */
35 35
36
37/* SMP boot always wants to use real time delay to allow sufficient time for
38 * the APs to come online */
39#define USE_REAL_TIME_DELAY
40
36#include <linux/module.h> 41#include <linux/module.h>
37#include <linux/init.h> 42#include <linux/init.h>
38#include <linux/kernel.h> 43#include <linux/kernel.h>
@@ -52,6 +57,8 @@
52#include <asm/desc.h> 57#include <asm/desc.h>
53#include <asm/arch_hooks.h> 58#include <asm/arch_hooks.h>
54#include <asm/nmi.h> 59#include <asm/nmi.h>
60#include <asm/pda.h>
61#include <asm/genapic.h>
55 62
56#include <mach_apic.h> 63#include <mach_apic.h>
57#include <mach_wakecpu.h> 64#include <mach_wakecpu.h>
@@ -62,7 +69,7 @@ static int __devinitdata smp_b_stepping;
62 69
63/* Number of siblings per CPU package */ 70/* Number of siblings per CPU package */
64int smp_num_siblings = 1; 71int smp_num_siblings = 1;
65#ifdef CONFIG_X86_HT 72#ifdef CONFIG_SMP
66EXPORT_SYMBOL(smp_num_siblings); 73EXPORT_SYMBOL(smp_num_siblings);
67#endif 74#endif
68 75
@@ -536,11 +543,11 @@ set_cpu_sibling_map(int cpu)
536static void __devinit start_secondary(void *unused) 543static void __devinit start_secondary(void *unused)
537{ 544{
538 /* 545 /*
539 * Dont put anything before smp_callin(), SMP 546 * Don't put *anything* before secondary_cpu_init(), SMP
540 * booting is too fragile that we want to limit the 547 * booting is too fragile that we want to limit the
541 * things done here to the most necessary things. 548 * things done here to the most necessary things.
542 */ 549 */
543 cpu_init(); 550 secondary_cpu_init();
544 preempt_disable(); 551 preempt_disable();
545 smp_callin(); 552 smp_callin();
546 while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) 553 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
@@ -599,13 +606,16 @@ void __devinit initialize_secondary(void)
599 "movl %0,%%esp\n\t" 606 "movl %0,%%esp\n\t"
600 "jmp *%1" 607 "jmp *%1"
601 : 608 :
602 :"r" (current->thread.esp),"r" (current->thread.eip)); 609 :"m" (current->thread.esp),"m" (current->thread.eip));
603} 610}
604 611
612/* Static state in head.S used to set up a CPU */
605extern struct { 613extern struct {
606 void * esp; 614 void * esp;
607 unsigned short ss; 615 unsigned short ss;
608} stack_start; 616} stack_start;
617extern struct i386_pda *start_pda;
618extern struct Xgt_desc_struct cpu_gdt_descr;
609 619
610#ifdef CONFIG_NUMA 620#ifdef CONFIG_NUMA
611 621
@@ -936,9 +946,6 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
936 unsigned long start_eip; 946 unsigned long start_eip;
937 unsigned short nmi_high = 0, nmi_low = 0; 947 unsigned short nmi_high = 0, nmi_low = 0;
938 948
939 ++cpucount;
940 alternatives_smp_switch(1);
941
942 /* 949 /*
943 * We can't use kernel_thread since we must avoid to 950 * We can't use kernel_thread since we must avoid to
944 * reschedule the child. 951 * reschedule the child.
@@ -946,15 +953,30 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
946 idle = alloc_idle_task(cpu); 953 idle = alloc_idle_task(cpu);
947 if (IS_ERR(idle)) 954 if (IS_ERR(idle))
948 panic("failed fork for CPU %d", cpu); 955 panic("failed fork for CPU %d", cpu);
956
957 /* Pre-allocate and initialize the CPU's GDT and PDA so it
958 doesn't have to do any memory allocation during the
959 delicate CPU-bringup phase. */
960 if (!init_gdt(cpu, idle)) {
961 printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
962 return -1; /* ? */
963 }
964
949 idle->thread.eip = (unsigned long) start_secondary; 965 idle->thread.eip = (unsigned long) start_secondary;
950 /* start_eip had better be page-aligned! */ 966 /* start_eip had better be page-aligned! */
951 start_eip = setup_trampoline(); 967 start_eip = setup_trampoline();
952 968
969 ++cpucount;
970 alternatives_smp_switch(1);
971
953 /* So we see what's up */ 972 /* So we see what's up */
954 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); 973 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
955 /* Stack for startup_32 can be just as for start_secondary onwards */ 974 /* Stack for startup_32 can be just as for start_secondary onwards */
956 stack_start.esp = (void *) idle->thread.esp; 975 stack_start.esp = (void *) idle->thread.esp;
957 976
977 start_pda = cpu_pda(cpu);
978 cpu_gdt_descr = per_cpu(cpu_gdt_descr, cpu);
979
958 irq_ctx_init(cpu); 980 irq_ctx_init(cpu);
959 981
960 x86_cpu_to_apicid[cpu] = apicid; 982 x86_cpu_to_apicid[cpu] = apicid;
@@ -1049,13 +1071,15 @@ void cpu_exit_clear(void)
1049 1071
1050struct warm_boot_cpu_info { 1072struct warm_boot_cpu_info {
1051 struct completion *complete; 1073 struct completion *complete;
1074 struct work_struct task;
1052 int apicid; 1075 int apicid;
1053 int cpu; 1076 int cpu;
1054}; 1077};
1055 1078
1056static void __cpuinit do_warm_boot_cpu(void *p) 1079static void __cpuinit do_warm_boot_cpu(struct work_struct *work)
1057{ 1080{
1058 struct warm_boot_cpu_info *info = p; 1081 struct warm_boot_cpu_info *info =
1082 container_of(work, struct warm_boot_cpu_info, task);
1059 do_boot_cpu(info->apicid, info->cpu); 1083 do_boot_cpu(info->apicid, info->cpu);
1060 complete(info->complete); 1084 complete(info->complete);
1061} 1085}
@@ -1064,7 +1088,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
1064{ 1088{
1065 DECLARE_COMPLETION_ONSTACK(done); 1089 DECLARE_COMPLETION_ONSTACK(done);
1066 struct warm_boot_cpu_info info; 1090 struct warm_boot_cpu_info info;
1067 struct work_struct task;
1068 int apicid, ret; 1091 int apicid, ret;
1069 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); 1092 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1070 1093
@@ -1089,15 +1112,15 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
1089 info.complete = &done; 1112 info.complete = &done;
1090 info.apicid = apicid; 1113 info.apicid = apicid;
1091 info.cpu = cpu; 1114 info.cpu = cpu;
1092 INIT_WORK(&task, do_warm_boot_cpu, &info); 1115 INIT_WORK(&info.task, do_warm_boot_cpu);
1093 1116
1094 tsc_sync_disabled = 1; 1117 tsc_sync_disabled = 1;
1095 1118
1096 /* init low mem mapping */ 1119 /* init low mem mapping */
1097 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 1120 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
1098 KERNEL_PGD_PTRS); 1121 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
1099 flush_tlb_all(); 1122 flush_tlb_all();
1100 schedule_work(&task); 1123 schedule_work(&info.task);
1101 wait_for_completion(&done); 1124 wait_for_completion(&done);
1102 1125
1103 tsc_sync_disabled = 0; 1126 tsc_sync_disabled = 0;
@@ -1108,34 +1131,15 @@ exit:
1108} 1131}
1109#endif 1132#endif
1110 1133
1111static void smp_tune_scheduling (void) 1134static void smp_tune_scheduling(void)
1112{ 1135{
1113 unsigned long cachesize; /* kB */ 1136 unsigned long cachesize; /* kB */
1114 unsigned long bandwidth = 350; /* MB/s */
1115 /*
1116 * Rough estimation for SMP scheduling, this is the number of
1117 * cycles it takes for a fully memory-limited process to flush
1118 * the SMP-local cache.
1119 *
1120 * (For a P5 this pretty much means we will choose another idle
1121 * CPU almost always at wakeup time (this is due to the small
1122 * L1 cache), on PIIs it's around 50-100 usecs, depending on
1123 * the cache size)
1124 */
1125 1137
1126 if (!cpu_khz) { 1138 if (cpu_khz) {
1127 /*
1128 * this basically disables processor-affinity
1129 * scheduling on SMP without a TSC.
1130 */
1131 return;
1132 } else {
1133 cachesize = boot_cpu_data.x86_cache_size; 1139 cachesize = boot_cpu_data.x86_cache_size;
1134 if (cachesize == -1) { 1140
1135 cachesize = 16; /* Pentiums, 2x8kB cache */ 1141 if (cachesize > 0)
1136 bandwidth = 100; 1142 max_cache_size = cachesize * 1024;
1137 }
1138 max_cache_size = cachesize * 1024;
1139 } 1143 }
1140} 1144}
1141 1145
@@ -1461,6 +1465,12 @@ int __devinit __cpu_up(unsigned int cpu)
1461 cpu_set(cpu, smp_commenced_mask); 1465 cpu_set(cpu, smp_commenced_mask);
1462 while (!cpu_isset(cpu, cpu_online_map)) 1466 while (!cpu_isset(cpu, cpu_online_map))
1463 cpu_relax(); 1467 cpu_relax();
1468
1469#ifdef CONFIG_X86_GENERICARCH
1470 if (num_online_cpus() > 8 && genapic == &apic_default)
1471 panic("Default flat APIC routing can't be used with > 8 cpus\n");
1472#endif
1473
1464 return 0; 1474 return 0;
1465} 1475}
1466 1476
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 713ba39d32c6..7de9117b5a3a 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -27,7 +27,11 @@
27 * Should the kernel map a VDSO page into processes and pass its 27 * Should the kernel map a VDSO page into processes and pass its
28 * address down to glibc upon exec()? 28 * address down to glibc upon exec()?
29 */ 29 */
30#ifdef CONFIG_PARAVIRT
31unsigned int __read_mostly vdso_enabled = 0;
32#else
30unsigned int __read_mostly vdso_enabled = 1; 33unsigned int __read_mostly vdso_enabled = 1;
34#endif
31 35
32EXPORT_SYMBOL_GPL(vdso_enabled); 36EXPORT_SYMBOL_GPL(vdso_enabled);
33 37
@@ -132,7 +136,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
132 goto up_fail; 136 goto up_fail;
133 } 137 }
134 138
135 vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL); 139 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
136 if (!vma) { 140 if (!vma) {
137 ret = -ENOMEM; 141 ret = -ENOMEM;
138 goto up_fail; 142 goto up_fail;
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 78af572fd17c..c505b16c0990 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -56,6 +56,7 @@
56#include <asm/uaccess.h> 56#include <asm/uaccess.h>
57#include <asm/processor.h> 57#include <asm/processor.h>
58#include <asm/timer.h> 58#include <asm/timer.h>
59#include <asm/time.h>
59 60
60#include "mach_time.h" 61#include "mach_time.h"
61 62
@@ -116,10 +117,7 @@ static int set_rtc_mmss(unsigned long nowtime)
116 /* gets recalled with irq locally disabled */ 117 /* gets recalled with irq locally disabled */
117 /* XXX - does irqsave resolve this? -johnstul */ 118 /* XXX - does irqsave resolve this? -johnstul */
118 spin_lock_irqsave(&rtc_lock, flags); 119 spin_lock_irqsave(&rtc_lock, flags);
119 if (efi_enabled) 120 retval = set_wallclock(nowtime);
120 retval = efi_set_rtc_mmss(nowtime);
121 else
122 retval = mach_set_rtc_mmss(nowtime);
123 spin_unlock_irqrestore(&rtc_lock, flags); 121 spin_unlock_irqrestore(&rtc_lock, flags);
124 122
125 return retval; 123 return retval;
@@ -223,10 +221,7 @@ unsigned long get_cmos_time(void)
223 221
224 spin_lock_irqsave(&rtc_lock, flags); 222 spin_lock_irqsave(&rtc_lock, flags);
225 223
226 if (efi_enabled) 224 retval = get_wallclock();
227 retval = efi_get_time();
228 else
229 retval = mach_get_cmos_time();
230 225
231 spin_unlock_irqrestore(&rtc_lock, flags); 226 spin_unlock_irqrestore(&rtc_lock, flags);
232 227
@@ -370,7 +365,7 @@ static void __init hpet_time_init(void)
370 printk("Using HPET for base-timer\n"); 365 printk("Using HPET for base-timer\n");
371 } 366 }
372 367
373 time_init_hook(); 368 do_time_init();
374} 369}
375#endif 370#endif
376 371
@@ -392,5 +387,5 @@ void __init time_init(void)
392 387
393 do_settimeofday(&ts); 388 do_settimeofday(&ts);
394 389
395 time_init_hook(); 390 do_time_init();
396} 391}
diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
index 1a2a979cf6a3..1e4702dfcd01 100644
--- a/arch/i386/kernel/time_hpet.c
+++ b/arch/i386/kernel/time_hpet.c
@@ -132,14 +132,20 @@ int __init hpet_enable(void)
132 * the single HPET timer for system time. 132 * the single HPET timer for system time.
133 */ 133 */
134#ifdef CONFIG_HPET_EMULATE_RTC 134#ifdef CONFIG_HPET_EMULATE_RTC
135 if (!(id & HPET_ID_NUMBER)) 135 if (!(id & HPET_ID_NUMBER)) {
136 iounmap(hpet_virt_address);
137 hpet_virt_address = NULL;
136 return -1; 138 return -1;
139 }
137#endif 140#endif
138 141
139 142
140 hpet_period = hpet_readl(HPET_PERIOD); 143 hpet_period = hpet_readl(HPET_PERIOD);
141 if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) 144 if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) {
145 iounmap(hpet_virt_address);
146 hpet_virt_address = NULL;
142 return -1; 147 return -1;
148 }
143 149
144 /* 150 /*
145 * 64 bit math 151 * 64 bit math
@@ -156,8 +162,11 @@ int __init hpet_enable(void)
156 162
157 hpet_use_timer = id & HPET_ID_LEGSUP; 163 hpet_use_timer = id & HPET_ID_LEGSUP;
158 164
159 if (hpet_timer_stop_set_go(hpet_tick)) 165 if (hpet_timer_stop_set_go(hpet_tick)) {
166 iounmap(hpet_virt_address);
167 hpet_virt_address = NULL;
160 return -1; 168 return -1;
169 }
161 170
162 use_hpet = 1; 171 use_hpet = 1;
163 172
diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 07d6da36a825..79cf608e14ca 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -40,14 +40,18 @@ int arch_register_cpu(int num)
40 * restrictions and assumptions in kernel. This basically 40 * restrictions and assumptions in kernel. This basically
41 * doesnt add a control file, one cannot attempt to offline 41 * doesnt add a control file, one cannot attempt to offline
42 * BSP. 42 * BSP.
43 *
44 * Also certain PCI quirks require not to enable hotplug control
45 * for all CPU's.
43 */ 46 */
44 if (!num) 47 if (num && enable_cpu_hotplug)
45 cpu_devices[num].cpu.no_control = 1; 48 cpu_devices[num].cpu.hotpluggable = 1;
46 49
47 return register_cpu(&cpu_devices[num].cpu, num); 50 return register_cpu(&cpu_devices[num].cpu, num);
48} 51}
49 52
50#ifdef CONFIG_HOTPLUG_CPU 53#ifdef CONFIG_HOTPLUG_CPU
54int enable_cpu_hotplug = 1;
51 55
52void arch_unregister_cpu(int num) { 56void arch_unregister_cpu(int num) {
53 return unregister_cpu(&cpu_devices[num].cpu); 57 return unregister_cpu(&cpu_devices[num].cpu);
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 00489b706d27..2b30dbf8d117 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -29,6 +29,8 @@
29#include <linux/kexec.h> 29#include <linux/kexec.h>
30#include <linux/unwind.h> 30#include <linux/unwind.h>
31#include <linux/uaccess.h> 31#include <linux/uaccess.h>
32#include <linux/nmi.h>
33#include <linux/bug.h>
32 34
33#ifdef CONFIG_EISA 35#ifdef CONFIG_EISA
34#include <linux/ioport.h> 36#include <linux/ioport.h>
@@ -61,9 +63,6 @@ int panic_on_unrecovered_nmi;
61 63
62asmlinkage int system_call(void); 64asmlinkage int system_call(void);
63 65
64struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
65 { 0, 0 }, { 0, 0 } };
66
67/* Do we ignore FPU interrupts ? */ 66/* Do we ignore FPU interrupts ? */
68char ignore_fpu_irq = 0; 67char ignore_fpu_irq = 0;
69 68
@@ -94,7 +93,7 @@ asmlinkage void alignment_check(void);
94asmlinkage void spurious_interrupt_bug(void); 93asmlinkage void spurious_interrupt_bug(void);
95asmlinkage void machine_check(void); 94asmlinkage void machine_check(void);
96 95
97static int kstack_depth_to_print = 24; 96int kstack_depth_to_print = 24;
98#ifdef CONFIG_STACK_UNWIND 97#ifdef CONFIG_STACK_UNWIND
99static int call_trace = 1; 98static int call_trace = 1;
100#else 99#else
@@ -129,15 +128,19 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
129 128
130#ifdef CONFIG_FRAME_POINTER 129#ifdef CONFIG_FRAME_POINTER
131 while (valid_stack_ptr(tinfo, (void *)ebp)) { 130 while (valid_stack_ptr(tinfo, (void *)ebp)) {
131 unsigned long new_ebp;
132 addr = *(unsigned long *)(ebp + 4); 132 addr = *(unsigned long *)(ebp + 4);
133 ops->address(data, addr); 133 ops->address(data, addr);
134 /* 134 /*
135 * break out of recursive entries (such as 135 * break out of recursive entries (such as
136 * end_of_stack_stop_unwind_function): 136 * end_of_stack_stop_unwind_function). Also,
137 * we can never allow a frame pointer to
138 * move downwards!
137 */ 139 */
138 if (ebp == *(unsigned long *)ebp) 140 new_ebp = *(unsigned long *)ebp;
141 if (new_ebp <= ebp)
139 break; 142 break;
140 ebp = *(unsigned long *)ebp; 143 ebp = new_ebp;
141 } 144 }
142#else 145#else
143 while (valid_stack_ptr(tinfo, stack)) { 146 while (valid_stack_ptr(tinfo, stack)) {
@@ -159,16 +162,25 @@ dump_trace_unwind(struct unwind_frame_info *info, void *data)
159{ 162{
160 struct ops_and_data *oad = (struct ops_and_data *)data; 163 struct ops_and_data *oad = (struct ops_and_data *)data;
161 int n = 0; 164 int n = 0;
165 unsigned long sp = UNW_SP(info);
162 166
167 if (arch_unw_user_mode(info))
168 return -1;
163 while (unwind(info) == 0 && UNW_PC(info)) { 169 while (unwind(info) == 0 && UNW_PC(info)) {
164 n++; 170 n++;
165 oad->ops->address(oad->data, UNW_PC(info)); 171 oad->ops->address(oad->data, UNW_PC(info));
166 if (arch_unw_user_mode(info)) 172 if (arch_unw_user_mode(info))
167 break; 173 break;
174 if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1))
175 && sp > UNW_SP(info))
176 break;
177 sp = UNW_SP(info);
168 } 178 }
169 return n; 179 return n;
170} 180}
171 181
182#define MSG(msg) ops->warning(data, msg)
183
172void dump_trace(struct task_struct *task, struct pt_regs *regs, 184void dump_trace(struct task_struct *task, struct pt_regs *regs,
173 unsigned long *stack, 185 unsigned long *stack,
174 struct stacktrace_ops *ops, void *data) 186 struct stacktrace_ops *ops, void *data)
@@ -187,29 +199,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
187 if (unwind_init_frame_info(&info, task, regs) == 0) 199 if (unwind_init_frame_info(&info, task, regs) == 0)
188 unw_ret = dump_trace_unwind(&info, &oad); 200 unw_ret = dump_trace_unwind(&info, &oad);
189 } else if (task == current) 201 } else if (task == current)
190 unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); 202 unw_ret = unwind_init_running(&info, dump_trace_unwind,
203 &oad);
191 else { 204 else {
192 if (unwind_init_blocked(&info, task) == 0) 205 if (unwind_init_blocked(&info, task) == 0)
193 unw_ret = dump_trace_unwind(&info, &oad); 206 unw_ret = dump_trace_unwind(&info, &oad);
194 } 207 }
195 if (unw_ret > 0) { 208 if (unw_ret > 0) {
196 if (call_trace == 1 && !arch_unw_user_mode(&info)) { 209 if (call_trace == 1 && !arch_unw_user_mode(&info)) {
197 ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", 210 ops->warning_symbol(data,
211 "DWARF2 unwinder stuck at %s",
198 UNW_PC(&info)); 212 UNW_PC(&info));
199 if (UNW_SP(&info) >= PAGE_OFFSET) { 213 if (UNW_SP(&info) >= PAGE_OFFSET) {
200 ops->warning(data, "Leftover inexact backtrace:\n"); 214 MSG("Leftover inexact backtrace:");
201 stack = (void *)UNW_SP(&info); 215 stack = (void *)UNW_SP(&info);
202 if (!stack) 216 if (!stack)
203 return; 217 return;
204 ebp = UNW_FP(&info); 218 ebp = UNW_FP(&info);
205 } else 219 } else
206 ops->warning(data, "Full inexact backtrace again:\n"); 220 MSG("Full inexact backtrace again:");
207 } else if (call_trace >= 1) 221 } else if (call_trace >= 1)
208 return; 222 return;
209 else 223 else
210 ops->warning(data, "Full inexact backtrace again:\n"); 224 MSG("Full inexact backtrace again:");
211 } else 225 } else
212 ops->warning(data, "Inexact backtrace:\n"); 226 MSG("Inexact backtrace:");
213 } 227 }
214 if (!stack) { 228 if (!stack) {
215 unsigned long dummy; 229 unsigned long dummy;
@@ -243,6 +257,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
243 stack = (unsigned long*)context->previous_esp; 257 stack = (unsigned long*)context->previous_esp;
244 if (!stack) 258 if (!stack)
245 break; 259 break;
260 touch_nmi_watchdog();
246 } 261 }
247} 262}
248EXPORT_SYMBOL(dump_trace); 263EXPORT_SYMBOL(dump_trace);
@@ -375,7 +390,7 @@ void show_registers(struct pt_regs *regs)
375 * time of the fault.. 390 * time of the fault..
376 */ 391 */
377 if (in_kernel) { 392 if (in_kernel) {
378 u8 __user *eip; 393 u8 *eip;
379 int code_bytes = 64; 394 int code_bytes = 64;
380 unsigned char c; 395 unsigned char c;
381 396
@@ -384,18 +399,20 @@ void show_registers(struct pt_regs *regs)
384 399
385 printk(KERN_EMERG "Code: "); 400 printk(KERN_EMERG "Code: ");
386 401
387 eip = (u8 __user *)regs->eip - 43; 402 eip = (u8 *)regs->eip - 43;
388 if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { 403 if (eip < (u8 *)PAGE_OFFSET ||
404 probe_kernel_address(eip, c)) {
389 /* try starting at EIP */ 405 /* try starting at EIP */
390 eip = (u8 __user *)regs->eip; 406 eip = (u8 *)regs->eip;
391 code_bytes = 32; 407 code_bytes = 32;
392 } 408 }
393 for (i = 0; i < code_bytes; i++, eip++) { 409 for (i = 0; i < code_bytes; i++, eip++) {
394 if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { 410 if (eip < (u8 *)PAGE_OFFSET ||
411 probe_kernel_address(eip, c)) {
395 printk(" Bad EIP value."); 412 printk(" Bad EIP value.");
396 break; 413 break;
397 } 414 }
398 if (eip == (u8 __user *)regs->eip) 415 if (eip == (u8 *)regs->eip)
399 printk("<%02x> ", c); 416 printk("<%02x> ", c);
400 else 417 else
401 printk("%02x ", c); 418 printk("%02x ", c);
@@ -404,43 +421,22 @@ void show_registers(struct pt_regs *regs)
404 printk("\n"); 421 printk("\n");
405} 422}
406 423
407static void handle_BUG(struct pt_regs *regs) 424int is_valid_bugaddr(unsigned long eip)
408{ 425{
409 unsigned long eip = regs->eip;
410 unsigned short ud2; 426 unsigned short ud2;
411 427
412 if (eip < PAGE_OFFSET) 428 if (eip < PAGE_OFFSET)
413 return; 429 return 0;
414 if (probe_kernel_address((unsigned short __user *)eip, ud2)) 430 if (probe_kernel_address((unsigned short *)eip, ud2))
415 return; 431 return 0;
416 if (ud2 != 0x0b0f)
417 return;
418
419 printk(KERN_EMERG "------------[ cut here ]------------\n");
420
421#ifdef CONFIG_DEBUG_BUGVERBOSE
422 do {
423 unsigned short line;
424 char *file;
425 char c;
426
427 if (probe_kernel_address((unsigned short __user *)(eip + 2),
428 line))
429 break;
430 if (__get_user(file, (char * __user *)(eip + 4)) ||
431 (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
432 file = "<bad filename>";
433 432
434 printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line); 433 return ud2 == 0x0b0f;
435 return;
436 } while (0);
437#endif
438 printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n");
439} 434}
440 435
441/* This is gone through when something in the kernel 436/*
442 * has done something bad and is about to be terminated. 437 * This is gone through when something in the kernel has done something bad and
443*/ 438 * is about to be terminated.
439 */
444void die(const char * str, struct pt_regs * regs, long err) 440void die(const char * str, struct pt_regs * regs, long err)
445{ 441{
446 static struct { 442 static struct {
@@ -448,7 +444,7 @@ void die(const char * str, struct pt_regs * regs, long err)
448 u32 lock_owner; 444 u32 lock_owner;
449 int lock_owner_depth; 445 int lock_owner_depth;
450 } die = { 446 } die = {
451 .lock = SPIN_LOCK_UNLOCKED, 447 .lock = __SPIN_LOCK_UNLOCKED(die.lock),
452 .lock_owner = -1, 448 .lock_owner = -1,
453 .lock_owner_depth = 0 449 .lock_owner_depth = 0
454 }; 450 };
@@ -472,7 +468,8 @@ void die(const char * str, struct pt_regs * regs, long err)
472 unsigned long esp; 468 unsigned long esp;
473 unsigned short ss; 469 unsigned short ss;
474 470
475 handle_BUG(regs); 471 report_bug(regs->eip);
472
476 printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); 473 printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
477#ifdef CONFIG_PREEMPT 474#ifdef CONFIG_PREEMPT
478 printk(KERN_EMERG "PREEMPT "); 475 printk(KERN_EMERG "PREEMPT ");
@@ -703,8 +700,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
703{ 700{
704 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " 701 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
705 "CPU %d.\n", reason, smp_processor_id()); 702 "CPU %d.\n", reason, smp_processor_id());
706 printk(KERN_EMERG "You probably have a hardware problem with your RAM " 703 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
707 "chips\n");
708 if (panic_on_unrecovered_nmi) 704 if (panic_on_unrecovered_nmi)
709 panic("NMI: Not continuing"); 705 panic("NMI: Not continuing");
710 706
@@ -769,7 +765,6 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
769 printk(" on CPU%d, eip %08lx, registers:\n", 765 printk(" on CPU%d, eip %08lx, registers:\n",
770 smp_processor_id(), regs->eip); 766 smp_processor_id(), regs->eip);
771 show_registers(regs); 767 show_registers(regs);
772 printk(KERN_EMERG "console shuts up ...\n");
773 console_silent(); 768 console_silent();
774 spin_unlock(&nmi_print_lock); 769 spin_unlock(&nmi_print_lock);
775 bust_spinlocks(0); 770 bust_spinlocks(0);
@@ -1084,49 +1079,24 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
1084#endif 1079#endif
1085} 1080}
1086 1081
1087fastcall void setup_x86_bogus_stack(unsigned char * stk) 1082fastcall unsigned long patch_espfix_desc(unsigned long uesp,
1088{ 1083 unsigned long kesp)
1089 unsigned long *switch16_ptr, *switch32_ptr;
1090 struct pt_regs *regs;
1091 unsigned long stack_top, stack_bot;
1092 unsigned short iret_frame16_off;
1093 int cpu = smp_processor_id();
1094 /* reserve the space on 32bit stack for the magic switch16 pointer */
1095 memmove(stk, stk + 8, sizeof(struct pt_regs));
1096 switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
1097 regs = (struct pt_regs *)stk;
1098 /* now the switch32 on 16bit stack */
1099 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
1100 stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
1101 switch32_ptr = (unsigned long *)(stack_top - 8);
1102 iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;
1103 /* copy iret frame on 16bit stack */
1104 memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
1105 /* fill in the switch pointers */
1106 switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off;
1107 switch16_ptr[1] = __ESPFIX_SS;
1108 switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
1109 8 - CPU_16BIT_STACK_SIZE;
1110 switch32_ptr[1] = __KERNEL_DS;
1111}
1112
1113fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
1114{ 1084{
1115 unsigned long *switch32_ptr;
1116 unsigned char *stack16, *stack32;
1117 unsigned long stack_top, stack_bot;
1118 int len;
1119 int cpu = smp_processor_id(); 1085 int cpu = smp_processor_id();
1120 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); 1086 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1121 stack_top = stack_bot + CPU_16BIT_STACK_SIZE; 1087 struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
1122 switch32_ptr = (unsigned long *)(stack_top - 8); 1088 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
1123 /* copy the data from 16bit stack to 32bit stack */ 1089 unsigned long new_kesp = kesp - base;
1124 len = CPU_16BIT_STACK_SIZE - 8 - sp; 1090 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
1125 stack16 = (unsigned char *)(stack_bot + sp); 1091 __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
1126 stack32 = (unsigned char *) 1092 /* Set up base for espfix segment */
1127 (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len); 1093 desc &= 0x00f0ff0000000000ULL;
1128 memcpy(stack32, stack16, len); 1094 desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
1129 return stack32; 1095 ((((__u64)base) << 32) & 0xff00000000000000ULL) |
1096 ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
1097 (lim_pages & 0xffff);
1098 *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
1099 return new_kesp;
1130} 1100}
1131 1101
1132/* 1102/*
@@ -1139,7 +1109,7 @@ fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
1139 * Must be called with kernel preemption disabled (in this case, 1109 * Must be called with kernel preemption disabled (in this case,
1140 * local interrupts are disabled at the call-site in entry.S). 1110 * local interrupts are disabled at the call-site in entry.S).
1141 */ 1111 */
1142asmlinkage void math_state_restore(struct pt_regs regs) 1112asmlinkage void math_state_restore(void)
1143{ 1113{
1144 struct thread_info *thread = current_thread_info(); 1114 struct thread_info *thread = current_thread_info();
1145 struct task_struct *tsk = thread->task; 1115 struct task_struct *tsk = thread->task;
@@ -1149,6 +1119,7 @@ asmlinkage void math_state_restore(struct pt_regs regs)
1149 init_fpu(tsk); 1119 init_fpu(tsk);
1150 restore_fpu(tsk); 1120 restore_fpu(tsk);
1151 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 1121 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
1122 tsk->fpu_counter++;
1152} 1123}
1153 1124
1154#ifndef CONFIG_MATH_EMULATION 1125#ifndef CONFIG_MATH_EMULATION
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index b8fa0a8b2e47..1bbe45dca7a0 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -13,7 +13,6 @@
13 13
14#include <asm/delay.h> 14#include <asm/delay.h>
15#include <asm/tsc.h> 15#include <asm/tsc.h>
16#include <asm/delay.h>
17#include <asm/io.h> 16#include <asm/io.h>
18 17
19#include "mach_timer.h" 18#include "mach_timer.h"
@@ -217,7 +216,7 @@ static unsigned int cpufreq_delayed_issched = 0;
217static unsigned int cpufreq_init = 0; 216static unsigned int cpufreq_init = 0;
218static struct work_struct cpufreq_delayed_get_work; 217static struct work_struct cpufreq_delayed_get_work;
219 218
220static void handle_cpufreq_delayed_get(void *v) 219static void handle_cpufreq_delayed_get(struct work_struct *work)
221{ 220{
222 unsigned int cpu; 221 unsigned int cpu;
223 222
@@ -306,7 +305,7 @@ static int __init cpufreq_tsc(void)
306{ 305{
307 int ret; 306 int ret;
308 307
309 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); 308 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get);
310 ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, 309 ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
311 CPUFREQ_TRANSITION_NOTIFIER); 310 CPUFREQ_TRANSITION_NOTIFIER);
312 if (!ret) 311 if (!ret)
@@ -349,8 +348,8 @@ static int tsc_update_callback(void)
349 int change = 0; 348 int change = 0;
350 349
351 /* check to see if we should switch to the safe clocksource: */ 350 /* check to see if we should switch to the safe clocksource: */
352 if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { 351 if (clocksource_tsc.rating != 0 && check_tsc_unstable()) {
353 clocksource_tsc.rating = 50; 352 clocksource_tsc.rating = 0;
354 clocksource_reselect(); 353 clocksource_reselect();
355 change = 1; 354 change = 1;
356 } 355 }
@@ -461,7 +460,7 @@ static int __init init_tsc_clocksource(void)
461 clocksource_tsc.shift); 460 clocksource_tsc.shift);
462 /* lower the rating if we already know its unstable: */ 461 /* lower the rating if we already know its unstable: */
463 if (check_tsc_unstable()) 462 if (check_tsc_unstable())
464 clocksource_tsc.rating = 50; 463 clocksource_tsc.rating = 0;
465 464
466 init_timer(&verify_tsc_freq_timer); 465 init_timer(&verify_tsc_freq_timer);
467 verify_tsc_freq_timer.function = verify_tsc_freq; 466 verify_tsc_freq_timer.function = verify_tsc_freq;
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index cbcd61d6120b..be2f96e67f78 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -43,6 +43,7 @@
43#include <linux/highmem.h> 43#include <linux/highmem.h>
44#include <linux/ptrace.h> 44#include <linux/ptrace.h>
45#include <linux/audit.h> 45#include <linux/audit.h>
46#include <linux/stddef.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48#include <asm/io.h> 49#include <asm/io.h>
@@ -72,10 +73,10 @@
72/* 73/*
73 * 8- and 16-bit register defines.. 74 * 8- and 16-bit register defines..
74 */ 75 */
75#define AL(regs) (((unsigned char *)&((regs)->eax))[0]) 76#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0])
76#define AH(regs) (((unsigned char *)&((regs)->eax))[1]) 77#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1])
77#define IP(regs) (*(unsigned short *)&((regs)->eip)) 78#define IP(regs) (*(unsigned short *)&((regs)->pt.eip))
78#define SP(regs) (*(unsigned short *)&((regs)->esp)) 79#define SP(regs) (*(unsigned short *)&((regs)->pt.esp))
79 80
80/* 81/*
81 * virtual flags (16 and 32-bit versions) 82 * virtual flags (16 and 32-bit versions)
@@ -89,10 +90,37 @@
89#define SAFE_MASK (0xDD5) 90#define SAFE_MASK (0xDD5)
90#define RETURN_MASK (0xDFF) 91#define RETURN_MASK (0xDFF)
91 92
92#define VM86_REGS_PART2 orig_eax 93/* convert kernel_vm86_regs to vm86_regs */
93#define VM86_REGS_SIZE1 \ 94static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
94 ( (unsigned)( & (((struct kernel_vm86_regs *)0)->VM86_REGS_PART2) ) ) 95 const struct kernel_vm86_regs *regs)
95#define VM86_REGS_SIZE2 (sizeof(struct kernel_vm86_regs) - VM86_REGS_SIZE1) 96{
97 int ret = 0;
98
99 /* kernel_vm86_regs is missing xfs, so copy everything up to
100 (but not including) xgs, and then rest after xgs. */
101 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs));
102 ret += copy_to_user(&user->__null_gs, &regs->pt.xgs,
103 sizeof(struct kernel_vm86_regs) -
104 offsetof(struct kernel_vm86_regs, pt.xgs));
105
106 return ret;
107}
108
109/* convert vm86_regs to kernel_vm86_regs */
110static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
111 const struct vm86_regs __user *user,
112 unsigned extra)
113{
114 int ret = 0;
115
116 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs));
117 ret += copy_from_user(&regs->pt.xgs, &user->__null_gs,
118 sizeof(struct kernel_vm86_regs) -
119 offsetof(struct kernel_vm86_regs, pt.xgs) +
120 extra);
121
122 return ret;
123}
96 124
97struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); 125struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
98struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) 126struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
@@ -112,10 +140,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
112 printk("no vm86_info: BAD\n"); 140 printk("no vm86_info: BAD\n");
113 do_exit(SIGSEGV); 141 do_exit(SIGSEGV);
114 } 142 }
115 set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); 143 set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
116 tmp = copy_to_user(&current->thread.vm86_info->regs,regs, VM86_REGS_SIZE1); 144 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
117 tmp += copy_to_user(&current->thread.vm86_info->regs.VM86_REGS_PART2,
118 &regs->VM86_REGS_PART2, VM86_REGS_SIZE2);
119 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap); 145 tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
120 if (tmp) { 146 if (tmp) {
121 printk("vm86: could not access userspace vm86_info\n"); 147 printk("vm86: could not access userspace vm86_info\n");
@@ -129,9 +155,11 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
129 current->thread.saved_esp0 = 0; 155 current->thread.saved_esp0 = 0;
130 put_cpu(); 156 put_cpu();
131 157
132 loadsegment(fs, current->thread.saved_fs);
133 loadsegment(gs, current->thread.saved_gs);
134 ret = KVM86->regs32; 158 ret = KVM86->regs32;
159
160 loadsegment(fs, current->thread.saved_fs);
161 ret->xgs = current->thread.saved_gs;
162
135 return ret; 163 return ret;
136} 164}
137 165
@@ -183,9 +211,9 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
183 tsk = current; 211 tsk = current;
184 if (tsk->thread.saved_esp0) 212 if (tsk->thread.saved_esp0)
185 goto out; 213 goto out;
186 tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); 214 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
187 tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, 215 offsetof(struct kernel_vm86_struct, vm86plus) -
188 (long)&info.vm86plus - (long)&info.regs.VM86_REGS_PART2); 216 sizeof(info.regs));
189 ret = -EFAULT; 217 ret = -EFAULT;
190 if (tmp) 218 if (tmp)
191 goto out; 219 goto out;
@@ -233,9 +261,9 @@ asmlinkage int sys_vm86(struct pt_regs regs)
233 if (tsk->thread.saved_esp0) 261 if (tsk->thread.saved_esp0)
234 goto out; 262 goto out;
235 v86 = (struct vm86plus_struct __user *)regs.ecx; 263 v86 = (struct vm86plus_struct __user *)regs.ecx;
236 tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1); 264 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
237 tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2, 265 offsetof(struct kernel_vm86_struct, regs32) -
238 (long)&info.regs32 - (long)&info.regs.VM86_REGS_PART2); 266 sizeof(info.regs));
239 ret = -EFAULT; 267 ret = -EFAULT;
240 if (tmp) 268 if (tmp)
241 goto out; 269 goto out;
@@ -252,15 +280,15 @@ out:
252static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) 280static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
253{ 281{
254 struct tss_struct *tss; 282 struct tss_struct *tss;
255 long eax;
256/* 283/*
257 * make sure the vm86() system call doesn't try to do anything silly 284 * make sure the vm86() system call doesn't try to do anything silly
258 */ 285 */
259 info->regs.__null_ds = 0; 286 info->regs.pt.xds = 0;
260 info->regs.__null_es = 0; 287 info->regs.pt.xes = 0;
288 info->regs.pt.xgs = 0;
261 289
262/* we are clearing fs,gs later just before "jmp resume_userspace", 290/* we are clearing fs later just before "jmp resume_userspace",
263 * because starting with Linux 2.1.x they aren't no longer saved/restored 291 * because it is not saved/restored.
264 */ 292 */
265 293
266/* 294/*
@@ -268,10 +296,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
268 * has set it up safely, so this makes sure interrupt etc flags are 296 * has set it up safely, so this makes sure interrupt etc flags are
269 * inherited from protected mode. 297 * inherited from protected mode.
270 */ 298 */
271 VEFLAGS = info->regs.eflags; 299 VEFLAGS = info->regs.pt.eflags;
272 info->regs.eflags &= SAFE_MASK; 300 info->regs.pt.eflags &= SAFE_MASK;
273 info->regs.eflags |= info->regs32->eflags & ~SAFE_MASK; 301 info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK;
274 info->regs.eflags |= VM_MASK; 302 info->regs.pt.eflags |= VM_MASK;
275 303
276 switch (info->cpu_type) { 304 switch (info->cpu_type) {
277 case CPU_286: 305 case CPU_286:
@@ -294,7 +322,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
294 info->regs32->eax = 0; 322 info->regs32->eax = 0;
295 tsk->thread.saved_esp0 = tsk->thread.esp0; 323 tsk->thread.saved_esp0 = tsk->thread.esp0;
296 savesegment(fs, tsk->thread.saved_fs); 324 savesegment(fs, tsk->thread.saved_fs);
297 savesegment(gs, tsk->thread.saved_gs); 325 tsk->thread.saved_gs = info->regs32->xgs;
298 326
299 tss = &per_cpu(init_tss, get_cpu()); 327 tss = &per_cpu(init_tss, get_cpu());
300 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; 328 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
@@ -306,19 +334,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
306 tsk->thread.screen_bitmap = info->screen_bitmap; 334 tsk->thread.screen_bitmap = info->screen_bitmap;
307 if (info->flags & VM86_SCREEN_BITMAP) 335 if (info->flags & VM86_SCREEN_BITMAP)
308 mark_screen_rdonly(tsk->mm); 336 mark_screen_rdonly(tsk->mm);
309 __asm__ __volatile__("xorl %eax,%eax; movl %eax,%fs; movl %eax,%gs\n\t");
310 __asm__ __volatile__("movl %%eax, %0\n" :"=r"(eax));
311 337
312 /*call audit_syscall_exit since we do not exit via the normal paths */ 338 /*call audit_syscall_exit since we do not exit via the normal paths */
313 if (unlikely(current->audit_context)) 339 if (unlikely(current->audit_context))
314 audit_syscall_exit(AUDITSC_RESULT(eax), eax); 340 audit_syscall_exit(AUDITSC_RESULT(0), 0);
315 341
316 __asm__ __volatile__( 342 __asm__ __volatile__(
317 "movl %0,%%esp\n\t" 343 "movl %0,%%esp\n\t"
318 "movl %1,%%ebp\n\t" 344 "movl %1,%%ebp\n\t"
345 "mov %2, %%fs\n\t"
319 "jmp resume_userspace" 346 "jmp resume_userspace"
320 : /* no outputs */ 347 : /* no outputs */
321 :"r" (&info->regs), "r" (task_thread_info(tsk))); 348 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
322 /* we never return here */ 349 /* we never return here */
323} 350}
324 351
@@ -348,12 +375,12 @@ static inline void clear_IF(struct kernel_vm86_regs * regs)
348 375
349static inline void clear_TF(struct kernel_vm86_regs * regs) 376static inline void clear_TF(struct kernel_vm86_regs * regs)
350{ 377{
351 regs->eflags &= ~TF_MASK; 378 regs->pt.eflags &= ~TF_MASK;
352} 379}
353 380
354static inline void clear_AC(struct kernel_vm86_regs * regs) 381static inline void clear_AC(struct kernel_vm86_regs * regs)
355{ 382{
356 regs->eflags &= ~AC_MASK; 383 regs->pt.eflags &= ~AC_MASK;
357} 384}
358 385
359/* It is correct to call set_IF(regs) from the set_vflags_* 386/* It is correct to call set_IF(regs) from the set_vflags_*
@@ -370,7 +397,7 @@ static inline void clear_AC(struct kernel_vm86_regs * regs)
370static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) 397static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
371{ 398{
372 set_flags(VEFLAGS, eflags, current->thread.v86mask); 399 set_flags(VEFLAGS, eflags, current->thread.v86mask);
373 set_flags(regs->eflags, eflags, SAFE_MASK); 400 set_flags(regs->pt.eflags, eflags, SAFE_MASK);
374 if (eflags & IF_MASK) 401 if (eflags & IF_MASK)
375 set_IF(regs); 402 set_IF(regs);
376 else 403 else
@@ -380,7 +407,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs
380static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) 407static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
381{ 408{
382 set_flags(VFLAGS, flags, current->thread.v86mask); 409 set_flags(VFLAGS, flags, current->thread.v86mask);
383 set_flags(regs->eflags, flags, SAFE_MASK); 410 set_flags(regs->pt.eflags, flags, SAFE_MASK);
384 if (flags & IF_MASK) 411 if (flags & IF_MASK)
385 set_IF(regs); 412 set_IF(regs);
386 else 413 else
@@ -389,7 +416,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg
389 416
390static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) 417static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
391{ 418{
392 unsigned long flags = regs->eflags & RETURN_MASK; 419 unsigned long flags = regs->pt.eflags & RETURN_MASK;
393 420
394 if (VEFLAGS & VIF_MASK) 421 if (VEFLAGS & VIF_MASK)
395 flags |= IF_MASK; 422 flags |= IF_MASK;
@@ -493,7 +520,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
493 unsigned long __user *intr_ptr; 520 unsigned long __user *intr_ptr;
494 unsigned long segoffs; 521 unsigned long segoffs;
495 522
496 if (regs->cs == BIOSSEG) 523 if (regs->pt.xcs == BIOSSEG)
497 goto cannot_handle; 524 goto cannot_handle;
498 if (is_revectored(i, &KVM86->int_revectored)) 525 if (is_revectored(i, &KVM86->int_revectored))
499 goto cannot_handle; 526 goto cannot_handle;
@@ -505,9 +532,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
505 if ((segoffs >> 16) == BIOSSEG) 532 if ((segoffs >> 16) == BIOSSEG)
506 goto cannot_handle; 533 goto cannot_handle;
507 pushw(ssp, sp, get_vflags(regs), cannot_handle); 534 pushw(ssp, sp, get_vflags(regs), cannot_handle);
508 pushw(ssp, sp, regs->cs, cannot_handle); 535 pushw(ssp, sp, regs->pt.xcs, cannot_handle);
509 pushw(ssp, sp, IP(regs), cannot_handle); 536 pushw(ssp, sp, IP(regs), cannot_handle);
510 regs->cs = segoffs >> 16; 537 regs->pt.xcs = segoffs >> 16;
511 SP(regs) -= 6; 538 SP(regs) -= 6;
512 IP(regs) = segoffs & 0xffff; 539 IP(regs) = segoffs & 0xffff;
513 clear_TF(regs); 540 clear_TF(regs);
@@ -524,7 +551,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno
524 if (VMPI.is_vm86pus) { 551 if (VMPI.is_vm86pus) {
525 if ( (trapno==3) || (trapno==1) ) 552 if ( (trapno==3) || (trapno==1) )
526 return_to_32bit(regs, VM86_TRAP + (trapno << 8)); 553 return_to_32bit(regs, VM86_TRAP + (trapno << 8));
527 do_int(regs, trapno, (unsigned char __user *) (regs->ss << 4), SP(regs)); 554 do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs));
528 return 0; 555 return 0;
529 } 556 }
530 if (trapno !=1) 557 if (trapno !=1)
@@ -560,10 +587,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
560 handle_vm86_trap(regs, 0, 1); \ 587 handle_vm86_trap(regs, 0, 1); \
561 return; } while (0) 588 return; } while (0)
562 589
563 orig_flags = *(unsigned short *)&regs->eflags; 590 orig_flags = *(unsigned short *)&regs->pt.eflags;
564 591
565 csp = (unsigned char __user *) (regs->cs << 4); 592 csp = (unsigned char __user *) (regs->pt.xcs << 4);
566 ssp = (unsigned char __user *) (regs->ss << 4); 593 ssp = (unsigned char __user *) (regs->pt.xss << 4);
567 sp = SP(regs); 594 sp = SP(regs);
568 ip = IP(regs); 595 ip = IP(regs);
569 596
@@ -650,7 +677,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
650 SP(regs) += 6; 677 SP(regs) += 6;
651 } 678 }
652 IP(regs) = newip; 679 IP(regs) = newip;
653 regs->cs = newcs; 680 regs->pt.xcs = newcs;
654 CHECK_IF_IN_TRAP; 681 CHECK_IF_IN_TRAP;
655 if (data32) { 682 if (data32) {
656 set_vflags_long(newflags, regs); 683 set_vflags_long(newflags, regs);
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 1e7ac1c44ddc..a53c8b1854b5 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -1,18 +1,32 @@
1/* ld script to make i386 Linux kernel 1/* ld script to make i386 Linux kernel
2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; 2 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
3 *
4 * Don't define absolute symbols until and unless you know that symbol
5 * value is should remain constant even if kernel image is relocated
6 * at run time. Absolute symbols are not relocated. If symbol value should
7 * change if kernel is relocated, make the symbol section relative and
8 * put it inside the section definition.
3 */ 9 */
4 10
11/* Don't define absolute symbols until and unless you know that symbol
12 * value is should remain constant even if kernel image is relocated
13 * at run time. Absolute symbols are not relocated. If symbol value should
14 * change if kernel is relocated, make the symbol section relative and
15 * put it inside the section definition.
16 */
5#define LOAD_OFFSET __PAGE_OFFSET 17#define LOAD_OFFSET __PAGE_OFFSET
6 18
7#include <asm-generic/vmlinux.lds.h> 19#include <asm-generic/vmlinux.lds.h>
8#include <asm/thread_info.h> 20#include <asm/thread_info.h>
9#include <asm/page.h> 21#include <asm/page.h>
10#include <asm/cache.h> 22#include <asm/cache.h>
23#include <asm/boot.h>
11 24
12OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") 25OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
13OUTPUT_ARCH(i386) 26OUTPUT_ARCH(i386)
14ENTRY(phys_startup_32) 27ENTRY(phys_startup_32)
15jiffies = jiffies_64; 28jiffies = jiffies_64;
29_proxy_pda = 0;
16 30
17PHDRS { 31PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */ 32 text PT_LOAD FLAGS(5); /* R_E */
@@ -21,46 +35,58 @@ PHDRS {
21} 35}
22SECTIONS 36SECTIONS
23{ 37{
24 . = __KERNEL_START; 38 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
25 phys_startup_32 = startup_32 - LOAD_OFFSET; 39 phys_startup_32 = startup_32 - LOAD_OFFSET;
26 /* read-only */ 40 /* read-only */
27 _text = .; /* Text and read-only data */
28 .text : AT(ADDR(.text) - LOAD_OFFSET) { 41 .text : AT(ADDR(.text) - LOAD_OFFSET) {
42 _text = .; /* Text and read-only data */
29 *(.text) 43 *(.text)
30 SCHED_TEXT 44 SCHED_TEXT
31 LOCK_TEXT 45 LOCK_TEXT
32 KPROBES_TEXT 46 KPROBES_TEXT
33 *(.fixup) 47 *(.fixup)
34 *(.gnu.warning) 48 *(.gnu.warning)
35 } :text = 0x9090 49 _etext = .; /* End of text section */
36 50 } :text = 0x9090
37 _etext = .; /* End of text section */
38 51
39 . = ALIGN(16); /* Exception table */ 52 . = ALIGN(16); /* Exception table */
40 __start___ex_table = .; 53 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
41 __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } 54 __start___ex_table = .;
42 __stop___ex_table = .; 55 *(__ex_table)
56 __stop___ex_table = .;
57 }
43 58
44 RODATA 59 RODATA
45 60
61 BUG_TABLE
62
46 . = ALIGN(4); 63 . = ALIGN(4);
47 __tracedata_start = .;
48 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { 64 .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
65 __tracedata_start = .;
49 *(.tracedata) 66 *(.tracedata)
67 __tracedata_end = .;
50 } 68 }
51 __tracedata_end = .;
52 69
53 /* writeable */ 70 /* writeable */
71 . = ALIGN(4096);
54 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ 72 .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */
55 *(.data) 73 *(.data)
56 CONSTRUCTORS 74 CONSTRUCTORS
57 } :data 75 } :data
58 76
77 .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) {
78 __start_paravirtprobe = .;
79 *(.paravirtprobe)
80 __stop_paravirtprobe = .;
81 }
82
59 . = ALIGN(4096); 83 . = ALIGN(4096);
60 __nosave_begin = .; 84 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
61 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } 85 __nosave_begin = .;
62 . = ALIGN(4096); 86 *(.data.nosave)
63 __nosave_end = .; 87 . = ALIGN(4096);
88 __nosave_end = .;
89 }
64 90
65 . = ALIGN(4096); 91 . = ALIGN(4096);
66 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { 92 .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
@@ -74,17 +100,10 @@ SECTIONS
74 100
75 /* rarely changed data like cpu maps */ 101 /* rarely changed data like cpu maps */
76 . = ALIGN(32); 102 . = ALIGN(32);
77 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } 103 .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
78 _edata = .; /* End of data section */ 104 *(.data.read_mostly)
79 105 _edata = .; /* End of data section */
80#ifdef CONFIG_STACK_UNWIND
81 . = ALIGN(4);
82 .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
83 __start_unwind = .;
84 *(.eh_frame)
85 __end_unwind = .;
86 } 106 }
87#endif
88 107
89 . = ALIGN(THREAD_SIZE); /* init_task */ 108 . = ALIGN(THREAD_SIZE); /* init_task */
90 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { 109 .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
@@ -93,94 +112,102 @@ SECTIONS
93 112
94 /* might get freed after init */ 113 /* might get freed after init */
95 . = ALIGN(4096); 114 . = ALIGN(4096);
96 __smp_alt_begin = .;
97 __smp_alt_instructions = .;
98 .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { 115 .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) {
116 __smp_alt_begin = .;
117 __smp_alt_instructions = .;
99 *(.smp_altinstructions) 118 *(.smp_altinstructions)
119 __smp_alt_instructions_end = .;
100 } 120 }
101 __smp_alt_instructions_end = .;
102 . = ALIGN(4); 121 . = ALIGN(4);
103 __smp_locks = .;
104 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { 122 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
123 __smp_locks = .;
105 *(.smp_locks) 124 *(.smp_locks)
125 __smp_locks_end = .;
106 } 126 }
107 __smp_locks_end = .;
108 .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { 127 .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) {
109 *(.smp_altinstr_replacement) 128 *(.smp_altinstr_replacement)
129 __smp_alt_end = .;
110 } 130 }
131 /* will be freed after init
132 * Following ALIGN() is required to make sure no other data falls on the
133 * same page where __smp_alt_end is pointing as that page might be freed
134 * after boot. Always make sure that ALIGN() directive is present after
135 * the section which contains __smp_alt_end.
136 */
111 . = ALIGN(4096); 137 . = ALIGN(4096);
112 __smp_alt_end = .;
113 138
114 /* will be freed after init */ 139 /* will be freed after init */
115 . = ALIGN(4096); /* Init code and data */ 140 . = ALIGN(4096); /* Init code and data */
116 __init_begin = .;
117 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { 141 .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
142 __init_begin = .;
118 _sinittext = .; 143 _sinittext = .;
119 *(.init.text) 144 *(.init.text)
120 _einittext = .; 145 _einittext = .;
121 } 146 }
122 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } 147 .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
123 . = ALIGN(16); 148 . = ALIGN(16);
124 __setup_start = .; 149 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
125 .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } 150 __setup_start = .;
126 __setup_end = .; 151 *(.init.setup)
127 __initcall_start = .; 152 __setup_end = .;
153 }
128 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { 154 .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
129 *(.initcall1.init) 155 __initcall_start = .;
130 *(.initcall2.init) 156 INITCALLS
131 *(.initcall3.init) 157 __initcall_end = .;
132 *(.initcall4.init) 158 }
133 *(.initcall5.init)
134 *(.initcall6.init)
135 *(.initcall7.init)
136 }
137 __initcall_end = .;
138 __con_initcall_start = .;
139 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { 159 .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
160 __con_initcall_start = .;
140 *(.con_initcall.init) 161 *(.con_initcall.init)
162 __con_initcall_end = .;
141 } 163 }
142 __con_initcall_end = .;
143 SECURITY_INIT 164 SECURITY_INIT
144 . = ALIGN(4); 165 . = ALIGN(4);
145 __alt_instructions = .;
146 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { 166 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
167 __alt_instructions = .;
147 *(.altinstructions) 168 *(.altinstructions)
169 __alt_instructions_end = .;
148 } 170 }
149 __alt_instructions_end = .;
150 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { 171 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
151 *(.altinstr_replacement) 172 *(.altinstr_replacement)
152 } 173 }
174 . = ALIGN(4);
175 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
176 __start_parainstructions = .;
177 *(.parainstructions)
178 __stop_parainstructions = .;
179 }
153 /* .exit.text is discard at runtime, not link time, to deal with references 180 /* .exit.text is discard at runtime, not link time, to deal with references
154 from .altinstructions and .eh_frame */ 181 from .altinstructions and .eh_frame */
155 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } 182 .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
156 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } 183 .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
157 . = ALIGN(4096); 184 . = ALIGN(4096);
158 __initramfs_start = .; 185 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
159 .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } 186 __initramfs_start = .;
160 __initramfs_end = .; 187 *(.init.ramfs)
188 __initramfs_end = .;
189 }
161 . = ALIGN(L1_CACHE_BYTES); 190 . = ALIGN(L1_CACHE_BYTES);
162 __per_cpu_start = .; 191 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
163 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } 192 __per_cpu_start = .;
164 __per_cpu_end = .; 193 *(.data.percpu)
194 __per_cpu_end = .;
195 }
165 . = ALIGN(4096); 196 . = ALIGN(4096);
166 __init_end = .;
167 /* freed after init ends here */ 197 /* freed after init ends here */
168 198
169 __bss_start = .; /* BSS */
170 .bss.page_aligned : AT(ADDR(.bss.page_aligned) - LOAD_OFFSET) {
171 *(.bss.page_aligned)
172 }
173 .bss : AT(ADDR(.bss) - LOAD_OFFSET) { 199 .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
200 __init_end = .;
201 __bss_start = .; /* BSS */
202 *(.bss.page_aligned)
174 *(.bss) 203 *(.bss)
204 . = ALIGN(4);
205 __bss_stop = .;
206 _end = . ;
207 /* This is where the kernel creates the early boot page tables */
208 . = ALIGN(4096);
209 pg0 = . ;
175 } 210 }
176 . = ALIGN(4);
177 __bss_stop = .;
178
179 _end = . ;
180
181 /* This is where the kernel creates the early boot page tables */
182 . = ALIGN(4096);
183 pg0 = .;
184 211
185 /* Sections to be discarded */ 212 /* Sections to be discarded */
186 /DISCARD/ : { 213 /DISCARD/ : {