Diffstat (limited to 'arch/i386/kernel')
-rw-r--r--  arch/i386/kernel/Makefile                           |    5
-rw-r--r--  arch/i386/kernel/acpi/boot.c                        |   32
-rw-r--r--  arch/i386/kernel/acpi/cstate.c                      |   17
-rw-r--r--  arch/i386/kernel/acpi/earlyquirk.c                  |   21
-rw-r--r--  arch/i386/kernel/alternative.c                      |   64
-rw-r--r--  arch/i386/kernel/apic.c                             |   22
-rw-r--r--  arch/i386/kernel/apm.c                              |   11
-rw-r--r--  arch/i386/kernel/asm-offsets.c                      |   39
-rw-r--r--  arch/i386/kernel/cpu/amd.c                          |    5
-rw-r--r--  arch/i386/kernel/cpu/common.c                       |  251
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/Kconfig                |    6
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/Makefile               |    2
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c         |  810
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/gx-suspmod.c           |    4
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/longhaul.c             |  128
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/p4-clockmod.c          |   38
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/sc520_freq.c           |    7
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c   |   15
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/speedstep-lib.c        |   32
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/speedstep-lib.h        |    1
-rw-r--r--  arch/i386/kernel/cpu/cpufreq/speedstep-smi.c        |    3
-rw-r--r--  arch/i386/kernel/cpu/cyrix.c                        |    2
-rw-r--r--  arch/i386/kernel/cpu/intel.c                        |   12
-rw-r--r--  arch/i386/kernel/cpu/intel_cacheinfo.c              |   11
-rw-r--r--  arch/i386/kernel/cpu/mcheck/non-fatal.c             |    6
-rw-r--r--  arch/i386/kernel/cpu/mcheck/therm_throt.c           |    3
-rw-r--r--  arch/i386/kernel/cpu/mtrr/Makefile                  |    4
-rw-r--r--  arch/i386/kernel/cpu/mtrr/amd.c                     |    2
-rw-r--r--  arch/i386/kernel/cpu/mtrr/centaur.c                 |    9
-rw-r--r--  arch/i386/kernel/cpu/mtrr/cyrix.c                   |   25
-rw-r--r--  arch/i386/kernel/cpu/mtrr/generic.c                 |   78
-rw-r--r--  arch/i386/kernel/cpu/mtrr/if.c                      |   31
-rw-r--r--  arch/i386/kernel/cpu/mtrr/main.c                    |   71
-rw-r--r--  arch/i386/kernel/cpu/mtrr/mtrr.h                    |   25
-rw-r--r--  arch/i386/kernel/cpu/proc.c                         |    3
-rw-r--r--  arch/i386/kernel/cpuid.c                            |   27
-rw-r--r--  arch/i386/kernel/crash.c                            |   66
-rw-r--r--  arch/i386/kernel/e820.c                             |  894
-rw-r--r--  arch/i386/kernel/efi.c                              |   17
-rw-r--r--  arch/i386/kernel/entry.S                            |  329
-rw-r--r--  arch/i386/kernel/head.S                             |   66
-rw-r--r--  arch/i386/kernel/hpet.c                             |    7
-rw-r--r--  arch/i386/kernel/i8259.c                            |    5
-rw-r--r--  arch/i386/kernel/io_apic.c                          |   68
-rw-r--r--  arch/i386/kernel/kprobes.c                          |    4
-rw-r--r--  arch/i386/kernel/ldt.c                              |    4
-rw-r--r--  arch/i386/kernel/mca.c                              |   13
-rw-r--r--  arch/i386/kernel/microcode.c                        |   10
-rw-r--r--  arch/i386/kernel/module.c                           |   15
-rw-r--r--  arch/i386/kernel/mpparse.c                          |   10
-rw-r--r--  arch/i386/kernel/msr.c                              |   31
-rw-r--r--  arch/i386/kernel/nmi.c                              |   50
-rw-r--r--  arch/i386/kernel/paravirt.c                         |  569
-rw-r--r--  arch/i386/kernel/pci-dma.c                          |   10
-rw-r--r--  arch/i386/kernel/process.c                          |   88
-rw-r--r--  arch/i386/kernel/ptrace.c                           |   39
-rw-r--r--  arch/i386/kernel/quirks.c                           |   69
-rw-r--r--  arch/i386/kernel/reboot.c                           |    1
-rw-r--r--  arch/i386/kernel/setup.c                            |  859
-rw-r--r--  arch/i386/kernel/signal.c                           |    6
-rw-r--r--  arch/i386/kernel/smp.c                              |   10
-rw-r--r--  arch/i386/kernel/smpboot.c                          |   98
-rw-r--r--  arch/i386/kernel/sysenter.c                         |    6
-rw-r--r--  arch/i386/kernel/time.c                             |   15
-rw-r--r--  arch/i386/kernel/time_hpet.c                        |   15
-rw-r--r--  arch/i386/kernel/topology.c                         |    8
-rw-r--r--  arch/i386/kernel/trampoline.S                       |    5
-rw-r--r--  arch/i386/kernel/traps.c                            |  204
-rw-r--r--  arch/i386/kernel/tsc.c                              |    7
-rw-r--r--  arch/i386/kernel/vm86.c                             |  121
-rw-r--r--  arch/i386/kernel/vmlinux.lds.S                      |  150
71 files changed, 3469 insertions, 2222 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 1a884b6e6e5c..1e8988e558c5 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -6,7 +6,7 @@ extra-y := head.o init_task.o vmlinux.lds
 
 obj-y	:= process.o signal.o entry.o traps.o irq.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
-		pci-dma.o i386_ksyms.o i387.o bootflag.o \
+		pci-dma.o i386_ksyms.o i387.o bootflag.o e820.o\
 		quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
 
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
@@ -40,6 +40,9 @@ obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_HPET_TIMER)	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
 
+# Make sure this is linked after any other paravirt_ops structs: see head.S
+obj-$(CONFIG_PARAVIRT)		+= paravirt.o
+
 EXTRA_AFLAGS   := -traditional
 
 obj-$(CONFIG_SCx200)		+= scx200.o
diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index d12fb97a5337..cbcb2c27f48b 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -333,7 +333,7 @@ acpi_parse_ioapic(acpi_table_entry_header * header, const unsigned long end)
 /*
  * Parse Interrupt Source Override for the ACPI SCI
  */
-static void acpi_sci_ioapic_setup(u32 bus_irq, u32 gsi, u16 polarity, u16 trigger)
+static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
 {
 	if (trigger == 0)	/* compatible SCI trigger is level */
 		trigger = 3;
@@ -353,13 +353,13 @@ static void acpi_sci_ioapic_setup(u32 bus_irq, u32 gsi, u16 polarity, u16 trigger)
 	 * If GSI is < 16, this will update its flags,
 	 * else it will create a new mp_irqs[] entry.
 	 */
-	mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
+	mp_override_legacy_irq(gsi, polarity, trigger, gsi);
 
 	/*
 	 * stash over-ride to indicate we've been here
 	 * and for later update of acpi_fadt
 	 */
-	acpi_sci_override_gsi = bus_irq;
+	acpi_sci_override_gsi = gsi;
 	return;
 }
 
@@ -377,7 +377,7 @@ acpi_parse_int_src_ovr(acpi_table_entry_header * header,
 	acpi_table_print_madt_entry(header);
 
 	if (intsrc->bus_irq == acpi_fadt.sci_int) {
-		acpi_sci_ioapic_setup(intsrc->bus_irq, intsrc->global_irq,
+		acpi_sci_ioapic_setup(intsrc->global_irq,
 				      intsrc->flags.polarity,
 				      intsrc->flags.trigger);
 		return 0;
@@ -880,7 +880,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 	 * pretend we got one so we can set the SCI flags.
 	 */
 	if (!acpi_sci_override_gsi)
-		acpi_sci_ioapic_setup(acpi_fadt.sci_int, acpi_fadt.sci_int, 0, 0);
+		acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0);
 
 	/* Fill in identity legacy mapings where no override */
 	mp_config_acpi_legacy_irqs();
@@ -1327,3 +1327,25 @@ static int __init setup_acpi_sci(char *s)
 	return 0;
 }
 early_param("acpi_sci", setup_acpi_sci);
+
+int __acpi_acquire_global_lock(unsigned int *lock)
+{
+	unsigned int old, new, val;
+	do {
+		old = *lock;
+		new = (((old & ~0x3) + 2) + ((old >> 1) & 0x1));
+		val = cmpxchg(lock, old, new);
+	} while (unlikely (val != old));
+	return (new < 3) ? -1 : 0;
+}
+
+int __acpi_release_global_lock(unsigned int *lock)
+{
+	unsigned int old, new, val;
+	do {
+		old = *lock;
+		new = old & ~0x3;
+		val = cmpxchg(lock, old, new);
+	} while (unlikely (val != old));
+	return old & 0x1;
+}
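Background on the two helpers added above: they implement the FACS global-lock protocol from the ACPI specification, where bit 0 of the lock word means "pending" and bit 1 means "owned". An annotated sketch of the acquire arithmetic (reading aid only, not part of the patch):

/*
 * new = ((old & ~0x3) + 2) + ((old >> 1) & 0x1)
 *
 *   lock free  (owned=0): new low bits = 10 -> we now own the lock
 *   lock owned (owned=1): new low bits = 11 -> owner keeps it; our
 *                         "pending" bit requests a GBL_RLS signal
 *
 * cmpxchg() retries until the word is swapped atomically, so
 * __acpi_acquire_global_lock() returns -1 (true) once acquired and 0
 * when the caller must wait.  __acpi_release_global_lock() clears both
 * bits and returns the old pending bit, telling the caller whether the
 * firmware needs to be signalled.
 */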
diff --git a/arch/i386/kernel/acpi/cstate.c b/arch/i386/kernel/acpi/cstate.c
index 20563e52c622..2d39f55d29a8 100644
--- a/arch/i386/kernel/acpi/cstate.c
+++ b/arch/i386/kernel/acpi/cstate.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/acpi.h>
 #include <linux/cpu.h>
+#include <linux/sched.h>
 
 #include <acpi/processor.h>
 #include <asm/acpi.h>
@@ -46,13 +47,13 @@ EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
 
 /* The code below handles cstate entry with monitor-mwait pair on Intel*/
 
-struct cstate_entry_s {
+struct cstate_entry {
 	struct {
 		unsigned int eax;
 		unsigned int ecx;
 	} states[ACPI_PROCESSOR_MAX_POWER];
 };
-static struct cstate_entry_s *cpu_cstate_entry;	/* per CPU ptr */
+static struct cstate_entry *cpu_cstate_entry;	/* per CPU ptr */
 
 static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
 
@@ -70,7 +71,7 @@ static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
 int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 		struct acpi_processor_cx *cx, struct acpi_power_register *reg)
 {
-	struct cstate_entry_s *percpu_entry;
+	struct cstate_entry *percpu_entry;
 	struct cpuinfo_x86 *c = cpu_data + cpu;
 
 	cpumask_t saved_mask;
@@ -135,7 +136,7 @@ EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
 void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
 {
 	unsigned int cpu = smp_processor_id();
-	struct cstate_entry_s *percpu_entry;
+	struct cstate_entry *percpu_entry;
 
 	percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu);
 	mwait_idle_with_hints(percpu_entry->states[cx->index].eax,
@@ -149,16 +150,14 @@ static int __init ffh_cstate_init(void)
 	if (c->x86_vendor != X86_VENDOR_INTEL)
 		return -1;
 
-	cpu_cstate_entry = alloc_percpu(struct cstate_entry_s);
+	cpu_cstate_entry = alloc_percpu(struct cstate_entry);
 	return 0;
 }
 
 static void __exit ffh_cstate_exit(void)
 {
-	if (cpu_cstate_entry) {
-		free_percpu(cpu_cstate_entry);
-		cpu_cstate_entry = NULL;
-	}
+	free_percpu(cpu_cstate_entry);
+	cpu_cstate_entry = NULL;
 }
 
 arch_initcall(ffh_cstate_init);
diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c
index c9841692bb7c..4b60af7f91dd 100644
--- a/arch/i386/kernel/acpi/earlyquirk.c
+++ b/arch/i386/kernel/acpi/earlyquirk.c
@@ -10,6 +10,7 @@
 #include <asm/pci-direct.h>
 #include <asm/acpi.h>
 #include <asm/apic.h>
+#include <asm/irq.h>
 
 #ifdef CONFIG_ACPI
 
@@ -49,6 +50,24 @@ static int __init check_bridge(int vendor, int device)
 	return 0;
 }
 
+static void check_intel(void)
+{
+	u16 vendor, device;
+
+	vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID);
+
+	if (vendor != PCI_VENDOR_ID_INTEL)
+		return;
+
+	device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID);
+#ifdef CONFIG_SMP
+	if (device == PCI_DEVICE_ID_INTEL_E7320_MCH ||
+	    device == PCI_DEVICE_ID_INTEL_E7520_MCH ||
+	    device == PCI_DEVICE_ID_INTEL_E7525_MCH)
+		quirk_intel_irqbalance();
+#endif
+}
+
 void __init check_acpi_pci(void)
 {
 	int num, slot, func;
@@ -60,6 +79,8 @@ void __init check_acpi_pci(void)
 	if (!early_pci_allowed())
 		return;
 
+	check_intel();
+
 	/* Poor man's PCI discovery */
 	for (num = 0; num < 32; num++) {
 		for (slot = 0; slot < 32; slot++) {
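check_intel() runs before the PCI core is initialized, so it depends on the raw type-1 configuration-cycle helpers declared in asm/pci-direct.h. A minimal sketch of what such a helper does (illustrative only; the real read_pci_config_16() lives elsewhere in the tree):

static u16 read_pci_config_16_sketch(u8 bus, u8 slot, u8 func, u8 offset)
{
	u16 v;

	/* CONFIG_ADDRESS (0xcf8): enable bit, bus/device/function,
	 * and the dword-aligned register number */
	outl(0x80000000 | (bus << 16) | (slot << 11) |
	     (func << 8) | (offset & ~3), 0xcf8);
	/* CONFIG_DATA (0xcfc): read the right 16-bit half of the dword */
	v = inw(0xcfc + (offset & 2));
	return v;
}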
diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c
index 583c238e17fb..9eca21b49f6b 100644
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -1,4 +1,5 @@
 #include <linux/module.h>
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <asm/alternative.h>
@@ -123,6 +124,20 @@ static unsigned char** find_nop_table(void)
 
 #endif /* CONFIG_X86_64 */
 
+static void nop_out(void *insns, unsigned int len)
+{
+	unsigned char **noptable = find_nop_table();
+
+	while (len > 0) {
+		unsigned int noplen = len;
+		if (noplen > ASM_NOP_MAX)
+			noplen = ASM_NOP_MAX;
+		memcpy(insns, noptable[noplen], noplen);
+		insns += noplen;
+		len -= noplen;
+	}
+}
+
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
 extern u8 *__smp_locks[], *__smp_locks_end[];
@@ -137,10 +152,9 @@ extern u8 __smp_alt_begin[], __smp_alt_end[];
 
 void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 {
-	unsigned char **noptable = find_nop_table();
 	struct alt_instr *a;
 	u8 *instr;
-	int diff, i, k;
+	int diff;
 
 	DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
 	for (a = start; a < end; a++) {
@@ -158,13 +172,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 #endif
 		memcpy(instr, a->replacement, a->replacementlen);
 		diff = a->instrlen - a->replacementlen;
-		/* Pad the rest with nops */
-		for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
-			k = diff;
-			if (k > ASM_NOP_MAX)
-				k = ASM_NOP_MAX;
-			memcpy(a->instr + i, noptable[k], k);
-		}
+		nop_out(instr + a->replacementlen, diff);
 	}
 }
 
@@ -208,7 +216,6 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 
 static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 {
-	unsigned char **noptable = find_nop_table();
 	u8 **ptr;
 
 	for (ptr = start; ptr < end; ptr++) {
@@ -216,7 +223,7 @@ static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 			continue;
 		if (*ptr > text_end)
 			continue;
-		**ptr = noptable[1][0];
+		nop_out(*ptr, 1);
 	};
 }
 
@@ -342,6 +349,40 @@ void alternatives_smp_switch(int smp)
 
 #endif
 
+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+{
+	struct paravirt_patch *p;
+
+	for (p = start; p < end; p++) {
+		unsigned int used;
+
+		used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
+					  p->len);
+#ifdef CONFIG_DEBUG_PARAVIRT
+		{
+			int i;
+			/* Deliberately clobber regs using "not %reg" to find bugs. */
+			for (i = 0; i < 3; i++) {
+				if (p->len - used >= 2 && (p->clobbers & (1 << i))) {
+					memcpy(p->instr + used, "\xf7\xd0", 2);
+					p->instr[used+1] |= i;
+					used += 2;
+				}
+			}
+		}
+#endif
+		/* Pad the rest with nops */
+		nop_out(p->instr + used, p->len - used);
+	}
+
+	/* Sync to be conservative, in case we patched following instructions */
+	sync_core();
+}
+extern struct paravirt_patch __start_parainstructions[],
+	__stop_parainstructions[];
+#endif	/* CONFIG_PARAVIRT */
+
 void __init alternative_instructions(void)
 {
 	unsigned long flags;
@@ -389,5 +430,6 @@ void __init alternative_instructions(void)
 		alternatives_smp_switch(0);
 	}
 #endif
+	apply_paravirt(__start_parainstructions, __stop_parainstructions);
 	local_irq_restore(flags);
 }
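A decoding note for the CONFIG_DEBUG_PARAVIRT hunk above: on i386, opcode 0xf7 with a ModRM byte of 0xd0|reg is "not %reg", so the two bytes it plants expand as follows (annotation only, not part of the patch):

/*
 *   f7 d0    notl %eax    (clobbers bit 0)
 *   f7 d1    notl %ecx    (clobbers bit 1)
 *   f7 d2    notl %edx    (clobbers bit 2)
 *
 * Deliberately inverting a register the patch site claims to clobber
 * makes a wrong clobber mask crash quickly instead of corrupting state
 * silently; nop_out() then pads whatever room remains.
 */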
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 2fd4b7d927c2..776d9be26af9 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -647,23 +647,30 @@ static struct {
 static int lapic_suspend(struct sys_device *dev, pm_message_t state)
 {
 	unsigned long flags;
+	int maxlvt;
 
 	if (!apic_pm_state.active)
 		return 0;
 
+	maxlvt = get_maxlvt();
+
 	apic_pm_state.apic_id = apic_read(APIC_ID);
 	apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
 	apic_pm_state.apic_ldr = apic_read(APIC_LDR);
 	apic_pm_state.apic_dfr = apic_read(APIC_DFR);
 	apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
 	apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
-	apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
+	if (maxlvt >= 4)
+		apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
 	apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
 	apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
 	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
 	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
 	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-	apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+	if (maxlvt >= 5)
+		apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
+#endif
 
 	local_irq_save(flags);
 	disable_local_APIC();
@@ -675,10 +682,13 @@ static int lapic_resume(struct sys_device *dev)
 {
 	unsigned int l, h;
 	unsigned long flags;
+	int maxlvt;
 
 	if (!apic_pm_state.active)
 		return 0;
 
+	maxlvt = get_maxlvt();
+
 	local_irq_save(flags);
 
 	/*
@@ -700,8 +710,12 @@ static int lapic_resume(struct sys_device *dev)
 	apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
 	apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
 	apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-	apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
-	apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
+#ifdef CONFIG_X86_MCE_P4THERMAL
+	if (maxlvt >= 5)
+		apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
+#endif
+	if (maxlvt >= 4)
+		apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
 	apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
 	apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
 	apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
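Both the suspend and resume paths now gate LVTPC and the thermal LVT on maxlvt, because those registers exist only when the APIC version register advertises enough local-vector-table entries. Roughly what get_maxlvt() computes (simplified sketch; the real helper also special-cases the external 82489DX APIC):

static int get_maxlvt_sketch(void)
{
	unsigned int v = apic_read(APIC_LVR);

	return (v >> 16) & 0xff;	/* index of the highest LVT entry */
}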
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index a60358fe9a49..199016927541 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -231,6 +231,7 @@
 #include <asm/uaccess.h>
 #include <asm/desc.h>
 #include <asm/i8253.h>
+#include <asm/paravirt.h>
 
 #include "io_ports.h"
 
@@ -784,7 +785,11 @@ static int apm_do_idle(void)
 	polling = !!(current_thread_info()->status & TS_POLLING);
 	if (polling) {
 		current_thread_info()->status &= ~TS_POLLING;
-		smp_mb__after_clear_bit();
+		/*
+		 * TS_POLLING-cleared state must be visible before we
+		 * test NEED_RESCHED:
+		 */
+		smp_mb();
 	}
 	if (!need_resched()) {
 		idled = 1;
@@ -1603,7 +1608,7 @@ static int do_open(struct inode * inode, struct file * filp)
 {
 	struct apm_user *as;
 
-	as = (struct apm_user *)kmalloc(sizeof(*as), GFP_KERNEL);
+	as = kmalloc(sizeof(*as), GFP_KERNEL);
 	if (as == NULL) {
 		printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
 		       sizeof(*as));
@@ -2235,7 +2240,7 @@ static int __init apm_init(void)
 
 	dmi_check_system(apm_dmi_table);
 
-	if (apm_info.bios.version == 0) {
+	if (apm_info.bios.version == 0 || paravirt_enabled()) {
 		printk(KERN_INFO "apm: BIOS not found.\n");
 		return -ENODEV;
 	}
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index c80271f8f084..1b2f3cd33270 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -15,6 +15,7 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 #include <asm/elf.h>
+#include <asm/pda.h>
 
 #define DEFINE(sym, val) \
 	asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -51,13 +52,35 @@ void foo(void)
 	OFFSET(TI_exec_domain, thread_info, exec_domain);
 	OFFSET(TI_flags, thread_info, flags);
 	OFFSET(TI_status, thread_info, status);
-	OFFSET(TI_cpu, thread_info, cpu);
 	OFFSET(TI_preempt_count, thread_info, preempt_count);
 	OFFSET(TI_addr_limit, thread_info, addr_limit);
 	OFFSET(TI_restart_block, thread_info, restart_block);
 	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
 	BLANK();
 
+	OFFSET(GDS_size, Xgt_desc_struct, size);
+	OFFSET(GDS_address, Xgt_desc_struct, address);
+	OFFSET(GDS_pad, Xgt_desc_struct, pad);
+	BLANK();
+
+	OFFSET(PT_EBX, pt_regs, ebx);
+	OFFSET(PT_ECX, pt_regs, ecx);
+	OFFSET(PT_EDX, pt_regs, edx);
+	OFFSET(PT_ESI, pt_regs, esi);
+	OFFSET(PT_EDI, pt_regs, edi);
+	OFFSET(PT_EBP, pt_regs, ebp);
+	OFFSET(PT_EAX, pt_regs, eax);
+	OFFSET(PT_DS,  pt_regs, xds);
+	OFFSET(PT_ES,  pt_regs, xes);
+	OFFSET(PT_GS,  pt_regs, xgs);
+	OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
+	OFFSET(PT_EIP, pt_regs, eip);
+	OFFSET(PT_CS,  pt_regs, xcs);
+	OFFSET(PT_EFLAGS, pt_regs, eflags);
+	OFFSET(PT_OLDESP, pt_regs, esp);
+	OFFSET(PT_OLDSS,  pt_regs, xss);
+	BLANK();
+
 	OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
 	OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
 	BLANK();
@@ -74,4 +97,18 @@ void foo(void)
 	DEFINE(VDSO_PRELINK, VDSO_PRELINK);
 
 	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+
+	BLANK();
+	OFFSET(PDA_cpu, i386_pda, cpu_number);
+	OFFSET(PDA_pcurrent, i386_pda, pcurrent);
+
+#ifdef CONFIG_PARAVIRT
+	BLANK();
+	OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
+	OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
+	OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
+	OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
+	OFFSET(PARAVIRT_iret, paravirt_ops, iret);
+	OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+#endif
 }
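For context on why plain OFFSET()/DEFINE() statements suffice here: asm-offsets.c is never linked into the kernel. Kbuild compiles it to assembly and scrapes the "->" marker lines into asm-offsets.h, roughly like this (standard kbuild behaviour; the numeric value is illustrative):

/*
 * OFFSET(PT_EAX, pt_regs, eax) expands through DEFINE() to
 *     asm volatile("\n->PT_EAX %0 offsetof(struct pt_regs, eax)" ...);
 * which leaves a line such as
 *     ->PT_EAX $24 offsetof(struct pt_regs, eax)
 * in the generated .s file; a sed script rewrites that into
 *     #define PT_EAX 24
 * so entry.S can address saved registers as PT_EAX(%esp).
 */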
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index e4758095d87a..41cfea57232b 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -104,10 +104,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 			f_vide();
 			rdtscl(d2);
 			d = d2-d;
 
-			/* Knock these two lines out if it debugs out ok */
-			printk(KERN_INFO "AMD K6 stepping B detected - ");
-			/* -- cut here -- */
 			if (d > 20*K6_BUG_LOOP)
 				printk("system stability may be impaired when more than 32 MB are used.\n");
 			else
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index d9f3e3c31f05..8689d62abd4a 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -18,14 +18,15 @@
 #include <asm/apic.h>
 #include <mach_apic.h>
 #endif
+#include <asm/pda.h>
 
 #include "cpu.h"
 
 DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
 EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
 
-DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
-EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
+struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(_cpu_pda);
 
 static int cachesize_override __cpuinitdata = -1;
 static int disable_x86_fxsr __cpuinitdata;
@@ -53,7 +54,7 @@ static struct cpu_dev __cpuinitdata default_cpu = {
 	.c_init	= default_init,
 	.c_vendor = "Unknown",
 };
-static struct cpu_dev * this_cpu = &default_cpu;
+static struct cpu_dev * this_cpu __cpuinitdata = &default_cpu;
 
 static int __init cachesize_setup(char *str)
 {
@@ -235,29 +236,14 @@ static int __cpuinit have_cpuid_p(void)
 	return flag_is_changeable_p(X86_EFLAGS_ID);
 }
 
-/* Do minimum CPU detection early.
-   Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
-   The others are not touched to avoid unwanted side effects.
-
-   WARNING: this function is only called on the BP.  Don't add code here
-   that is supposed to run on all CPUs. */
-static void __init early_cpu_detect(void)
+void __init cpu_detect(struct cpuinfo_x86 *c)
 {
-	struct cpuinfo_x86 *c = &boot_cpu_data;
-
-	c->x86_cache_alignment = 32;
-
-	if (!have_cpuid_p())
-		return;
-
 	/* Get vendor name */
 	cpuid(0x00000000, &c->cpuid_level,
 	      (int *)&c->x86_vendor_id[0],
 	      (int *)&c->x86_vendor_id[8],
 	      (int *)&c->x86_vendor_id[4]);
 
-	get_cpu_vendor(c, 1);
-
 	c->x86 = 4;
 	if (c->cpuid_level >= 0x00000001) {
 		u32 junk, tfms, cap0, misc;
@@ -274,6 +260,26 @@ static void __init early_cpu_detect(void)
 	}
 }
 
+/* Do minimum CPU detection early.
+   Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
+   The others are not touched to avoid unwanted side effects.
+
+   WARNING: this function is only called on the BP.  Don't add code here
+   that is supposed to run on all CPUs. */
+static void __init early_cpu_detect(void)
+{
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	c->x86_cache_alignment = 32;
+
+	if (!have_cpuid_p())
+		return;
+
+	cpu_detect(c);
+
+	get_cpu_vendor(c, 1);
+}
+
 static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 {
 	u32 tfms, xlvl;
@@ -308,6 +314,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
 #else
 		c->apicid = (ebx >> 24) & 0xFF;
 #endif
+		if (c->x86_capability[0] & (1<<19))
+			c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
 	} else {
 		/* Have CPUID level 0 only - unheard of */
 		c->x86 = 4;
@@ -372,6 +380,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	c->x86_vendor_id[0] = '\0'; /* Unset */
 	c->x86_model_id[0] = '\0';  /* Unset */
 	c->x86_max_cores = 1;
+	c->x86_clflush_size = 32;
 	memset(&c->x86_capability, 0, sizeof c->x86_capability);
 
 	if (!have_cpuid_p()) {
@@ -591,42 +600,24 @@ void __init early_cpu_init(void)
 	disable_pse = 1;
 #endif
 }
-/*
- * cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- */
-void __cpuinit cpu_init(void)
+
+/* Make sure %gs is initialized properly in idle threads */
+struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
 {
-	int cpu = smp_processor_id();
-	struct tss_struct * t = &per_cpu(init_tss, cpu);
-	struct thread_struct *thread = &current->thread;
-	struct desc_struct *gdt;
-	__u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu);
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+	memset(regs, 0, sizeof(struct pt_regs));
+	regs->xgs = __KERNEL_PDA;
+	return regs;
+}
 
-	if (cpu_test_and_set(cpu, cpu_initialized)) {
-		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
-		for (;;) local_irq_enable();
-	}
-	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
-
-	if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
-		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-	if (tsc_disable && cpu_has_tsc) {
-		printk(KERN_NOTICE "Disabling TSC...\n");
-		/**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
-		clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
-		set_in_cr4(X86_CR4_TSD);
-	}
+static __cpuinit int alloc_gdt(int cpu)
+{
+	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+	struct desc_struct *gdt;
+	struct i386_pda *pda;
+
+	gdt = (struct desc_struct *)cpu_gdt_descr->address;
+	pda = cpu_pda(cpu);
 
-	/* The CPU hotplug case */
-	if (cpu_gdt_descr->address) {
-		gdt = (struct desc_struct *)cpu_gdt_descr->address;
-		memset(gdt, 0, PAGE_SIZE);
-		goto old_gdt;
-	}
 	/*
 	 * This is a horrible hack to allocate the GDT.  The problem
 	 * is that cpu_init() is called really early for the boot CPU
@@ -634,43 +625,130 @@ void __cpuinit cpu_init(void)
 	 * CPUs, when bootmem will have gone away
 	 */
 	if (NODE_DATA(0)->bdata->node_bootmem_map) {
-		gdt = (struct desc_struct *)alloc_bootmem_pages(PAGE_SIZE);
-		/* alloc_bootmem_pages panics on failure, so no check */
+		BUG_ON(gdt != NULL || pda != NULL);
+
+		gdt = alloc_bootmem_pages(PAGE_SIZE);
+		pda = alloc_bootmem(sizeof(*pda));
+		/* alloc_bootmem(_pages) panics on failure, so no check */
+
 		memset(gdt, 0, PAGE_SIZE);
+		memset(pda, 0, sizeof(*pda));
 	} else {
-		gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
-		if (unlikely(!gdt)) {
-			printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
-			for (;;)
-				local_irq_enable();
+		/* GDT and PDA might already have been allocated if
+		   this is a CPU hotplug re-insertion. */
+		if (gdt == NULL)
+			gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
+
+		if (pda == NULL)
+			pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
+
+		if (unlikely(!gdt || !pda)) {
+			free_pages((unsigned long)gdt, 0);
+			kfree(pda);
+			return 0;
 		}
 	}
-old_gdt:
+
+	cpu_gdt_descr->address = (unsigned long)gdt;
+	cpu_pda(cpu) = pda;
+
+	return 1;
+}
+
+/* Initial PDA used by boot CPU */
+struct i386_pda boot_pda = {
+	._pda = &boot_pda,
+	.cpu_number = 0,
+	.pcurrent = &init_task,
+};
+
+static inline void set_kernel_gs(void)
+{
+	/* Set %gs for this CPU's PDA.  Memory clobber is to create a
+	   barrier with respect to any PDA operations, so the compiler
+	   doesn't move any before here. */
+	asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
+}
+
+/* Initialize the CPU's GDT and PDA.  The boot CPU does this for
+   itself, but secondaries find this done for them. */
+__cpuinit int init_gdt(int cpu, struct task_struct *idle)
+{
+	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+	struct desc_struct *gdt;
+	struct i386_pda *pda;
+
+	/* For non-boot CPUs, the GDT and PDA should already have been
+	   allocated. */
+	if (!alloc_gdt(cpu)) {
+		printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
+		return 0;
+	}
+
+	gdt = (struct desc_struct *)cpu_gdt_descr->address;
+	pda = cpu_pda(cpu);
+
+	BUG_ON(gdt == NULL || pda == NULL);
+
 	/*
 	 * Initialize the per-CPU GDT with the boot GDT,
 	 * and set up the GDT descriptor:
 	 */
 	memcpy(gdt, cpu_gdt_table, GDT_SIZE);
+	cpu_gdt_descr->size = GDT_SIZE - 1;
 
-	/* Set up GDT entry for 16bit stack */
-	*(__u64 *)(&gdt[GDT_ENTRY_ESPFIX_SS]) |=
-		((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
-		((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
-		(CPU_16BIT_STACK_SIZE - 1);
+	pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
+			(u32 *)&gdt[GDT_ENTRY_PDA].b,
+			(unsigned long)pda, sizeof(*pda) - 1,
+			0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
 
-	cpu_gdt_descr->size = GDT_SIZE - 1;
-	cpu_gdt_descr->address = (unsigned long)gdt;
+	memset(pda, 0, sizeof(*pda));
+	pda->_pda = pda;
+	pda->cpu_number = cpu;
+	pda->pcurrent = idle;
+
+	return 1;
+}
+
+/* Common CPU init for both boot and secondary CPUs */
+static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
+{
+	struct tss_struct * t = &per_cpu(init_tss, cpu);
+	struct thread_struct *thread = &curr->thread;
+	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
 
+	/* Reinit these anyway, even if they've already been done (on
+	   the boot CPU, this will transition from the boot gdt+pda to
+	   the real ones). */
 	load_gdt(cpu_gdt_descr);
+	set_kernel_gs();
+
+	if (cpu_test_and_set(cpu, cpu_initialized)) {
+		printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
+		for (;;) local_irq_enable();
+	}
+
+	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+
+	if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
+		clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+	if (tsc_disable && cpu_has_tsc) {
+		printk(KERN_NOTICE "Disabling TSC...\n");
+		/**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
+		clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
+		set_in_cr4(X86_CR4_TSD);
+	}
+
 	load_idt(&idt_descr);
 
 	/*
 	 * Set up and load the per-CPU TSS and LDT
 	 */
 	atomic_inc(&init_mm.mm_count);
-	current->active_mm = &init_mm;
-	BUG_ON(current->mm);
-	enter_lazy_tlb(&init_mm, current);
+	curr->active_mm = &init_mm;
+	if (curr->mm)
+		BUG();
+	enter_lazy_tlb(&init_mm, curr);
 
 	load_esp0(t, thread);
 	set_tss_desc(cpu,t);
@@ -682,8 +760,8 @@ old_gdt:
 	__set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
 #endif
 
-	/* Clear %fs and %gs. */
-	asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0));
+	/* Clear %fs. */
+	asm volatile ("mov %0, %%fs" : : "r" (0));
 
 	/* Clear all 6 debug registers: */
 	set_debugreg(0, 0);
@@ -701,6 +779,37 @@ old_gdt:
 	mxcsr_feature_mask_init();
 }
 
+/* Entrypoint to initialize secondary CPU */
+void __cpuinit secondary_cpu_init(void)
+{
+	int cpu = smp_processor_id();
+	struct task_struct *curr = current;
+
+	_cpu_init(cpu, curr);
+}
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ */
+void __cpuinit cpu_init(void)
+{
+	int cpu = smp_processor_id();
+	struct task_struct *curr = current;
+
+	/* Set up the real GDT and PDA, so we can transition from the
+	   boot versions. */
+	if (!init_gdt(cpu, curr)) {
+		/* failed to allocate something; not much we can do... */
+		for (;;)
+			local_irq_enable();
+	}
+
+	_cpu_init(cpu, curr);
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 void __cpuinit cpu_uninit(void)
 {
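The net effect of the rework above is that each CPU's struct i386_pda becomes reachable through the %gs segment loaded by set_kernel_gs(). A rough sketch of how a 32-bit PDA field is then read (illustrative; the real accessors are the read_pda()/write_pda() macros in asm/pda.h):

#define pda_read_sketch(field)						\
({									\
	u32 ret__;							\
	asm("movl %%gs:%c1, %0"						\
	    : "=r" (ret__)						\
	    : "i" (offsetof(struct i386_pda, field)));			\
	ret__;								\
})

/* e.g. the CPU number is a single instruction away:
 *	pda_read_sketch(cpu_number)
 */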
diff --git a/arch/i386/kernel/cpu/cpufreq/Kconfig b/arch/i386/kernel/cpu/cpufreq/Kconfig
index ccc1edff5c97..5299c5bf4454 100644
--- a/arch/i386/kernel/cpu/cpufreq/Kconfig
+++ b/arch/i386/kernel/cpu/cpufreq/Kconfig
@@ -17,6 +17,7 @@ config X86_ACPI_CPUFREQ
 	help
 	  This driver adds a CPUFreq driver which utilizes the ACPI
 	  Processor Performance States.
+	  This driver also supports Intel Enhanced Speedstep.
 
 	  For details, take a look at <file:Documentation/cpu-freq/>.
 
@@ -121,11 +122,14 @@ config X86_SPEEDSTEP_CENTRINO
 	  If in doubt, say N.
 
 config X86_SPEEDSTEP_CENTRINO_ACPI
-	bool "Use ACPI tables to decode valid frequency/voltage pairs"
+	bool "Use ACPI tables to decode valid frequency/voltage (deprecated)"
 	depends on X86_SPEEDSTEP_CENTRINO && ACPI_PROCESSOR
 	depends on !(X86_SPEEDSTEP_CENTRINO = y && ACPI_PROCESSOR = m)
 	default y
 	help
+	  This is deprecated and this functionality is now merged into
+	  acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of
+	  speedstep_centrino.
 	  Use primarily the information provided in the BIOS ACPI tables
 	  to determine valid CPU frequency and voltage pairings. It is
 	  required for the driver to work on non-Banias CPUs.
diff --git a/arch/i386/kernel/cpu/cpufreq/Makefile b/arch/i386/kernel/cpu/cpufreq/Makefile
index 2e894f1c8910..8de3abe322a9 100644
--- a/arch/i386/kernel/cpu/cpufreq/Makefile
+++ b/arch/i386/kernel/cpu/cpufreq/Makefile
@@ -7,9 +7,9 @@ obj-$(CONFIG_SC520_CPUFREQ)	+= sc520_freq.o
 obj-$(CONFIG_X86_LONGRUN)	+= longrun.o
 obj-$(CONFIG_X86_GX_SUSPMOD)	+= gx-suspmod.o
 obj-$(CONFIG_X86_SPEEDSTEP_ICH)	+= speedstep-ich.o
-obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO)	+= speedstep-centrino.o
 obj-$(CONFIG_X86_SPEEDSTEP_LIB)	+= speedstep-lib.o
 obj-$(CONFIG_X86_SPEEDSTEP_SMI)	+= speedstep-smi.o
 obj-$(CONFIG_X86_ACPI_CPUFREQ)	+= acpi-cpufreq.o
+obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO)	+= speedstep-centrino.o
 obj-$(CONFIG_X86_P4_CLOCKMOD)	+= p4-clockmod.o
 obj-$(CONFIG_X86_CPUFREQ_NFORCE2)	+= cpufreq-nforce2.o
diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
index 57c880bf0bd6..10baa3501ed3 100644
--- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -1,9 +1,10 @@
 /*
- * acpi-cpufreq.c - ACPI Processor P-States Driver ($Revision: 1.3 $)
+ * acpi-cpufreq.c - ACPI Processor P-States Driver ($Revision: 1.4 $)
  *
  *  Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
  *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
  *  Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
+ *  Copyright (C) 2006       Denis Sadykov <denis.m.sadykov@intel.com>
  *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  *
@@ -27,202 +28,370 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
 #include <linux/cpufreq.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
 #include <linux/compiler.h>
-#include <linux/sched.h>	/* current */
 #include <linux/dmi.h>
-#include <asm/io.h>
-#include <asm/delay.h>
-#include <asm/uaccess.h>
 
 #include <linux/acpi.h>
 #include <acpi/processor.h>
 
+#include <asm/io.h>
+#include <asm/msr.h>
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+#include <asm/delay.h>
+#include <asm/uaccess.h>
+
 #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "acpi-cpufreq", msg)
 
 MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
 MODULE_DESCRIPTION("ACPI Processor P-States Driver");
 MODULE_LICENSE("GPL");
 
+enum {
+	UNDEFINED_CAPABLE = 0,
+	SYSTEM_INTEL_MSR_CAPABLE,
+	SYSTEM_IO_CAPABLE,
+};
+
+#define INTEL_MSR_RANGE		(0xffff)
+#define CPUID_6_ECX_APERFMPERF_CAPABILITY	(0x1)
 
-struct cpufreq_acpi_io {
-	struct acpi_processor_performance	*acpi_data;
-	struct cpufreq_frequency_table		*freq_table;
-	unsigned int				resume;
+struct acpi_cpufreq_data {
+	struct acpi_processor_performance *acpi_data;
+	struct cpufreq_frequency_table *freq_table;
+	unsigned int max_freq;
+	unsigned int resume;
+	unsigned int cpu_feature;
 };
 
-static struct cpufreq_acpi_io	*acpi_io_data[NR_CPUS];
+static struct acpi_cpufreq_data *drv_data[NR_CPUS];
 static struct acpi_processor_performance *acpi_perf_data[NR_CPUS];
 
 static struct cpufreq_driver acpi_cpufreq_driver;
 
 static unsigned int acpi_pstate_strict;
 
-static int
-acpi_processor_write_port(
-	u16	port,
-	u8	bit_width,
-	u32	value)
+static int check_est_cpu(unsigned int cpuid)
 {
-	if (bit_width <= 8) {
-		outb(value, port);
-	} else if (bit_width <= 16) {
-		outw(value, port);
-	} else if (bit_width <= 32) {
-		outl(value, port);
-	} else {
-		return -ENODEV;
+	struct cpuinfo_x86 *cpu = &cpu_data[cpuid];
+
+	if (cpu->x86_vendor != X86_VENDOR_INTEL ||
+	    !cpu_has(cpu, X86_FEATURE_EST))
+		return 0;
+
+	return 1;
+}
+
+static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
+{
+	struct acpi_processor_performance *perf;
+	int i;
+
+	perf = data->acpi_data;
+
+	for (i=0; i<perf->state_count; i++) {
+		if (value == perf->states[i].status)
+			return data->freq_table[i].frequency;
 	}
 	return 0;
 }
 
-static int
-acpi_processor_read_port(
-	u16	port,
-	u8	bit_width,
-	u32	*ret)
+static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
 {
-	*ret = 0;
-	if (bit_width <= 8) {
-		*ret = inb(port);
-	} else if (bit_width <= 16) {
-		*ret = inw(port);
-	} else if (bit_width <= 32) {
-		*ret = inl(port);
-	} else {
-		return -ENODEV;
+	int i;
+	struct acpi_processor_performance *perf;
+
+	msr &= INTEL_MSR_RANGE;
+	perf = data->acpi_data;
+
+	for (i=0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
+		if (msr == perf->states[data->freq_table[i].index].status)
+			return data->freq_table[i].frequency;
 	}
-	return 0;
+	return data->freq_table[0].frequency;
 }
 
-static int
-acpi_processor_set_performance (
-	struct cpufreq_acpi_io	*data,
-	unsigned int		cpu,
-	int			state)
+static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
 {
-	u16			port = 0;
-	u8			bit_width = 0;
-	int			i = 0;
-	int			ret = 0;
-	u32			value = 0;
-	int			retval;
-	struct acpi_processor_performance	*perf;
-
-	dprintk("acpi_processor_set_performance\n");
-
-	retval = 0;
-	perf = data->acpi_data;
-	if (state == perf->state) {
-		if (unlikely(data->resume)) {
-			dprintk("Called after resume, resetting to P%d\n", state);
-			data->resume = 0;
-		} else {
-			dprintk("Already at target state (P%d)\n", state);
-			return (retval);
-		}
+	switch (data->cpu_feature) {
+	case SYSTEM_INTEL_MSR_CAPABLE:
+		return extract_msr(val, data);
+	case SYSTEM_IO_CAPABLE:
+		return extract_io(val, data);
+	default:
+		return 0;
 	}
+}
+
+struct msr_addr {
+	u32 reg;
+};
 
-	dprintk("Transitioning from P%d to P%d\n", perf->state, state);
+struct io_addr {
+	u16 port;
+	u8 bit_width;
+};
 
-	/*
-	 * First we write the target state's 'control' value to the
-	 * control_register.
-	 */
+typedef union {
+	struct msr_addr msr;
+	struct io_addr io;
+} drv_addr_union;
 
-	port = perf->control_register.address;
-	bit_width = perf->control_register.bit_width;
-	value = (u32) perf->states[state].control;
+struct drv_cmd {
+	unsigned int type;
+	cpumask_t mask;
+	drv_addr_union addr;
+	u32 val;
+};
 
-	dprintk("Writing 0x%08x to port 0x%04x\n", value, port);
+static void do_drv_read(struct drv_cmd *cmd)
+{
+	u32 h;
+
+	switch (cmd->type) {
+	case SYSTEM_INTEL_MSR_CAPABLE:
+		rdmsr(cmd->addr.msr.reg, cmd->val, h);
+		break;
+	case SYSTEM_IO_CAPABLE:
+		acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
+				  &cmd->val,
+				  (u32)cmd->addr.io.bit_width);
+		break;
+	default:
+		break;
+	}
+}
 
-	ret = acpi_processor_write_port(port, bit_width, value);
-	if (ret) {
-		dprintk("Invalid port width 0x%04x\n", bit_width);
-		return (ret);
+static void do_drv_write(struct drv_cmd *cmd)
+{
+	u32 h = 0;
+
+	switch (cmd->type) {
+	case SYSTEM_INTEL_MSR_CAPABLE:
+		wrmsr(cmd->addr.msr.reg, cmd->val, h);
+		break;
+	case SYSTEM_IO_CAPABLE:
+		acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
+				   cmd->val,
+				   (u32)cmd->addr.io.bit_width);
+		break;
+	default:
+		break;
 	}
+}
 
+static void drv_read(struct drv_cmd *cmd)
+{
+	cpumask_t saved_mask = current->cpus_allowed;
+	cmd->val = 0;
+
+	set_cpus_allowed(current, cmd->mask);
+	do_drv_read(cmd);
+	set_cpus_allowed(current, saved_mask);
+}
+
+static void drv_write(struct drv_cmd *cmd)
+{
+	cpumask_t saved_mask = current->cpus_allowed;
+	unsigned int i;
+
+	for_each_cpu_mask(i, cmd->mask) {
+		set_cpus_allowed(current, cpumask_of_cpu(i));
+		do_drv_write(cmd);
+	}
+
+	set_cpus_allowed(current, saved_mask);
+	return;
+}
+
+static u32 get_cur_val(cpumask_t mask)
+{
+	struct acpi_processor_performance *perf;
+	struct drv_cmd cmd;
+
+	if (unlikely(cpus_empty(mask)))
+		return 0;
+
+	switch (drv_data[first_cpu(mask)]->cpu_feature) {
+	case SYSTEM_INTEL_MSR_CAPABLE:
+		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
+		cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
+		break;
+	case SYSTEM_IO_CAPABLE:
+		cmd.type = SYSTEM_IO_CAPABLE;
+		perf = drv_data[first_cpu(mask)]->acpi_data;
+		cmd.addr.io.port = perf->control_register.address;
+		cmd.addr.io.bit_width = perf->control_register.bit_width;
+		break;
+	default:
+		return 0;
+	}
+
+	cmd.mask = mask;
+
+	drv_read(&cmd);
+
+	dprintk("get_cur_val = %u\n", cmd.val);
+
+	return cmd.val;
+}
+
+/*
+ * Return the measured active (C0) frequency on this CPU since last call
+ * to this function.
+ * Input: cpu number
+ * Return: Average CPU frequency in terms of max frequency (zero on error)
+ *
+ * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
+ * over a period of time, while CPU is in C0 state.
+ * IA32_MPERF counts at the rate of max advertised frequency
+ * IA32_APERF counts at the rate of actual CPU frequency
+ * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
+ * no meaning should be associated with absolute values of these MSRs.
+ */
+static unsigned int get_measured_perf(unsigned int cpu)
+{
+	union {
+		struct {
+			u32 lo;
+			u32 hi;
+		} split;
+		u64 whole;
+	} aperf_cur, mperf_cur;
+
+	cpumask_t saved_mask;
+	unsigned int perf_percent;
+	unsigned int retval;
+
+	saved_mask = current->cpus_allowed;
+	set_cpus_allowed(current, cpumask_of_cpu(cpu));
+	if (get_cpu() != cpu) {
+		/* We were not able to run on requested processor */
+		put_cpu();
+		return 0;
+	}
+
+	rdmsr(MSR_IA32_APERF, aperf_cur.split.lo, aperf_cur.split.hi);
+	rdmsr(MSR_IA32_MPERF, mperf_cur.split.lo, mperf_cur.split.hi);
+
+	wrmsr(MSR_IA32_APERF, 0,0);
+	wrmsr(MSR_IA32_MPERF, 0,0);
+
+#ifdef __i386__
 	/*
-	 * Assume the write went through when acpi_pstate_strict is not used.
-	 * As read status_register is an expensive operation and there
-	 * are no specific error cases where an IO port write will fail.
+	 * We dont want to do 64 bit divide with 32 bit kernel
+	 * Get an approximate value. Return failure in case we cannot get
+	 * an approximate value.
 	 */
-	if (acpi_pstate_strict) {
-		/* Then we read the 'status_register' and compare the value
-		 * with the target state's 'status' to make sure the
-		 * transition was successful.
-		 * Note that we'll poll for up to 1ms (100 cycles of 10us)
-		 * before giving up.
-		 */
-
-		port = perf->status_register.address;
-		bit_width = perf->status_register.bit_width;
+	if (unlikely(aperf_cur.split.hi || mperf_cur.split.hi)) {
+		int shift_count;
+		u32 h;
+
+		h = max_t(u32, aperf_cur.split.hi, mperf_cur.split.hi);
+		shift_count = fls(h);
+
+		aperf_cur.whole >>= shift_count;
+		mperf_cur.whole >>= shift_count;
+	}
 
-		dprintk("Looking for 0x%08x from port 0x%04x\n",
-			(u32) perf->states[state].status, port);
-
-		for (i = 0; i < 100; i++) {
-			ret = acpi_processor_read_port(port, bit_width, &value);
-			if (ret) {
-				dprintk("Invalid port width 0x%04x\n", bit_width);
-				return (ret);
-			}
-			if (value == (u32) perf->states[state].status)
-				break;
-			udelay(10);
-		}
-	} else {
-		value = (u32) perf->states[state].status;
+	if (((unsigned long)(-1) / 100) < aperf_cur.split.lo) {
+		int shift_count = 7;
+		aperf_cur.split.lo >>= shift_count;
+		mperf_cur.split.lo >>= shift_count;
+	}
+
+	if (aperf_cur.split.lo && mperf_cur.split.lo)
+		perf_percent = (aperf_cur.split.lo * 100) / mperf_cur.split.lo;
+	else
+		perf_percent = 0;
+
+#else
+	if (unlikely(((unsigned long)(-1) / 100) < aperf_cur.whole)) {
+		int shift_count = 7;
+		aperf_cur.whole >>= shift_count;
+		mperf_cur.whole >>= shift_count;
 	}
 
-	if (unlikely(value != (u32) perf->states[state].status)) {
-		printk(KERN_WARNING "acpi-cpufreq: Transition failed\n");
-		retval = -ENODEV;
-		return (retval);
+	if (aperf_cur.whole && mperf_cur.whole)
+		perf_percent = (aperf_cur.whole * 100) / mperf_cur.whole;
+	else
+		perf_percent = 0;
+
+#endif
+
+	retval = drv_data[cpu]->max_freq * perf_percent / 100;
+
+	put_cpu();
+	set_cpus_allowed(current, saved_mask);
+
+	dprintk("cpu %d: performance percent %d\n", cpu, perf_percent);
+	return retval;
+}
+
+static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
+{
+	struct acpi_cpufreq_data *data = drv_data[cpu];
+	unsigned int freq;
+
+	dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
+
+	if (unlikely(data == NULL ||
+		     data->acpi_data == NULL || data->freq_table == NULL)) {
+		return 0;
 	}
 
-	dprintk("Transition successful after %d microseconds\n", i * 10);
+	freq = extract_freq(get_cur_val(cpumask_of_cpu(cpu)), data);
+	dprintk("cur freq = %u\n", freq);
 
-	perf->state = state;
-	return (retval);
+	return freq;
 }
 
+static unsigned int check_freqs(cpumask_t mask, unsigned int freq,
+				struct acpi_cpufreq_data *data)
+{
+	unsigned int cur_freq;
+	unsigned int i;
+
+	for (i=0; i<100; i++) {
+		cur_freq = extract_freq(get_cur_val(mask), data);
+		if (cur_freq == freq)
+			return 1;
+		udelay(10);
+	}
+	return 0;
+}
 
-static int
-acpi_cpufreq_target (
-	struct cpufreq_policy *policy,
-	unsigned int target_freq,
-	unsigned int relation)
+static int acpi_cpufreq_target(struct cpufreq_policy *policy,
+			       unsigned int target_freq, unsigned int relation)
 {
-	struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu];
+	struct acpi_cpufreq_data *data = drv_data[policy->cpu];
 	struct acpi_processor_performance *perf;
 	struct cpufreq_freqs freqs;
 	cpumask_t online_policy_cpus;
-	cpumask_t saved_mask;
-	cpumask_t set_mask;
-	cpumask_t covered_cpus;
-	unsigned int cur_state = 0;
-	unsigned int next_state = 0;
-	unsigned int result = 0;
-	unsigned int j;
-	unsigned int tmp;
+	struct drv_cmd cmd;
+	unsigned int msr;
+	unsigned int next_state = 0;	  /* Index into freq_table */
+	unsigned int next_perf_state = 0; /* Index into perf table */
+	unsigned int i;
+	int result = 0;
 
-	dprintk("acpi_cpufreq_setpolicy\n");
+	dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
 
-	result = cpufreq_frequency_table_target(policy,
-			data->freq_table,
-			target_freq,
-			relation,
-			&next_state);
-	if (unlikely(result))
-		return (result);
+	if (unlikely(data == NULL ||
+		     data->acpi_data == NULL || data->freq_table == NULL)) {
+		return -ENODEV;
+	}
 
 	perf = data->acpi_data;
-	cur_state = perf->state;
-	freqs.old = data->freq_table[cur_state].frequency;
-	freqs.new = data->freq_table[next_state].frequency;
+	result = cpufreq_frequency_table_target(policy,
+						data->freq_table,
+						target_freq,
+						relation, &next_state);
+	if (unlikely(result))
+		return -ENODEV;
 
 #ifdef CONFIG_HOTPLUG_CPU
 	/* cpufreq holds the hotplug lock, so we are safe from here on */
@@ -231,106 +400,85 @@ acpi_cpufreq_target (
231 online_policy_cpus = policy->cpus; 400 online_policy_cpus = policy->cpus;
232#endif 401#endif
233 402
234 for_each_cpu_mask(j, online_policy_cpus) { 403 next_perf_state = data->freq_table[next_state].index;
235 freqs.cpu = j; 404 if (perf->state == next_perf_state) {
236 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 405 if (unlikely(data->resume)) {
406 dprintk("Called after resume, resetting to P%d\n",
407 next_perf_state);
408 data->resume = 0;
409 } else {
410 dprintk("Already at target state (P%d)\n",
411 next_perf_state);
412 return 0;
413 }
237 } 414 }
238 415
239 /* 416 switch (data->cpu_feature) {
240 * We need to call driver->target() on all or any CPU in 417 case SYSTEM_INTEL_MSR_CAPABLE:
241 * policy->cpus, depending on policy->shared_type. 418 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
242 */ 419 cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
243 saved_mask = current->cpus_allowed; 420 msr =
244 cpus_clear(covered_cpus); 421 (u32) perf->states[next_perf_state].
245 for_each_cpu_mask(j, online_policy_cpus) { 422 control & INTEL_MSR_RANGE;
246 /* 423 cmd.val = get_cur_val(online_policy_cpus);
247 * Support for SMP systems. 424 cmd.val = (cmd.val & ~INTEL_MSR_RANGE) | msr;
248 * Make sure we are running on CPU that wants to change freq 425 break;
249 */ 426 case SYSTEM_IO_CAPABLE:
250 cpus_clear(set_mask); 427 cmd.type = SYSTEM_IO_CAPABLE;
251 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) 428 cmd.addr.io.port = perf->control_register.address;
252 cpus_or(set_mask, set_mask, online_policy_cpus); 429 cmd.addr.io.bit_width = perf->control_register.bit_width;
253 else 430 cmd.val = (u32) perf->states[next_perf_state].control;
254 cpu_set(j, set_mask); 431 break;
255 432 default:
256 set_cpus_allowed(current, set_mask); 433 return -ENODEV;
257 if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { 434 }
258 dprintk("couldn't limit to CPUs in this domain\n");
259 result = -EAGAIN;
260 break;
261 }
262 435
263 result = acpi_processor_set_performance (data, j, next_state); 436 cpus_clear(cmd.mask);
264 if (result) {
265 result = -EAGAIN;
266 break;
267 }
268 437
269 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) 438 if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
270 break; 439 cmd.mask = online_policy_cpus;
271 440 else
272 cpu_set(j, covered_cpus); 441 cpu_set(policy->cpu, cmd.mask);
273 }
274 442
275 for_each_cpu_mask(j, online_policy_cpus) { 443 freqs.old = perf->states[perf->state].core_frequency * 1000;
276 freqs.cpu = j; 444 freqs.new = data->freq_table[next_state].frequency;
277 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 445 for_each_cpu_mask(i, cmd.mask) {
446 freqs.cpu = i;
447 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
278 } 448 }
279 449
280 if (unlikely(result)) { 450 drv_write(&cmd);
281 /*
282 * We have failed halfway through the frequency change.
283 * We have sent callbacks to online_policy_cpus and
284 * acpi_processor_set_performance() has been called on
285 * coverd_cpus. Best effort undo..
286 */
287
288 if (!cpus_empty(covered_cpus)) {
289 for_each_cpu_mask(j, covered_cpus) {
290 policy->cpu = j;
291 acpi_processor_set_performance (data,
292 j,
293 cur_state);
294 }
295 }
296 451
297 tmp = freqs.new; 452 if (acpi_pstate_strict) {
298 freqs.new = freqs.old; 453 if (!check_freqs(cmd.mask, freqs.new, data)) {
299 freqs.old = tmp; 454 dprintk("acpi_cpufreq_target failed (%d)\n",
300 for_each_cpu_mask(j, online_policy_cpus) { 455 policy->cpu);
301 freqs.cpu = j; 456 return -EAGAIN;
302 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
303 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
304 } 457 }
305 } 458 }
306 459
307 set_cpus_allowed(current, saved_mask); 460 for_each_cpu_mask(i, cmd.mask) {
308 return (result); 461 freqs.cpu = i;
309} 462 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
463 }
464 perf->state = next_perf_state;
310 465
466 return result;
467}
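
On the MSR path above, only the low 16 bits of MSR_IA32_PERF_CTL (INTEL_MSR_RANGE) carry the P-state request, so the driver reads the current value and rewrites just that field. A standalone sketch of that read-modify-write; rdmsr64()/wrmsr64() and the register value are invented stand-ins for the kernel's rdmsrl()/wrmsrl():

	#include <stdio.h>
	#include <stdint.h>

	#define INTEL_MSR_RANGE 0xffffULL	/* P-state field: low 16 bits */

	static uint64_t fake_perf_ctl = 0x1d0000000000062aULL;	/* invented */

	static uint64_t rdmsr64(void) { return fake_perf_ctl; }
	static void wrmsr64(uint64_t v) { fake_perf_ctl = v; }

	/* Replace the P-state request field, preserving all other bits. */
	static void set_pstate(uint16_t control)
	{
		uint64_t val = rdmsr64();

		val = (val & ~INTEL_MSR_RANGE) | (control & INTEL_MSR_RANGE);
		wrmsr64(val);
	}

	int main(void)
	{
		set_pstate(0x0928);	/* e.g. a _PSS control value's low bits */
		printf("PERF_CTL = %#llx\n", (unsigned long long)fake_perf_ctl);
		return 0;
	}
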
311 468
312static int 469static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
313acpi_cpufreq_verify (
314 struct cpufreq_policy *policy)
315{ 470{
316 unsigned int result = 0; 471 struct acpi_cpufreq_data *data = drv_data[policy->cpu];
317 struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu];
318 472
319 dprintk("acpi_cpufreq_verify\n"); 473 dprintk("acpi_cpufreq_verify\n");
320 474
321 result = cpufreq_frequency_table_verify(policy, 475 return cpufreq_frequency_table_verify(policy, data->freq_table);
322 data->freq_table);
323
324 return (result);
325} 476}
326 477
327
328static unsigned long 478static unsigned long
329acpi_cpufreq_guess_freq ( 479acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
330 struct cpufreq_acpi_io *data,
331 unsigned int cpu)
332{ 480{
333 struct acpi_processor_performance *perf = data->acpi_data; 481 struct acpi_processor_performance *perf = data->acpi_data;
334 482
335 if (cpu_khz) { 483 if (cpu_khz) {
336 /* search the closest match to cpu_khz */ 484 /* search the closest match to cpu_khz */
@@ -338,16 +486,16 @@ acpi_cpufreq_guess_freq (
338 unsigned long freq; 486 unsigned long freq;
339 unsigned long freqn = perf->states[0].core_frequency * 1000; 487 unsigned long freqn = perf->states[0].core_frequency * 1000;
340 488
341 for (i = 0; i < (perf->state_count - 1); i++) { 489 for (i=0; i<(perf->state_count-1); i++) {
342 freq = freqn; 490 freq = freqn;
343 freqn = perf->states[i+1].core_frequency * 1000; 491 freqn = perf->states[i+1].core_frequency * 1000;
344 if ((2 * cpu_khz) > (freqn + freq)) { 492 if ((2 * cpu_khz) > (freqn + freq)) {
345 perf->state = i; 493 perf->state = i;
346 return (freq); 494 return freq;
347 } 495 }
348 } 496 }
349 perf->state = perf->state_count - 1; 497 perf->state = perf->state_count-1;
350 return (freqn); 498 return freqn;
351 } else { 499 } else {
352 /* assume CPU is at P0... */ 500 /* assume CPU is at P0... */
353 perf->state = 0; 501 perf->state = 0;
@@ -355,7 +503,6 @@ acpi_cpufreq_guess_freq (
355 } 503 }
356} 504}
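
The loop above walks the descending P-state frequencies and stops at the first state closer to cpu_khz than its lower neighbour; `(2 * cpu_khz) > (freqn + freq)` is a midpoint test. A worked toy version with invented frequencies:

	#include <stdio.h>

	int main(void)
	{
		/* Invented example: measured TSC rate, descending P-states */
		unsigned long cpu_khz = 1495000;
		unsigned long freq[] = { 1600000, 1400000, 1200000, 800000 };
		int i, n = 4;

		for (i = 0; i < n - 1; i++)
			if (2 * cpu_khz > freq[i] + freq[i + 1])
				break;	/* above the midpoint: state i wins */

		printf("closest: P%d @ %lu kHz\n", i, freq[i]); /* P1 @ 1400000 */
		return 0;
	}
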
357 505
358
359/* 506/*
360 * acpi_cpufreq_early_init - initialize ACPI P-States library 507 * acpi_cpufreq_early_init - initialize ACPI P-States library
361 * 508 *
@@ -364,30 +511,34 @@ acpi_cpufreq_guess_freq (
364 * do _PDC and _PSD and find out the processor dependency for the 511 * do _PDC and _PSD and find out the processor dependency for the
365 * actual init that will happen later... 512 * actual init that will happen later...
366 */ 513 */
367static int acpi_cpufreq_early_init_acpi(void) 514static int acpi_cpufreq_early_init(void)
368{ 515{
369 struct acpi_processor_performance *data; 516 struct acpi_processor_performance *data;
370 unsigned int i, j; 517 cpumask_t covered;
518 unsigned int i, j;
371 519
372 dprintk("acpi_cpufreq_early_init\n"); 520 dprintk("acpi_cpufreq_early_init\n");
373 521
374 for_each_possible_cpu(i) { 522 for_each_possible_cpu(i) {
375 data = kzalloc(sizeof(struct acpi_processor_performance), 523 data = kzalloc(sizeof(struct acpi_processor_performance),
376 GFP_KERNEL); 524 GFP_KERNEL);
377 if (!data) { 525 if (!data) {
378 for_each_possible_cpu(j) { 526 for_each_cpu_mask(j, covered) {
379 kfree(acpi_perf_data[j]); 527 kfree(acpi_perf_data[j]);
380 acpi_perf_data[j] = NULL; 528 acpi_perf_data[j] = NULL;
381 } 529 }
382 return (-ENOMEM); 530 return -ENOMEM;
383 } 531 }
384 acpi_perf_data[i] = data; 532 acpi_perf_data[i] = data;
533 cpu_set(i, covered);
385 } 534 }
386 535
387 /* Do initialization in ACPI core */ 536 /* Do initialization in ACPI core */
388 return acpi_processor_preregister_performance(acpi_perf_data); 537 acpi_processor_preregister_performance(acpi_perf_data);
538 return 0;
389} 539}
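
The `covered` mask above tightens the error path: on allocation failure only the slots actually filled so far are freed, where the old code walked every possible CPU. The same pattern in miniature, with a plain bitmask standing in for cpumask_t:

	#include <stdlib.h>

	#define NCPUS 8

	static void *perf_data[NCPUS];

	static int early_init(void)
	{
		unsigned long covered = 0;	/* bitmask stand-in for cpumask_t */
		int i, j;

		for (i = 0; i < NCPUS; i++) {
			perf_data[i] = calloc(1, 64);
			if (!perf_data[i]) {
				/* free exactly what was allocated, no more */
				for (j = 0; j < NCPUS; j++) {
					if (!(covered & (1UL << j)))
						continue;
					free(perf_data[j]);
					perf_data[j] = NULL;
				}
				return -1;	/* -ENOMEM in the kernel */
			}
			covered |= 1UL << i;
		}
		return 0;
	}

	int main(void)
	{
		return early_init();
	}
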
390 540
541#ifdef CONFIG_SMP
391/* 542/*
392 * Some BIOSes do SW_ANY coordination internally, either set it up in hw 543 * Some BIOSes do SW_ANY coordination internally, either set it up in hw
393 * or do it in BIOS firmware and won't inform about it to OS. If not 544 * or do it in BIOS firmware and won't inform about it to OS. If not
@@ -414,39 +565,42 @@ static struct dmi_system_id sw_any_bug_dmi_table[] = {
414 }, 565 },
415 { } 566 { }
416}; 567};
568#endif
417 569
418static int 570static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
419acpi_cpufreq_cpu_init (
420 struct cpufreq_policy *policy)
421{ 571{
422 unsigned int i; 572 unsigned int i;
423 unsigned int cpu = policy->cpu; 573 unsigned int valid_states = 0;
424 struct cpufreq_acpi_io *data; 574 unsigned int cpu = policy->cpu;
425 unsigned int result = 0; 575 struct acpi_cpufreq_data *data;
576 unsigned int result = 0;
426 struct cpuinfo_x86 *c = &cpu_data[policy->cpu]; 577 struct cpuinfo_x86 *c = &cpu_data[policy->cpu];
427 struct acpi_processor_performance *perf; 578 struct acpi_processor_performance *perf;
428 579
429 dprintk("acpi_cpufreq_cpu_init\n"); 580 dprintk("acpi_cpufreq_cpu_init\n");
430 581
431 if (!acpi_perf_data[cpu]) 582 if (!acpi_perf_data[cpu])
432 return (-ENODEV); 583 return -ENODEV;
433 584
434 data = kzalloc(sizeof(struct cpufreq_acpi_io), GFP_KERNEL); 585 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
435 if (!data) 586 if (!data)
436 return (-ENOMEM); 587 return -ENOMEM;
437 588
438 data->acpi_data = acpi_perf_data[cpu]; 589 data->acpi_data = acpi_perf_data[cpu];
439 acpi_io_data[cpu] = data; 590 drv_data[cpu] = data;
440 591
441 result = acpi_processor_register_performance(data->acpi_data, cpu); 592 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
593 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
442 594
595 result = acpi_processor_register_performance(data->acpi_data, cpu);
443 if (result) 596 if (result)
444 goto err_free; 597 goto err_free;
445 598
446 perf = data->acpi_data; 599 perf = data->acpi_data;
447 policy->shared_type = perf->shared_type; 600 policy->shared_type = perf->shared_type;
601
448 /* 602 /*
449 * Will let policy->cpus know about dependency only when software 603 * Will let policy->cpus know about dependency only when software
450 * coordination is required. 604 * coordination is required.
451 */ 605 */
452 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || 606 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
@@ -462,10 +616,6 @@ acpi_cpufreq_cpu_init (
462 } 616 }
463#endif 617#endif
464 618
465 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
466 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
467 }
468
469 /* capability check */ 619 /* capability check */
470 if (perf->state_count <= 1) { 620 if (perf->state_count <= 1) {
471 dprintk("No P-States\n"); 621 dprintk("No P-States\n");
@@ -473,17 +623,33 @@ acpi_cpufreq_cpu_init (
473 goto err_unreg; 623 goto err_unreg;
474 } 624 }
475 625
476 if ((perf->control_register.space_id != ACPI_ADR_SPACE_SYSTEM_IO) || 626 if (perf->control_register.space_id != perf->status_register.space_id) {
477 (perf->status_register.space_id != ACPI_ADR_SPACE_SYSTEM_IO)) {
478 dprintk("Unsupported address space [%d, %d]\n",
479 (u32) (perf->control_register.space_id),
480 (u32) (perf->status_register.space_id));
481 result = -ENODEV; 627 result = -ENODEV;
482 goto err_unreg; 628 goto err_unreg;
483 } 629 }
484 630
485 /* alloc freq_table */ 631 switch (perf->control_register.space_id) {
486 data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) * (perf->state_count + 1), GFP_KERNEL); 632 case ACPI_ADR_SPACE_SYSTEM_IO:
633 dprintk("SYSTEM IO addr space\n");
634 data->cpu_feature = SYSTEM_IO_CAPABLE;
635 break;
636 case ACPI_ADR_SPACE_FIXED_HARDWARE:
637 dprintk("HARDWARE addr space\n");
638 if (!check_est_cpu(cpu)) {
639 result = -ENODEV;
640 goto err_unreg;
641 }
642 data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
643 break;
644 default:
645 dprintk("Unknown addr space %d\n",
646 (u32) (perf->control_register.space_id));
647 result = -ENODEV;
648 goto err_unreg;
649 }
650
651 data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) *
652 (perf->state_count+1), GFP_KERNEL);
487 if (!data->freq_table) { 653 if (!data->freq_table) {
488 result = -ENOMEM; 654 result = -ENOMEM;
489 goto err_unreg; 655 goto err_unreg;
@@ -492,129 +658,141 @@ acpi_cpufreq_cpu_init (
492 /* detect transition latency */ 658 /* detect transition latency */
493 policy->cpuinfo.transition_latency = 0; 659 policy->cpuinfo.transition_latency = 0;
494 for (i=0; i<perf->state_count; i++) { 660 for (i=0; i<perf->state_count; i++) {
495 if ((perf->states[i].transition_latency * 1000) > policy->cpuinfo.transition_latency) 661 if ((perf->states[i].transition_latency * 1000) >
496 policy->cpuinfo.transition_latency = perf->states[i].transition_latency * 1000; 662 policy->cpuinfo.transition_latency)
663 policy->cpuinfo.transition_latency =
664 perf->states[i].transition_latency * 1000;
497 } 665 }
498 policy->governor = CPUFREQ_DEFAULT_GOVERNOR; 666 policy->governor = CPUFREQ_DEFAULT_GOVERNOR;
499 667
500 /* The current speed is unknown and not detectable by ACPI... */ 668 data->max_freq = perf->states[0].core_frequency * 1000;
501 policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
502
503 /* table init */ 669 /* table init */
504 for (i=0; i<=perf->state_count; i++) 670 for (i=0; i<perf->state_count; i++) {
505 { 671 if (i>0 && perf->states[i].core_frequency ==
506 data->freq_table[i].index = i; 672 perf->states[i-1].core_frequency)
507 if (i<perf->state_count) 673 continue;
508 data->freq_table[i].frequency = perf->states[i].core_frequency * 1000; 674
509 else 675 data->freq_table[valid_states].index = i;
510 data->freq_table[i].frequency = CPUFREQ_TABLE_END; 676 data->freq_table[valid_states].frequency =
677 perf->states[i].core_frequency * 1000;
678 valid_states++;
511 } 679 }
680 data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
681 perf->state = 0;
512 682
513 result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table); 683 result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
514 if (result) { 684 if (result)
515 goto err_freqfree; 685 goto err_freqfree;
686
687 switch (perf->control_register.space_id) {
688 case ACPI_ADR_SPACE_SYSTEM_IO:
689 /* Current speed is unknown and not detectable by IO port */
690 policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
691 break;
692 case ACPI_ADR_SPACE_FIXED_HARDWARE:
693 acpi_cpufreq_driver.get = get_cur_freq_on_cpu;
694 policy->cur = get_cur_freq_on_cpu(cpu);
695 break;
696 default:
697 break;
516 } 698 }
517 699
518 /* notify BIOS that we exist */ 700 /* notify BIOS that we exist */
519 acpi_processor_notify_smm(THIS_MODULE); 701 acpi_processor_notify_smm(THIS_MODULE);
520 702
521 printk(KERN_INFO "acpi-cpufreq: CPU%u - ACPI performance management activated.\n", 703 /* Check for APERF/MPERF support in hardware */
522 cpu); 704 if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) {
705 unsigned int ecx;
706 ecx = cpuid_ecx(6);
707 if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
708 acpi_cpufreq_driver.getavg = get_measured_perf;
709 }
710
711 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
523 for (i = 0; i < perf->state_count; i++) 712 for (i = 0; i < perf->state_count; i++)
524 dprintk(" %cP%d: %d MHz, %d mW, %d uS\n", 713 dprintk(" %cP%d: %d MHz, %d mW, %d uS\n",
525 (i == perf->state?'*':' '), i, 714 (i == perf->state ? '*' : ' '), i,
526 (u32) perf->states[i].core_frequency, 715 (u32) perf->states[i].core_frequency,
527 (u32) perf->states[i].power, 716 (u32) perf->states[i].power,
528 (u32) perf->states[i].transition_latency); 717 (u32) perf->states[i].transition_latency);
529 718
530 cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu); 719 cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
531 720
532 /* 721 /*
533 * the first call to ->target() should result in us actually 722 * the first call to ->target() should result in us actually
534 * writing something to the appropriate registers. 723 * writing something to the appropriate registers.
535 */ 724 */
536 data->resume = 1; 725 data->resume = 1;
537
538 return (result);
539 726
540 err_freqfree: 727 return result;
728
729err_freqfree:
541 kfree(data->freq_table); 730 kfree(data->freq_table);
542 err_unreg: 731err_unreg:
543 acpi_processor_unregister_performance(perf, cpu); 732 acpi_processor_unregister_performance(perf, cpu);
544 err_free: 733err_free:
545 kfree(data); 734 kfree(data);
546 acpi_io_data[cpu] = NULL; 735 drv_data[cpu] = NULL;
547 736
548 return (result); 737 return result;
549} 738}
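
The table construction above drops P-states that repeat the previous core frequency, so cpufreq sees each frequency once, while `index` still records the original P-state number so target() can map back. A toy rendition with invented values:

	#include <stdio.h>

	#define TABLE_END (~0u)		/* stand-in for CPUFREQ_TABLE_END */

	struct entry {
		unsigned int index;	/* original P-state number */
		unsigned int frequency;	/* kHz */
	};

	int main(void)
	{
		unsigned int khz[] = { 1600000, 1600000, 1400000, 1200000 };
		struct entry table[5];
		unsigned int i, valid = 0;

		for (i = 0; i < 4; i++) {
			if (i > 0 && khz[i] == khz[i - 1])
				continue;	/* duplicate frequency: skip */
			table[valid].index = i;
			table[valid].frequency = khz[i];
			valid++;
		}
		table[valid].frequency = TABLE_END;

		for (i = 0; table[i].frequency != TABLE_END; i++)
			printf("entry %u -> P%u @ %u kHz\n",
			       i, table[i].index, table[i].frequency);
		return 0;
	}
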
550 739
551 740static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
552static int
553acpi_cpufreq_cpu_exit (
554 struct cpufreq_policy *policy)
555{ 741{
556 struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; 742 struct acpi_cpufreq_data *data = drv_data[policy->cpu];
557
558 743
559 dprintk("acpi_cpufreq_cpu_exit\n"); 744 dprintk("acpi_cpufreq_cpu_exit\n");
560 745
561 if (data) { 746 if (data) {
562 cpufreq_frequency_table_put_attr(policy->cpu); 747 cpufreq_frequency_table_put_attr(policy->cpu);
563 acpi_io_data[policy->cpu] = NULL; 748 drv_data[policy->cpu] = NULL;
564 acpi_processor_unregister_performance(data->acpi_data, policy->cpu); 749 acpi_processor_unregister_performance(data->acpi_data,
750 policy->cpu);
565 kfree(data); 751 kfree(data);
566 } 752 }
567 753
568 return (0); 754 return 0;
569} 755}
570 756
571static int 757static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
572acpi_cpufreq_resume (
573 struct cpufreq_policy *policy)
574{ 758{
575 struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; 759 struct acpi_cpufreq_data *data = drv_data[policy->cpu];
576
577 760
578 dprintk("acpi_cpufreq_resume\n"); 761 dprintk("acpi_cpufreq_resume\n");
579 762
580 data->resume = 1; 763 data->resume = 1;
581 764
582 return (0); 765 return 0;
583} 766}
584 767
585 768static struct freq_attr *acpi_cpufreq_attr[] = {
586static struct freq_attr* acpi_cpufreq_attr[] = {
587 &cpufreq_freq_attr_scaling_available_freqs, 769 &cpufreq_freq_attr_scaling_available_freqs,
588 NULL, 770 NULL,
589}; 771};
590 772
591static struct cpufreq_driver acpi_cpufreq_driver = { 773static struct cpufreq_driver acpi_cpufreq_driver = {
592 .verify = acpi_cpufreq_verify, 774 .verify = acpi_cpufreq_verify,
593 .target = acpi_cpufreq_target, 775 .target = acpi_cpufreq_target,
594 .init = acpi_cpufreq_cpu_init, 776 .init = acpi_cpufreq_cpu_init,
595 .exit = acpi_cpufreq_cpu_exit, 777 .exit = acpi_cpufreq_cpu_exit,
596 .resume = acpi_cpufreq_resume, 778 .resume = acpi_cpufreq_resume,
597 .name = "acpi-cpufreq", 779 .name = "acpi-cpufreq",
598 .owner = THIS_MODULE, 780 .owner = THIS_MODULE,
599 .attr = acpi_cpufreq_attr, 781 .attr = acpi_cpufreq_attr,
600}; 782};
601 783
602 784static int __init acpi_cpufreq_init(void)
603static int __init
604acpi_cpufreq_init (void)
605{ 785{
606 dprintk("acpi_cpufreq_init\n"); 786 dprintk("acpi_cpufreq_init\n");
607 787
608 acpi_cpufreq_early_init_acpi(); 788 acpi_cpufreq_early_init();
609 789
610 return cpufreq_register_driver(&acpi_cpufreq_driver); 790 return cpufreq_register_driver(&acpi_cpufreq_driver);
611} 791}
612 792
613 793static void __exit acpi_cpufreq_exit(void)
614static void __exit
615acpi_cpufreq_exit (void)
616{ 794{
617 unsigned int i; 795 unsigned int i;
618 dprintk("acpi_cpufreq_exit\n"); 796 dprintk("acpi_cpufreq_exit\n");
619 797
620 cpufreq_unregister_driver(&acpi_cpufreq_driver); 798 cpufreq_unregister_driver(&acpi_cpufreq_driver);
@@ -627,7 +805,9 @@ acpi_cpufreq_exit (void)
627} 805}
628 806
629module_param(acpi_pstate_strict, uint, 0644); 807module_param(acpi_pstate_strict, uint, 0644);
630MODULE_PARM_DESC(acpi_pstate_strict, "value 0 or non-zero. non-zero -> strict ACPI checks are performed during frequency changes."); 808MODULE_PARM_DESC(acpi_pstate_strict,
809 "value 0 or non-zero. non-zero -> strict ACPI checks are "
810 "performed during frequency changes.");
631 811
632late_initcall(acpi_cpufreq_init); 812late_initcall(acpi_cpufreq_init);
633module_exit(acpi_cpufreq_exit); 813module_exit(acpi_cpufreq_exit);
diff --git a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c
index 92afa3bc84f1..6667e9cceb9f 100644
--- a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c
@@ -447,7 +447,6 @@ static int __init cpufreq_gx_init(void)
447 int ret; 447 int ret;
448 struct gxfreq_params *params; 448 struct gxfreq_params *params;
449 struct pci_dev *gx_pci; 449 struct pci_dev *gx_pci;
450 u32 class_rev;
451 450
452 /* Test if we have the right hardware */ 451 /* Test if we have the right hardware */
453 if ((gx_pci = gx_detect_chipset()) == NULL) 452 if ((gx_pci = gx_detect_chipset()) == NULL)
@@ -472,8 +471,7 @@ static int __init cpufreq_gx_init(void)
472 pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2)); 471 pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
473 pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration)); 472 pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
474 pci_read_config_byte(params->cs55x0, PCI_MODOFF, &(params->off_duration)); 473 pci_read_config_byte(params->cs55x0, PCI_MODOFF, &(params->off_duration));
475 pci_read_config_dword(params->cs55x0, PCI_CLASS_REVISION, &class_rev); 474 pci_read_config_byte(params->cs55x0, PCI_REVISION_ID, &params->pci_rev);
476 params->pci_rev = class_rev && 0xff;
477 475
478 if ((ret = cpufreq_register_driver(&gx_suspmod_driver))) { 476 if ((ret = cpufreq_register_driver(&gx_suspmod_driver))) {
479 kfree(params); 477 kfree(params);
diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c
index 7233abe5d695..e940e00b96c9 100644
--- a/arch/i386/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c
@@ -52,6 +52,10 @@
52#define CPU_EZRA_T 4 52#define CPU_EZRA_T 4
53#define CPU_NEHEMIAH 5 53#define CPU_NEHEMIAH 5
54 54
55/* Flags */
56#define USE_ACPI_C3 (1 << 1)
57#define USE_NORTHBRIDGE (1 << 2)
58
55static int cpu_model; 59static int cpu_model;
56static unsigned int numscales=16; 60static unsigned int numscales=16;
57static unsigned int fsb; 61static unsigned int fsb;
@@ -68,7 +72,7 @@ static unsigned int minmult, maxmult;
68static int can_scale_voltage; 72static int can_scale_voltage;
69static struct acpi_processor *pr = NULL; 73static struct acpi_processor *pr = NULL;
70static struct acpi_processor_cx *cx = NULL; 74static struct acpi_processor_cx *cx = NULL;
71static int port22_en; 75static u8 longhaul_flags;
72 76
73/* Module parameters */ 77/* Module parameters */
74static int scale_voltage; 78static int scale_voltage;
@@ -80,7 +84,6 @@ static int ignore_latency;
80/* Clock ratios multiplied by 10 */ 84/* Clock ratios multiplied by 10 */
81static int clock_ratio[32]; 85static int clock_ratio[32];
82static int eblcr_table[32]; 86static int eblcr_table[32];
83static unsigned int highest_speed, lowest_speed; /* kHz */
84static int longhaul_version; 87static int longhaul_version;
85static struct cpufreq_frequency_table *longhaul_table; 88static struct cpufreq_frequency_table *longhaul_table;
86 89
@@ -178,7 +181,7 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index)
178 safe_halt(); 181 safe_halt();
179 /* Change frequency on next halt or sleep */ 182 /* Change frequency on next halt or sleep */
180 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); 183 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
181 if (port22_en) { 184 if (!cx_address) {
182 ACPI_FLUSH_CPU_CACHE(); 185 ACPI_FLUSH_CPU_CACHE();
183 /* Invoke C1 */ 186 /* Invoke C1 */
184 halt(); 187 halt();
@@ -189,7 +192,6 @@ static void do_powersaver(int cx_address, unsigned int clock_ratio_index)
189 /* Dummy op - must do something useless after P_LVL3 read */ 192 /* Dummy op - must do something useless after P_LVL3 read */
190 t = inl(acpi_fadt.xpm_tmr_blk.address); 193 t = inl(acpi_fadt.xpm_tmr_blk.address);
191 } 194 }
192
193 /* Disable bus ratio bit */ 195 /* Disable bus ratio bit */
194 local_irq_disable(); 196 local_irq_disable();
195 longhaul.bits.RevisionKey = longhaul.bits.RevisionID; 197 longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
@@ -243,15 +245,14 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
243 outb(0xFF,0xA1); /* Overkill */ 245 outb(0xFF,0xA1); /* Overkill */
244 outb(0xFE,0x21); /* TMR0 only */ 246 outb(0xFE,0x21); /* TMR0 only */
245 247
246 if (pr->flags.bm_control) { 248 if (longhaul_flags & USE_NORTHBRIDGE) {
249 /* Disable AGP and PCI arbiters */
250 outb(3, 0x22);
251 } else if ((pr != NULL) && pr->flags.bm_control) {
247 /* Disable bus master arbitration */ 252 /* Disable bus master arbitration */
248 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1, 253 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1,
249 ACPI_MTX_DO_NOT_LOCK); 254 ACPI_MTX_DO_NOT_LOCK);
250 } else if (port22_en) {
251 /* Disable AGP and PCI arbiters */
252 outb(3, 0x22);
253 } 255 }
254
255 switch (longhaul_version) { 256 switch (longhaul_version) {
256 257
257 /* 258 /*
@@ -278,22 +279,25 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
278 * to work in practice. 279 * to work in practice.
279 */ 280 */
280 case TYPE_POWERSAVER: 281 case TYPE_POWERSAVER:
281 /* Don't allow wakeup */ 282 if (longhaul_flags & USE_ACPI_C3) {
282 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0, 283 /* Don't allow wakeup */
283 ACPI_MTX_DO_NOT_LOCK); 284 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0,
284 do_powersaver(cx->address, clock_ratio_index); 285 ACPI_MTX_DO_NOT_LOCK);
286 do_powersaver(cx->address, clock_ratio_index);
287 } else {
288 do_powersaver(0, clock_ratio_index);
289 }
285 break; 290 break;
286 } 291 }
287 292
288 if (pr->flags.bm_control) { 293 if (longhaul_flags & USE_NORTHBRIDGE) {
294 /* Enable arbiters */
295 outb(0, 0x22);
296 } else if ((pr != NULL) && pr->flags.bm_control) {
289 /* Enable bus master arbitration */ 297 /* Enable bus master arbitration */
290 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0, 298 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0,
291 ACPI_MTX_DO_NOT_LOCK); 299 ACPI_MTX_DO_NOT_LOCK);
292 } else if (port22_en) {
293 /* Enable arbiters */
294 outb(0, 0x22);
295 } 300 }
296
297 outb(pic2_mask,0xA1); /* restore mask */ 301 outb(pic2_mask,0xA1); /* restore mask */
298 outb(pic1_mask,0x21); 302 outb(pic1_mask,0x21);
299 303
@@ -314,12 +318,12 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
314 318
315#define ROUNDING 0xf 319#define ROUNDING 0xf
316 320
317static int _guess(int guess) 321static int _guess(int guess, int mult)
318{ 322{
319 int target; 323 int target;
320 324
321 target = ((maxmult/10)*guess); 325 target = ((mult/10)*guess);
322 if (maxmult%10 != 0) 326 if (mult%10 != 0)
323 target += (guess/2); 327 target += (guess/2);
324 target += ROUNDING/2; 328 target += ROUNDING/2;
325 target &= ~ROUNDING; 329 target &= ~ROUNDING;
@@ -327,17 +331,17 @@ static int _guess(int guess)
327} 331}
328 332
329 333
330static int guess_fsb(void) 334static int guess_fsb(int mult)
331{ 335{
332 int speed = (cpu_khz/1000); 336 int speed = (cpu_khz/1000);
333 int i; 337 int i;
334 int speeds[3] = { 66, 100, 133 }; 338 int speeds[] = { 66, 100, 133, 200 };
335 339
336 speed += ROUNDING/2; 340 speed += ROUNDING/2;
337 speed &= ~ROUNDING; 341 speed &= ~ROUNDING;
338 342
339 for (i=0; i<3; i++) { 343 for (i=0; i<4; i++) {
340 if (_guess(speeds[i]) == speed) 344 if (_guess(speeds[i], mult) == speed)
341 return speeds[i]; 345 return speeds[i];
342 } 346 }
343 return 0; 347 return 0;
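
Since multipliers are stored scaled by ten, _guess() computes (mult/10)*fsb plus half an FSB for .5-ratio parts, then snaps to a 16 MHz grid (ROUNDING = 0xf) so measurement jitter cannot break the comparison. A worked standalone example for an assumed 9.5 x 133 MHz part:

	#include <stdio.h>

	#define ROUNDING 0xf

	static int round16(int mhz)
	{
		return (mhz + ROUNDING / 2) & ~ROUNDING;	/* 16 MHz grid */
	}

	static int guess(int fsb, int mult)	/* mult is 10x the real ratio */
	{
		int target = (mult / 10) * fsb;

		if (mult % 10)
			target += fsb / 2;	/* account for the .5 step */
		return round16(target);
	}

	int main(void)
	{
		int speeds[] = { 66, 100, 133, 200 };
		int cpu_mhz = 1263;	/* assumed: a 9.5 x 133 MHz CPU */
		int mult = 95, i;

		for (i = 0; i < 4; i++)
			if (guess(speeds[i], mult) == round16(cpu_mhz))
				printf("FSB is %d MHz\n", speeds[i]);	/* 133 */
		return 0;
	}
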
@@ -354,9 +358,7 @@ static int __init longhaul_get_ranges(void)
354 130, 150, 160, 140, -1, 155, -1, 145 }; 358 130, 150, 160, 140, -1, 155, -1, 145 };
355 unsigned int j, k = 0; 359 unsigned int j, k = 0;
356 union msr_longhaul longhaul; 360 union msr_longhaul longhaul;
357 unsigned long lo, hi; 361 int mult = 0;
358 unsigned int eblcr_fsb_table_v1[] = { 66, 133, 100, -1 };
359 unsigned int eblcr_fsb_table_v2[] = { 133, 100, -1, 66 };
360 362
361 switch (longhaul_version) { 363 switch (longhaul_version) {
362 case TYPE_LONGHAUL_V1: 364 case TYPE_LONGHAUL_V1:
@@ -364,30 +366,18 @@ static int __init longhaul_get_ranges(void)
364 /* Ugh, Longhaul v1 didn't have the min/max MSRs. 366 /* Ugh, Longhaul v1 didn't have the min/max MSRs.
365 Assume min=3.0x & max = whatever we booted at. */ 367 Assume min=3.0x & max = whatever we booted at. */
366 minmult = 30; 368 minmult = 30;
367 maxmult = longhaul_get_cpu_mult(); 369 maxmult = mult = longhaul_get_cpu_mult();
368 rdmsr (MSR_IA32_EBL_CR_POWERON, lo, hi);
369 invalue = (lo & (1<<18|1<<19)) >>18;
370 if (cpu_model==CPU_SAMUEL || cpu_model==CPU_SAMUEL2)
371 fsb = eblcr_fsb_table_v1[invalue];
372 else
373 fsb = guess_fsb();
374 break; 370 break;
375 371
376 case TYPE_POWERSAVER: 372 case TYPE_POWERSAVER:
377 /* Ezra-T */ 373 /* Ezra-T */
378 if (cpu_model==CPU_EZRA_T) { 374 if (cpu_model==CPU_EZRA_T) {
375 minmult = 30;
379 rdmsrl (MSR_VIA_LONGHAUL, longhaul.val); 376 rdmsrl (MSR_VIA_LONGHAUL, longhaul.val);
380 invalue = longhaul.bits.MaxMHzBR; 377 invalue = longhaul.bits.MaxMHzBR;
381 if (longhaul.bits.MaxMHzBR4) 378 if (longhaul.bits.MaxMHzBR4)
382 invalue += 16; 379 invalue += 16;
383 maxmult=ezra_t_multipliers[invalue]; 380 maxmult = mult = ezra_t_multipliers[invalue];
384
385 invalue = longhaul.bits.MinMHzBR;
386 if (longhaul.bits.MinMHzBR4 == 1)
387 minmult = 30;
388 else
389 minmult = ezra_t_multipliers[invalue];
390 fsb = eblcr_fsb_table_v2[longhaul.bits.MaxMHzFSB];
391 break; 381 break;
392 } 382 }
393 383
@@ -407,21 +397,16 @@ static int __init longhaul_get_ranges(void)
407 * But it works, so we don't grumble. 397 * But it works, so we don't grumble.
408 */ 398 */
409 minmult=40; 399 minmult=40;
410 maxmult=longhaul_get_cpu_mult(); 400 maxmult = mult = longhaul_get_cpu_mult();
411
412 /* Starting with the 1.2GHz parts, theres a 200MHz bus. */
413 if ((cpu_khz/1000) > 1200)
414 fsb = 200;
415 else
416 fsb = eblcr_fsb_table_v2[longhaul.bits.MaxMHzFSB];
417 break; 401 break;
418 } 402 }
419 } 403 }
404 fsb = guess_fsb(mult);
420 405
421 dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n", 406 dprintk ("MinMult:%d.%dx MaxMult:%d.%dx\n",
422 minmult/10, minmult%10, maxmult/10, maxmult%10); 407 minmult/10, minmult%10, maxmult/10, maxmult%10);
423 408
424 if (fsb == -1) { 409 if (fsb == 0) {
425 printk (KERN_INFO PFX "Invalid (reserved) FSB!\n"); 410 printk (KERN_INFO PFX "Invalid (reserved) FSB!\n");
426 return -EINVAL; 411 return -EINVAL;
427 } 412 }
@@ -583,6 +568,10 @@ static int enable_arbiter_disable(void)
583 if (dev == NULL) { 568 if (dev == NULL) {
584 reg = 0x76; 569 reg = 0x76;
585 dev = pci_find_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_862X_0, NULL); 570 dev = pci_find_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_862X_0, NULL);
571 /* Find CN400 V-Link host bridge */
572 if (dev == NULL)
573 dev = pci_find_device(PCI_VENDOR_ID_VIA, 0x7259, NULL);
574
586 } 575 }
587 if (dev != NULL) { 576 if (dev != NULL) {
588 /* Enable access to port 0x22 */ 577 /* Enable access to port 0x22 */
@@ -687,27 +676,32 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
687 /* Find ACPI data for processor */ 676 /* Find ACPI data for processor */
688 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX, 677 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, ACPI_UINT32_MAX,
689 &longhaul_walk_callback, NULL, (void *)&pr); 678 &longhaul_walk_callback, NULL, (void *)&pr);
690 if (pr == NULL)
691 goto err_acpi;
692 679
693 if (longhaul_version == TYPE_POWERSAVER) { 680 /* Check ACPI support for C3 state */
694 /* Check ACPI support for C3 state */ 681 if ((pr != NULL) && (longhaul_version == TYPE_POWERSAVER)) {
695 cx = &pr->power.states[ACPI_STATE_C3]; 682 cx = &pr->power.states[ACPI_STATE_C3];
696 if (cx->address > 0 && 683 if (cx->address > 0 &&
697 (cx->latency <= 1000 || ignore_latency != 0) ) { 684 (cx->latency <= 1000 || ignore_latency != 0) ) {
685 longhaul_flags |= USE_ACPI_C3;
698 goto print_support_type; 686 goto print_support_type;
699 } 687 }
700 } 688 }
689 /* Check if northbridge is friendly */
690 if (enable_arbiter_disable()) {
691 longhaul_flags |= USE_NORTHBRIDGE;
692 goto print_support_type;
693 }
694
695 /* No ACPI C3 or we can't use it */
701 /* Check ACPI support for bus master arbiter disable */ 696 /* Check ACPI support for bus master arbiter disable */
702 if (!pr->flags.bm_control) { 697 if ((pr == NULL) || !(pr->flags.bm_control)) {
703 if (enable_arbiter_disable()) { 698 printk(KERN_ERR PFX
704 port22_en = 1; 699 "No ACPI support. Unsupported northbridge.\n");
705 } else { 700 return -ENODEV;
706 goto err_acpi;
707 }
708 } 701 }
702
709print_support_type: 703print_support_type:
710 if (!port22_en) { 704 if (!(longhaul_flags & USE_NORTHBRIDGE)) {
711 printk (KERN_INFO PFX "Using ACPI support.\n"); 705 printk (KERN_INFO PFX "Using ACPI support.\n");
712 } else { 706 } else {
713 printk (KERN_INFO PFX "Using northbridge support.\n"); 707 printk (KERN_INFO PFX "Using northbridge support.\n");
@@ -732,10 +726,6 @@ print_support_type:
732 cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu); 726 cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu);
733 727
734 return 0; 728 return 0;
735
736err_acpi:
737 printk(KERN_ERR PFX "No ACPI support. No VT8601 or VT8623 northbridge. Aborting.\n");
738 return -ENODEV;
739} 729}
740 730
741static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy) 731static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
@@ -770,8 +760,8 @@ static int __init longhaul_init(void)
770 760
771#ifdef CONFIG_SMP 761#ifdef CONFIG_SMP
772 if (num_online_cpus() > 1) { 762 if (num_online_cpus() > 1) {
773 return -ENODEV;
774 printk(KERN_ERR PFX "More than 1 CPU detected, longhaul disabled.\n"); 763 printk(KERN_ERR PFX "More than 1 CPU detected, longhaul disabled.\n");
764 return -ENODEV;
775 } 765 }
776#endif 766#endif
777#ifdef CONFIG_X86_IO_APIC 767#ifdef CONFIG_X86_IO_APIC
@@ -783,8 +773,10 @@ static int __init longhaul_init(void)
783 switch (c->x86_model) { 773 switch (c->x86_model) {
784 case 6 ... 9: 774 case 6 ... 9:
785 return cpufreq_register_driver(&longhaul_driver); 775 return cpufreq_register_driver(&longhaul_driver);
776 case 10:
777 printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
786 default: 778 default:
787 printk (KERN_INFO PFX "Unknown VIA CPU. Contact davej@codemonkey.org.uk\n"); 779 ;;
788 } 780 }
789 781
790 return -ENODEV; 782 return -ENODEV;
diff --git a/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
index 304d2eaa4a1b..bec50170b75a 100644
--- a/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
@@ -163,29 +163,27 @@ static int cpufreq_p4_verify(struct cpufreq_policy *policy)
163 163
164static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) 164static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
165{ 165{
166 if ((c->x86 == 0x06) && (c->x86_model == 0x09)) { 166 if (c->x86 == 0x06) {
167 /* Pentium M (Banias) */ 167 if (cpu_has(c, X86_FEATURE_EST))
168 printk(KERN_WARNING PFX "Warning: Pentium M detected. " 168 printk(KERN_WARNING PFX "Warning: EST-capable CPU detected. "
169 "The speedstep_centrino module offers voltage scaling" 169 "The acpi-cpufreq module offers voltage scaling"
170 " in addition of frequency scaling. You should use " 170 " in addition of frequency scaling. You should use "
171 "that instead of p4-clockmod, if possible.\n"); 171 "that instead of p4-clockmod, if possible.\n");
172 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PM); 172 switch (c->x86_model) {
173 } 173 case 0x0E: /* Core */
174 174 case 0x0F: /* Core Duo */
175 if ((c->x86 == 0x06) && (c->x86_model == 0x0D)) { 175 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
176 /* Pentium M (Dothan) */ 176 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PCORE);
177 printk(KERN_WARNING PFX "Warning: Pentium M detected. " 177 case 0x0D: /* Pentium M (Dothan) */
178 "The speedstep_centrino module offers voltage scaling" 178 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
179 " in addition of frequency scaling. You should use " 179 /* fall through */
180 "that instead of p4-clockmod, if possible.\n"); 180 case 0x09: /* Pentium M (Banias) */
181 /* on P-4s, the TSC runs with constant frequency independent whether 181 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PM);
182 * throttling is active or not. */ 182 }
183 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
184 return speedstep_get_processor_frequency(SPEEDSTEP_PROCESSOR_PM);
185 } 183 }
186 184
187 if (c->x86 != 0xF) { 185 if (c->x86 != 0xF) {
188 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <linux@brodo.de>\n"); 186 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n");
189 return 0; 187 return 0;
190 } 188 }
191 189
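
The rewritten probe above dispatches on family 6 models, with Dothan (0x0D) deliberately falling through to the Banias (0x09) case after marking the TSC constant. A compilable sketch of just that dispatch shape (returned frequencies are invented):

	#include <stdio.h>

	static unsigned int probe_khz(unsigned int family, unsigned int model)
	{
		if (family == 0x06) {
			switch (model) {
			case 0x0E:		/* Core */
			case 0x0F:		/* Core Duo */
				return 1662000;	/* invented PCORE result */
			case 0x0D:		/* Dothan: set constant-TSC, then */
				/* fall through */
			case 0x09:		/* Banias */
				return 1300000;	/* invented PM result */
			}
		}
		return 0;	/* unknown model: caller warns and bails out */
	}

	int main(void)
	{
		printf("%u kHz\n", probe_khz(0x06, 0x0D));
		return 0;
	}
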
diff --git a/arch/i386/kernel/cpu/cpufreq/sc520_freq.c b/arch/i386/kernel/cpu/cpufreq/sc520_freq.c
index ef457d50f4ac..b8fb4b521c62 100644
--- a/arch/i386/kernel/cpu/cpufreq/sc520_freq.c
+++ b/arch/i386/kernel/cpu/cpufreq/sc520_freq.c
@@ -153,6 +153,7 @@ static struct cpufreq_driver sc520_freq_driver = {
153static int __init sc520_freq_init(void) 153static int __init sc520_freq_init(void)
154{ 154{
155 struct cpuinfo_x86 *c = cpu_data; 155 struct cpuinfo_x86 *c = cpu_data;
156 int err;
156 157
157 /* Test if we have the right hardware */ 158 /* Test if we have the right hardware */
158 if(c->x86_vendor != X86_VENDOR_AMD || 159 if(c->x86_vendor != X86_VENDOR_AMD ||
@@ -166,7 +167,11 @@ static int __init sc520_freq_init(void)
166 return -ENOMEM; 167 return -ENOMEM;
167 } 168 }
168 169
169 return cpufreq_register_driver(&sc520_freq_driver); 170 err = cpufreq_register_driver(&sc520_freq_driver);
171 if (err)
172 iounmap(cpuctl);
173
174 return err;
170} 175}
171 176
172 177
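
The sc520 fix above is the usual unwind-on-failure rule: if cpufreq_register_driver() fails, the earlier ioremap() must be undone or the mapping leaks. The shape of that init path, with trivial stand-ins for the MMIO calls:

	#include <stdio.h>

	static int mapping_live;

	static int map_cpuctl(void)      { mapping_live = 1; return 1; }
	static void unmap_cpuctl(void)   { mapping_live = 0; }
	static int register_driver(void) { return -1; }	/* pretend failure */

	static int init(void)
	{
		int err;

		if (!map_cpuctl())
			return -12;		/* -ENOMEM */

		err = register_driver();
		if (err)
			unmap_cpuctl();		/* undo on failure: no leak */
		return err;
	}

	int main(void)
	{
		printf("init=%d mapping_live=%d\n", init(), mapping_live);
		return 0;
	}
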
diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
index e8993baf3d14..f43b987f952b 100644
--- a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -36,6 +36,7 @@
36 36
37#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 37#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
38 38
39#define INTEL_MSR_RANGE (0xffff)
39 40
40struct cpu_id 41struct cpu_id
41{ 42{
@@ -379,6 +380,7 @@ static int centrino_cpu_early_init_acpi(void)
379} 380}
380 381
381 382
383#ifdef CONFIG_SMP
382/* 384/*
383 * Some BIOSes do SW_ANY coordination internally, either set it up in hw 385 * Some BIOSes do SW_ANY coordination internally, either set it up in hw
384 * or do it in BIOS firmware and won't inform about it to OS. If not 386 * or do it in BIOS firmware and won't inform about it to OS. If not
@@ -392,7 +394,6 @@ static int sw_any_bug_found(struct dmi_system_id *d)
392 return 0; 394 return 0;
393} 395}
394 396
395
396static struct dmi_system_id sw_any_bug_dmi_table[] = { 397static struct dmi_system_id sw_any_bug_dmi_table[] = {
397 { 398 {
398 .callback = sw_any_bug_found, 399 .callback = sw_any_bug_found,
@@ -405,7 +406,7 @@ static struct dmi_system_id sw_any_bug_dmi_table[] = {
405 }, 406 },
406 { } 407 { }
407}; 408};
408 409#endif
409 410
410/* 411/*
411 * centrino_cpu_init_acpi - register with ACPI P-States library 412 * centrino_cpu_init_acpi - register with ACPI P-States library
@@ -463,8 +464,9 @@ static int centrino_cpu_init_acpi(struct cpufreq_policy *policy)
463 } 464 }
464 465
465 for (i=0; i<p->state_count; i++) { 466 for (i=0; i<p->state_count; i++) {
466 if (p->states[i].control != p->states[i].status) { 467 if ((p->states[i].control & INTEL_MSR_RANGE) !=
467 dprintk("Different control (%llu) and status values (%llu)\n", 468 (p->states[i].status & INTEL_MSR_RANGE)) {
469 dprintk("Different MSR bits in control (%llu) and status (%llu)\n",
468 p->states[i].control, p->states[i].status); 470 p->states[i].control, p->states[i].status);
469 result = -EINVAL; 471 result = -EINVAL;
470 goto err_unreg; 472 goto err_unreg;
@@ -500,7 +502,7 @@ static int centrino_cpu_init_acpi(struct cpufreq_policy *policy)
500 } 502 }
501 503
502 for (i=0; i<p->state_count; i++) { 504 for (i=0; i<p->state_count; i++) {
503 centrino_model[cpu]->op_points[i].index = p->states[i].control; 505 centrino_model[cpu]->op_points[i].index = p->states[i].control & INTEL_MSR_RANGE;
504 centrino_model[cpu]->op_points[i].frequency = p->states[i].core_frequency * 1000; 506 centrino_model[cpu]->op_points[i].frequency = p->states[i].core_frequency * 1000;
505 dprintk("adding state %i with frequency %u and control value %04x\n", 507 dprintk("adding state %i with frequency %u and control value %04x\n",
506 i, centrino_model[cpu]->op_points[i].frequency, centrino_model[cpu]->op_points[i].index); 508 i, centrino_model[cpu]->op_points[i].frequency, centrino_model[cpu]->op_points[i].index);
@@ -531,6 +533,9 @@ static int centrino_cpu_init_acpi(struct cpufreq_policy *policy)
531 533
532 /* notify BIOS that we exist */ 534 /* notify BIOS that we exist */
533 acpi_processor_notify_smm(THIS_MODULE); 535 acpi_processor_notify_smm(THIS_MODULE);
536 printk("speedstep-centrino with X86_SPEEDSTEP_CENTRINO_ACPI "
537 "config is deprecated.\n "
538 "Use X86_ACPI_CPUFREQ (acpi-cpufreq) instead.\n" );
534 539
535 return 0; 540 return 0;
536 541
diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-lib.c b/arch/i386/kernel/cpu/cpufreq/speedstep-lib.c
index 4f46cac155c4..d59277c00911 100644
--- a/arch/i386/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/i386/kernel/cpu/cpufreq/speedstep-lib.c
@@ -123,6 +123,36 @@ static unsigned int pentiumM_get_frequency(void)
123 return (msr_tmp * 100 * 1000); 123 return (msr_tmp * 100 * 1000);
124} 124}
125 125
126static unsigned int pentium_core_get_frequency(void)
127{
128 u32 fsb = 0;
129 u32 msr_lo, msr_tmp;
130
131 rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
132 /* see table B-2 of 25366920.pdf */
133 switch (msr_lo & 0x07) {
134 case 5:
135 fsb = 100000;
136 break;
137 case 1:
138 fsb = 133333;
139 break;
140 case 3:
141 fsb = 166667;
142 break;
143 default:
144 printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value");
145 }
146
147 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
148 dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
149
150 msr_tmp = (msr_lo >> 22) & 0x1f;
151 dprintk("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * fsb));
152
153 return (msr_tmp * fsb);
154}
155
126 156
127static unsigned int pentium4_get_frequency(void) 157static unsigned int pentium4_get_frequency(void)
128{ 158{
@@ -174,6 +204,8 @@ static unsigned int pentium4_get_frequency(void)
174unsigned int speedstep_get_processor_frequency(unsigned int processor) 204unsigned int speedstep_get_processor_frequency(unsigned int processor)
175{ 205{
176 switch (processor) { 206 switch (processor) {
207 case SPEEDSTEP_PROCESSOR_PCORE:
208 return pentium_core_get_frequency();
177 case SPEEDSTEP_PROCESSOR_PM: 209 case SPEEDSTEP_PROCESSOR_PM:
178 return pentiumM_get_frequency(); 210 return pentiumM_get_frequency();
179 case SPEEDSTEP_PROCESSOR_P4D: 211 case SPEEDSTEP_PROCESSOR_P4D:
diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-lib.h b/arch/i386/kernel/cpu/cpufreq/speedstep-lib.h
index b735429c50b4..b11bcc608cac 100644
--- a/arch/i386/kernel/cpu/cpufreq/speedstep-lib.h
+++ b/arch/i386/kernel/cpu/cpufreq/speedstep-lib.h
@@ -22,6 +22,7 @@
22 * the speedstep_get_processor_frequency() call. */ 22 * the speedstep_get_processor_frequency() call. */
23#define SPEEDSTEP_PROCESSOR_PM 0xFFFFFF03 /* Pentium M */ 23#define SPEEDSTEP_PROCESSOR_PM 0xFFFFFF03 /* Pentium M */
24#define SPEEDSTEP_PROCESSOR_P4D 0xFFFFFF04 /* desktop P4 */ 24#define SPEEDSTEP_PROCESSOR_P4D 0xFFFFFF04 /* desktop P4 */
25#define SPEEDSTEP_PROCESSOR_PCORE 0xFFFFFF05 /* Core */
25 26
26/* speedstep states -- only two of them */ 27/* speedstep states -- only two of them */
27 28
diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c b/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c
index c28333d53646..ff0d89806114 100644
--- a/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/i386/kernel/cpu/cpufreq/speedstep-smi.c
@@ -360,9 +360,6 @@ static int __init speedstep_init(void)
360 case SPEEDSTEP_PROCESSOR_PIII_C: 360 case SPEEDSTEP_PROCESSOR_PIII_C:
361 case SPEEDSTEP_PROCESSOR_PIII_C_EARLY: 361 case SPEEDSTEP_PROCESSOR_PIII_C_EARLY:
362 break; 362 break;
363 case SPEEDSTEP_PROCESSOR_P4M:
364 printk(KERN_INFO "speedstep-smi: you're trying to use this cpufreq driver on a Pentium 4-based CPU. Most likely it will not work.\n");
365 break;
366 default: 363 default:
367 speedstep_processor = 0; 364 speedstep_processor = 0;
368 } 365 }
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index c0c3b59de32c..abcff92f994c 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -173,7 +173,7 @@ static void __cpuinit geode_configure(void)
173 ccr4 = getCx86(CX86_CCR4); 173 ccr4 = getCx86(CX86_CCR4);
174 ccr4 |= 0x38; /* FPU fast, DTE cache, Mem bypass */ 174 ccr4 |= 0x38; /* FPU fast, DTE cache, Mem bypass */
175 175
176 setCx86(CX86_CCR3, ccr3); 176 setCx86(CX86_CCR4, ccr4);
177 177
178 set_cx86_memwb(); 178 set_cx86_memwb();
179 set_cx86_reorder(); 179 set_cx86_reorder();
diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 94a95aa5227e..56fe26584957 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -107,7 +107,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
107 * Note that the workaround only should be initialized once... 107 * Note that the workaround only should be initialized once...
108 */ 108 */
109 c->f00f_bug = 0; 109 c->f00f_bug = 0;
110 if ( c->x86 == 5 ) { 110 if (!paravirt_enabled() && c->x86 == 5) {
111 static int f00f_workaround_enabled = 0; 111 static int f00f_workaround_enabled = 0;
112 112
113 c->f00f_bug = 1; 113 c->f00f_bug = 1;
@@ -195,8 +195,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
195 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 195 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
196 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 196 (c->x86 == 0x6 && c->x86_model >= 0x0e))
197 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); 197 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
198}
199 198
199 if (cpu_has_ds) {
200 unsigned int l1;
201 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
202 if (!(l1 & (1<<11)))
203 set_bit(X86_FEATURE_BTS, c->x86_capability);
204 if (!(l1 & (1<<12)))
205 set_bit(X86_FEATURE_PEBS, c->x86_capability);
206 }
207}
200 208
201static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) 209static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
202{ 210{
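
The new probe above reads MSR_IA32_MISC_ENABLE and treats bits 11 and 12 as "BTS unavailable" and "PEBS unavailable": each feature flag is set only when its bit reads zero. A small decode with an assumed MSR low word:

	#include <stdio.h>
	#include <stdint.h>

	#define BTS_UNAVAIL	(1u << 11)
	#define PEBS_UNAVAIL	(1u << 12)

	int main(void)
	{
		uint32_t misc_lo = 1u << 11;	/* assumed rdmsr low word */

		printf("BTS:  %s\n", (misc_lo & BTS_UNAVAIL) ? "no" : "yes");
		printf("PEBS: %s\n", (misc_lo & PEBS_UNAVAIL) ? "no" : "yes");
		return 0;
	}
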
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index 5c43be47587f..80b4c5d421b1 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -480,12 +480,10 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
480 if (num_cache_leaves == 0) 480 if (num_cache_leaves == 0)
481 return -ENOENT; 481 return -ENOENT;
482 482
483 cpuid4_info[cpu] = kmalloc( 483 cpuid4_info[cpu] = kzalloc(
484 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); 484 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
485 if (unlikely(cpuid4_info[cpu] == NULL)) 485 if (unlikely(cpuid4_info[cpu] == NULL))
486 return -ENOMEM; 486 return -ENOMEM;
487 memset(cpuid4_info[cpu], 0,
488 sizeof(struct _cpuid4_info) * num_cache_leaves);
489 487
490 oldmask = current->cpus_allowed; 488 oldmask = current->cpus_allowed;
491 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu)); 489 retval = set_cpus_allowed(current, cpumask_of_cpu(cpu));
@@ -658,17 +656,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
658 return -ENOENT; 656 return -ENOENT;
659 657
660 /* Allocate all required memory */ 658 /* Allocate all required memory */
661 cache_kobject[cpu] = kmalloc(sizeof(struct kobject), GFP_KERNEL); 659 cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL);
662 if (unlikely(cache_kobject[cpu] == NULL)) 660 if (unlikely(cache_kobject[cpu] == NULL))
663 goto err_out; 661 goto err_out;
664 memset(cache_kobject[cpu], 0, sizeof(struct kobject));
665 662
666 index_kobject[cpu] = kmalloc( 663 index_kobject[cpu] = kzalloc(
667 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL); 664 sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL);
668 if (unlikely(index_kobject[cpu] == NULL)) 665 if (unlikely(index_kobject[cpu] == NULL))
669 goto err_out; 666 goto err_out;
670 memset(index_kobject[cpu], 0,
671 sizeof(struct _index_kobject) * num_cache_leaves);
672 667
673 return 0; 668 return 0;
674 669
diff --git a/arch/i386/kernel/cpu/mcheck/non-fatal.c b/arch/i386/kernel/cpu/mcheck/non-fatal.c
index 1f9153ae5b03..6b5d3518a1c0 100644
--- a/arch/i386/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/i386/kernel/cpu/mcheck/non-fatal.c
@@ -51,10 +51,10 @@ static void mce_checkregs (void *info)
51 } 51 }
52} 52}
53 53
54static void mce_work_fn(void *data); 54static void mce_work_fn(struct work_struct *work);
55static DECLARE_WORK(mce_work, mce_work_fn, NULL); 55static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
56 56
57static void mce_work_fn(void *data) 57static void mce_work_fn(struct work_struct *work)
58{ 58{
59 on_each_cpu(mce_checkregs, NULL, 1, 1); 59 on_each_cpu(mce_checkregs, NULL, 1, 1);
60 schedule_delayed_work(&mce_work, MCE_RATE); 60 schedule_delayed_work(&mce_work, MCE_RATE);
diff --git a/arch/i386/kernel/cpu/mcheck/therm_throt.c b/arch/i386/kernel/cpu/mcheck/therm_throt.c
index 2d8703b7ce65..065005c3f168 100644
--- a/arch/i386/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/i386/kernel/cpu/mcheck/therm_throt.c
@@ -20,6 +20,7 @@
20#include <linux/cpu.h> 20#include <linux/cpu.h>
21#include <asm/cpu.h> 21#include <asm/cpu.h>
22#include <linux/notifier.h> 22#include <linux/notifier.h>
23#include <linux/jiffies.h>
23#include <asm/therm_throt.h> 24#include <asm/therm_throt.h>
24 25
25/* How long to wait between reporting thermal events */ 26/* How long to wait between reporting thermal events */
@@ -115,7 +116,6 @@ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
115 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); 116 return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group);
116} 117}
117 118
118#ifdef CONFIG_HOTPLUG_CPU
119static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) 119static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
120{ 120{
121 return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); 121 return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
@@ -152,7 +152,6 @@ static struct notifier_block thermal_throttle_cpu_notifier =
152{ 152{
153 .notifier_call = thermal_throttle_cpu_callback, 153 .notifier_call = thermal_throttle_cpu_callback,
154}; 154};
155#endif /* CONFIG_HOTPLUG_CPU */
156 155
157static __init int thermal_throttle_init_device(void) 156static __init int thermal_throttle_init_device(void)
158{ 157{
diff --git a/arch/i386/kernel/cpu/mtrr/Makefile b/arch/i386/kernel/cpu/mtrr/Makefile
index a25b701ab84e..191fc0533649 100644
--- a/arch/i386/kernel/cpu/mtrr/Makefile
+++ b/arch/i386/kernel/cpu/mtrr/Makefile
@@ -1,5 +1,3 @@
1obj-y := main.o if.o generic.o state.o 1obj-y := main.o if.o generic.o state.o
2obj-y += amd.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3obj-y += cyrix.o
4obj-y += centaur.o
5 3
diff --git a/arch/i386/kernel/cpu/mtrr/amd.c b/arch/i386/kernel/cpu/mtrr/amd.c
index 1a1e04b6fd00..0949cdbf848a 100644
--- a/arch/i386/kernel/cpu/mtrr/amd.c
+++ b/arch/i386/kernel/cpu/mtrr/amd.c
@@ -7,7 +7,7 @@
7 7
8static void 8static void
9amd_get_mtrr(unsigned int reg, unsigned long *base, 9amd_get_mtrr(unsigned int reg, unsigned long *base,
10 unsigned int *size, mtrr_type * type) 10 unsigned long *size, mtrr_type * type)
11{ 11{
12 unsigned long low, high; 12 unsigned long low, high;
13 13
diff --git a/arch/i386/kernel/cpu/mtrr/centaur.c b/arch/i386/kernel/cpu/mtrr/centaur.c
index 33f00ac314ef..cb9aa3a7a7ab 100644
--- a/arch/i386/kernel/cpu/mtrr/centaur.c
+++ b/arch/i386/kernel/cpu/mtrr/centaur.c
@@ -17,7 +17,7 @@ static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */
17 */ 17 */
18 18
19static int 19static int
20centaur_get_free_region(unsigned long base, unsigned long size) 20centaur_get_free_region(unsigned long base, unsigned long size, int replace_reg)
21/* [SUMMARY] Get a free MTRR. 21/* [SUMMARY] Get a free MTRR.
22 <base> The starting (base) address of the region. 22 <base> The starting (base) address of the region.
23 <size> The size (in bytes) of the region. 23 <size> The size (in bytes) of the region.
@@ -26,10 +26,11 @@ centaur_get_free_region(unsigned long base, unsigned long size)
26{ 26{
27 int i, max; 27 int i, max;
28 mtrr_type ltype; 28 mtrr_type ltype;
29 unsigned long lbase; 29 unsigned long lbase, lsize;
30 unsigned int lsize;
31 30
32 max = num_var_ranges; 31 max = num_var_ranges;
32 if (replace_reg >= 0 && replace_reg < max)
33 return replace_reg;
33 for (i = 0; i < max; ++i) { 34 for (i = 0; i < max; ++i) {
34 if (centaur_mcr_reserved & (1 << i)) 35 if (centaur_mcr_reserved & (1 << i))
35 continue; 36 continue;
@@ -49,7 +50,7 @@ mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
49 50
50static void 51static void
51centaur_get_mcr(unsigned int reg, unsigned long *base, 52centaur_get_mcr(unsigned int reg, unsigned long *base,
52 unsigned int *size, mtrr_type * type) 53 unsigned long *size, mtrr_type * type)
53{ 54{
54 *base = centaur_mcr[reg].high >> PAGE_SHIFT; 55 *base = centaur_mcr[reg].high >> PAGE_SHIFT;
55 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; 56 *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT;
diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c
index 9027a987006b..0737a596db43 100644
--- a/arch/i386/kernel/cpu/mtrr/cyrix.c
+++ b/arch/i386/kernel/cpu/mtrr/cyrix.c
@@ -9,7 +9,7 @@ int arr3_protected;
9 9
10static void 10static void
11cyrix_get_arr(unsigned int reg, unsigned long *base, 11cyrix_get_arr(unsigned int reg, unsigned long *base,
12 unsigned int *size, mtrr_type * type) 12 unsigned long *size, mtrr_type * type)
13{ 13{
14 unsigned long flags; 14 unsigned long flags;
15 unsigned char arr, ccr3, rcr, shift; 15 unsigned char arr, ccr3, rcr, shift;
@@ -77,7 +77,7 @@ cyrix_get_arr(unsigned int reg, unsigned long *base,
77} 77}
78 78
79static int 79static int
80cyrix_get_free_region(unsigned long base, unsigned long size) 80cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg)
81/* [SUMMARY] Get a free ARR. 81/* [SUMMARY] Get a free ARR.
82 <base> The starting (base) address of the region. 82 <base> The starting (base) address of the region.
83 <size> The size (in bytes) of the region. 83 <size> The size (in bytes) of the region.
@@ -86,9 +86,24 @@ cyrix_get_free_region(unsigned long base, unsigned long size)
86{ 86{
87 int i; 87 int i;
88 mtrr_type ltype; 88 mtrr_type ltype;
89 unsigned long lbase; 89 unsigned long lbase, lsize;
90 unsigned int lsize;
91 90
91 switch (replace_reg) {
92 case 7:
93 if (size < 0x40)
94 break;
95 case 6:
96 case 5:
97 case 4:
98 return replace_reg;
99 case 3:
100 if (arr3_protected)
101 break;
102 case 2:
103 case 1:
104 case 0:
105 return replace_reg;
106 }
92 /* If we are to set up a region >32M then look at ARR7 immediately */ 107 /* If we are to set up a region >32M then look at ARR7 immediately */
93 if (size > 0x2000) { 108 if (size > 0x2000) {
94 cyrix_get_arr(7, &lbase, &lsize, &ltype); 109 cyrix_get_arr(7, &lbase, &lsize, &ltype);
@@ -214,7 +229,7 @@ static void cyrix_set_arr(unsigned int reg, unsigned long base,
214 229
215typedef struct { 230typedef struct {
216 unsigned long base; 231 unsigned long base;
217 unsigned int size; 232 unsigned long size;
218 mtrr_type type; 233 mtrr_type type;
219} arr_state_t; 234} arr_state_t;
220 235
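
The fall-through switch added to cyrix_get_free_region() packs the reuse rules into a few lines: ARR7 may only take over a region of at least 0x40 pages, ARR3 is off limits while arr3_protected is set, and ARR0-2 and ARR4-6 can always be reused. A minimal userspace restatement of that predicate (function name and bool convention are illustrative, not from the patch):

#include <stdbool.h>

/* mirrors the replace_reg switch above; size is in 4 kB pages */
static bool cyrix_may_replace(int replace_reg, unsigned long size,
                              bool arr3_protected)
{
        switch (replace_reg) {
        case 7:
                return size >= 0x40;    /* ARR7 only for larger regions */
        case 6:
        case 5:
        case 4:
                return true;
        case 3:
                return !arr3_protected; /* ARR3 may be reserved */
        case 2:
        case 1:
        case 0:
                return true;
        default:
                return false;           /* fall back to the free search */
        }
}
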
diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
index 0b61eed8bbd8..f77fc53db654 100644
--- a/arch/i386/kernel/cpu/mtrr/generic.c
+++ b/arch/i386/kernel/cpu/mtrr/generic.c
@@ -3,6 +3,7 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/module.h>
6#include <asm/io.h> 7#include <asm/io.h>
7#include <asm/mtrr.h> 8#include <asm/mtrr.h>
8#include <asm/msr.h> 9#include <asm/msr.h>
@@ -15,12 +16,19 @@ struct mtrr_state {
15 struct mtrr_var_range *var_ranges; 16 struct mtrr_var_range *var_ranges;
16 mtrr_type fixed_ranges[NUM_FIXED_RANGES]; 17 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
17 unsigned char enabled; 18 unsigned char enabled;
19 unsigned char have_fixed;
18 mtrr_type def_type; 20 mtrr_type def_type;
19}; 21};
20 22
21static unsigned long smp_changes_mask; 23static unsigned long smp_changes_mask;
22static struct mtrr_state mtrr_state = {}; 24static struct mtrr_state mtrr_state = {};
23 25
26#undef MODULE_PARAM_PREFIX
27#define MODULE_PARAM_PREFIX "mtrr."
28
29static __initdata int mtrr_show;
30module_param_named(show, mtrr_show, bool, 0);
31
24/* Get the MSR pair relating to a var range */ 32/* Get the MSR pair relating to a var range */
25static void __init 33static void __init
26get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) 34get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
@@ -43,6 +51,14 @@ get_fixed_ranges(mtrr_type * frs)
43 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); 51 rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
44} 52}
45 53
54static void __init print_fixed(unsigned base, unsigned step, const mtrr_type*types)
55{
56 unsigned i;
57
58 for (i = 0; i < 8; ++i, ++types, base += step)
59 printk(KERN_INFO "MTRR %05X-%05X %s\n", base, base + step - 1, mtrr_attrib_to_str(*types));
60}
61
46/* Grab all of the MTRR state for this CPU into *state */ 62/* Grab all of the MTRR state for this CPU into *state */
47void __init get_mtrr_state(void) 63void __init get_mtrr_state(void)
48{ 64{
@@ -58,13 +74,49 @@ void __init get_mtrr_state(void)
58 } 74 }
59 vrs = mtrr_state.var_ranges; 75 vrs = mtrr_state.var_ranges;
60 76
77 rdmsr(MTRRcap_MSR, lo, dummy);
78 mtrr_state.have_fixed = (lo >> 8) & 1;
79
61 for (i = 0; i < num_var_ranges; i++) 80 for (i = 0; i < num_var_ranges; i++)
62 get_mtrr_var_range(i, &vrs[i]); 81 get_mtrr_var_range(i, &vrs[i]);
63 get_fixed_ranges(mtrr_state.fixed_ranges); 82 if (mtrr_state.have_fixed)
83 get_fixed_ranges(mtrr_state.fixed_ranges);
64 84
65 rdmsr(MTRRdefType_MSR, lo, dummy); 85 rdmsr(MTRRdefType_MSR, lo, dummy);
66 mtrr_state.def_type = (lo & 0xff); 86 mtrr_state.def_type = (lo & 0xff);
67 mtrr_state.enabled = (lo & 0xc00) >> 10; 87 mtrr_state.enabled = (lo & 0xc00) >> 10;
88
89 if (mtrr_show) {
90 int high_width;
91
92 printk(KERN_INFO "MTRR default type: %s\n", mtrr_attrib_to_str(mtrr_state.def_type));
93 if (mtrr_state.have_fixed) {
94 printk(KERN_INFO "MTRR fixed ranges %sabled:\n",
95 mtrr_state.enabled & 1 ? "en" : "dis");
96 print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
97 for (i = 0; i < 2; ++i)
98 print_fixed(0x80000 + i * 0x20000, 0x04000, mtrr_state.fixed_ranges + (i + 1) * 8);
99 for (i = 0; i < 8; ++i)
100 print_fixed(0xC0000 + i * 0x08000, 0x01000, mtrr_state.fixed_ranges + (i + 3) * 8);
101 }
102 printk(KERN_INFO "MTRR variable ranges %sabled:\n",
103 mtrr_state.enabled & 2 ? "en" : "dis");
104 high_width = ((size_or_mask ? ffs(size_or_mask) - 1 : 32) - (32 - PAGE_SHIFT) + 3) / 4;
105 for (i = 0; i < num_var_ranges; ++i) {
106 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
107 printk(KERN_INFO "MTRR %u base %0*X%05X000 mask %0*X%05X000 %s\n",
108 i,
109 high_width,
110 mtrr_state.var_ranges[i].base_hi,
111 mtrr_state.var_ranges[i].base_lo >> 12,
112 high_width,
113 mtrr_state.var_ranges[i].mask_hi,
114 mtrr_state.var_ranges[i].mask_lo >> 12,
115 mtrr_attrib_to_str(mtrr_state.var_ranges[i].base_lo & 0xff));
116 else
117 printk(KERN_INFO "MTRR %u disabled\n", i);
118 }
119 }
68} 120}
69 121
70/* Some BIOSes are buggy and don't set all MTRRs the same! */ 122/* Some BIOSes are buggy and don't set all MTRRs the same! */
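
Because MODULE_PARAM_PREFIX is redefined to "mtrr." above, the new show flag is reachable from the kernel command line as mtrr.show=1, which makes get_mtrr_state() dump the default type plus all fixed and variable ranges at boot. A self-contained sketch of how one printed variable range decodes, assuming the usual IA-32 layout (valid bit in bit 11 of the mask, memory type in the low byte of the base; names are illustrative):

#include <stdint.h>
#include <stdio.h>

struct var_range { uint32_t base_lo, base_hi, mask_lo, mask_hi; };

static void show_range(unsigned int i, const struct var_range *vr)
{
        if (!(vr->mask_lo & (1u << 11))) {      /* valid bit clear */
                printf("MTRR %u disabled\n", i);
                return;
        }
        /* same 4 kB formatting as the mtrr_show printk above */
        printf("MTRR %u base %X%05X000 mask %X%05X000 type %u\n",
               i, vr->base_hi, vr->base_lo >> 12,
               vr->mask_hi, vr->mask_lo >> 12, vr->base_lo & 0xff);
}

int main(void)
{
        struct var_range vr = { 0x06, 0x0, 0xf0000800, 0xf };
        show_range(0, &vr);     /* enabled write-back range at 0 */
        return 0;
}
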
@@ -95,7 +147,7 @@ void mtrr_wrmsr(unsigned msr, unsigned a, unsigned b)
95 smp_processor_id(), msr, a, b); 147 smp_processor_id(), msr, a, b);
96} 148}
97 149
98int generic_get_free_region(unsigned long base, unsigned long size) 150int generic_get_free_region(unsigned long base, unsigned long size, int replace_reg)
99/* [SUMMARY] Get a free MTRR. 151/* [SUMMARY] Get a free MTRR.
100 <base> The starting (base) address of the region. 152 <base> The starting (base) address of the region.
101 <size> The size (in bytes) of the region. 153 <size> The size (in bytes) of the region.
@@ -104,10 +156,11 @@ int generic_get_free_region(unsigned long base, unsigned long size)
104{ 156{
105 int i, max; 157 int i, max;
106 mtrr_type ltype; 158 mtrr_type ltype;
107 unsigned long lbase; 159 unsigned long lbase, lsize;
108 unsigned lsize;
109 160
110 max = num_var_ranges; 161 max = num_var_ranges;
162 if (replace_reg >= 0 && replace_reg < max)
163 return replace_reg;
111 for (i = 0; i < max; ++i) { 164 for (i = 0; i < max; ++i) {
112 mtrr_if->get(i, &lbase, &lsize, &ltype); 165 mtrr_if->get(i, &lbase, &lsize, &ltype);
113 if (lsize == 0) 166 if (lsize == 0)
@@ -117,7 +170,7 @@ int generic_get_free_region(unsigned long base, unsigned long size)
117} 170}
118 171
119static void generic_get_mtrr(unsigned int reg, unsigned long *base, 172static void generic_get_mtrr(unsigned int reg, unsigned long *base,
120 unsigned int *size, mtrr_type * type) 173 unsigned long *size, mtrr_type *type)
121{ 174{
122 unsigned int mask_lo, mask_hi, base_lo, base_hi; 175 unsigned int mask_lo, mask_hi, base_lo, base_hi;
123 176
@@ -202,7 +255,9 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr)
202 return changed; 255 return changed;
203} 256}
204 257
205static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi) 258static u32 deftype_lo, deftype_hi;
259
260static unsigned long set_mtrr_state(void)
206/* [SUMMARY] Set the MTRR state for this CPU. 261/* [SUMMARY] Set the MTRR state for this CPU.
207 <state> The MTRR state information to read. 262 <state> The MTRR state information to read.
208 <ctxt> Some relevant CPU context. 263 <ctxt> Some relevant CPU context.
@@ -217,14 +272,14 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
217 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) 272 if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i]))
218 change_mask |= MTRR_CHANGE_MASK_VARIABLE; 273 change_mask |= MTRR_CHANGE_MASK_VARIABLE;
219 274
220 if (set_fixed_ranges(mtrr_state.fixed_ranges)) 275 if (mtrr_state.have_fixed && set_fixed_ranges(mtrr_state.fixed_ranges))
221 change_mask |= MTRR_CHANGE_MASK_FIXED; 276 change_mask |= MTRR_CHANGE_MASK_FIXED;
222 277
223 /* Set_mtrr_restore restores the old value of MTRRdefType, 278 /* Set_mtrr_restore restores the old value of MTRRdefType,
224 so to set it we fiddle with the saved value */ 279 so to set it we fiddle with the saved value */
225 if ((deftype_lo & 0xff) != mtrr_state.def_type 280 if ((deftype_lo & 0xff) != mtrr_state.def_type
226 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { 281 || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) {
227 deftype_lo |= (mtrr_state.def_type | mtrr_state.enabled << 10); 282 deftype_lo = (deftype_lo & ~0xcff) | mtrr_state.def_type | (mtrr_state.enabled << 10);
228 change_mask |= MTRR_CHANGE_MASK_DEFTYPE; 283 change_mask |= MTRR_CHANGE_MASK_DEFTYPE;
229 } 284 }
230 285
@@ -233,7 +288,6 @@ static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi)
233 288
234 289
235static unsigned long cr4 = 0; 290static unsigned long cr4 = 0;
236static u32 deftype_lo, deftype_hi;
237static DEFINE_SPINLOCK(set_atomicity_lock); 291static DEFINE_SPINLOCK(set_atomicity_lock);
238 292
239/* 293/*
@@ -271,7 +325,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
271 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); 325 rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
272 326
273 /* Disable MTRRs, and set the default type to uncached */ 327 /* Disable MTRRs, and set the default type to uncached */
274 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi); 328 mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
275} 329}
276 330
277static void post_set(void) __releases(set_atomicity_lock) 331static void post_set(void) __releases(set_atomicity_lock)
@@ -300,7 +354,7 @@ static void generic_set_all(void)
300 prepare_set(); 354 prepare_set();
301 355
302 /* Actually set the state */ 356 /* Actually set the state */
303 mask = set_mtrr_state(deftype_lo,deftype_hi); 357 mask = set_mtrr_state();
304 358
305 post_set(); 359 post_set();
306 local_irq_restore(flags); 360 local_irq_restore(flags);
@@ -366,7 +420,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
366 printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); 420 printk(KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base);
367 return -EINVAL; 421 return -EINVAL;
368 } 422 }
369 if (!(base + size < 0x70000000 || base > 0x7003FFFF) && 423 if (!(base + size < 0x70000 || base > 0x7003F) &&
370 (type == MTRR_TYPE_WRCOMB 424 (type == MTRR_TYPE_WRCOMB
371 || type == MTRR_TYPE_WRBACK)) { 425 || type == MTRR_TYPE_WRBACK)) {
372 printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); 426 printk(KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n");
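
The 0xcff mask that now appears in both set_mtrr_state() and prepare_set() covers the two architectural fields in the low word of MTRRdefType: bits 0-7 hold the default memory type and bits 10-11 the fixed-range and global enables. Clearing those fields before OR-ing in the new values is the real fix; the old code only OR-ed, so stale type bits could survive, and the old 0xf300UL mask in prepare_set() also zeroed the reserved bits above bit 15. A sketch with the fields named (constant names are ours):

#include <stdint.h>

#define MTRR_DEF_TYPE_BITS   0x0ffu  /* bits 0-7: default memory type */
#define MTRR_DEF_ENABLE_BITS 0xc00u  /* bit 10: fixed, bit 11: global enable */

static uint32_t update_deftype_lo(uint32_t deftype_lo,
                                  uint8_t def_type, uint8_t enabled)
{
        /* clear both fields first, then insert the new values */
        deftype_lo &= ~(MTRR_DEF_TYPE_BITS | MTRR_DEF_ENABLE_BITS);
        return deftype_lo | def_type | ((uint32_t)enabled << 10);
}
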
diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
index 5ac051bb9d55..5ae1705eafa6 100644
--- a/arch/i386/kernel/cpu/mtrr/if.c
+++ b/arch/i386/kernel/cpu/mtrr/if.c
@@ -17,7 +17,7 @@ extern unsigned int *usage_table;
17 17
18#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) 18#define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
19 19
20static char *mtrr_strings[MTRR_NUM_TYPES] = 20static const char *const mtrr_strings[MTRR_NUM_TYPES] =
21{ 21{
22 "uncachable", /* 0 */ 22 "uncachable", /* 0 */
23 "write-combining", /* 1 */ 23 "write-combining", /* 1 */
@@ -28,7 +28,7 @@ static char *mtrr_strings[MTRR_NUM_TYPES] =
28 "write-back", /* 6 */ 28 "write-back", /* 6 */
29}; 29};
30 30
31char *mtrr_attrib_to_str(int x) 31const char *mtrr_attrib_to_str(int x)
32{ 32{
33 return (x <= 6) ? mtrr_strings[x] : "?"; 33 return (x <= 6) ? mtrr_strings[x] : "?";
34} 34}
@@ -44,10 +44,9 @@ mtrr_file_add(unsigned long base, unsigned long size,
44 44
45 max = num_var_ranges; 45 max = num_var_ranges;
46 if (fcount == NULL) { 46 if (fcount == NULL) {
47 fcount = kmalloc(max * sizeof *fcount, GFP_KERNEL); 47 fcount = kzalloc(max * sizeof *fcount, GFP_KERNEL);
48 if (!fcount) 48 if (!fcount)
49 return -ENOMEM; 49 return -ENOMEM;
50 memset(fcount, 0, max * sizeof *fcount);
51 FILE_FCOUNT(file) = fcount; 50 FILE_FCOUNT(file) = fcount;
52 } 51 }
53 if (!page) { 52 if (!page) {
@@ -155,6 +154,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
155{ 154{
156 int err = 0; 155 int err = 0;
157 mtrr_type type; 156 mtrr_type type;
157 unsigned long size;
158 struct mtrr_sentry sentry; 158 struct mtrr_sentry sentry;
159 struct mtrr_gentry gentry; 159 struct mtrr_gentry gentry;
160 void __user *arg = (void __user *) __arg; 160 void __user *arg = (void __user *) __arg;
@@ -235,15 +235,15 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
235 case MTRRIOC_GET_ENTRY: 235 case MTRRIOC_GET_ENTRY:
236 if (gentry.regnum >= num_var_ranges) 236 if (gentry.regnum >= num_var_ranges)
237 return -EINVAL; 237 return -EINVAL;
238 mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); 238 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
239 239
240 /* Hide entries that go above 4GB */ 240 /* Hide entries that go above 4GB */
241 if (gentry.base + gentry.size > 0x100000 241 if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
242 || gentry.size == 0x100000) 242 || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
243 gentry.base = gentry.size = gentry.type = 0; 243 gentry.base = gentry.size = gentry.type = 0;
244 else { 244 else {
245 gentry.base <<= PAGE_SHIFT; 245 gentry.base <<= PAGE_SHIFT;
246 gentry.size <<= PAGE_SHIFT; 246 gentry.size = size << PAGE_SHIFT;
247 gentry.type = type; 247 gentry.type = type;
248 } 248 }
249 249
@@ -273,8 +273,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
273 case MTRRIOC_GET_PAGE_ENTRY: 273 case MTRRIOC_GET_PAGE_ENTRY:
274 if (gentry.regnum >= num_var_ranges) 274 if (gentry.regnum >= num_var_ranges)
275 return -EINVAL; 275 return -EINVAL;
276 mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); 276 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
277 gentry.type = type; 277 /* Hide entries that would overflow */
278 if (size != (__typeof__(gentry.size))size)
279 gentry.base = gentry.size = gentry.type = 0;
280 else {
281 gentry.size = size;
282 gentry.type = type;
283 }
278 break; 284 break;
279 } 285 }
280 286
@@ -353,8 +359,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
353 char factor; 359 char factor;
354 int i, max, len; 360 int i, max, len;
355 mtrr_type type; 361 mtrr_type type;
356 unsigned long base; 362 unsigned long base, size;
357 unsigned int size;
358 363
359 len = 0; 364 len = 0;
360 max = num_var_ranges; 365 max = num_var_ranges;
@@ -373,7 +378,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
373 } 378 }
374 /* RED-PEN: base can be > 32bit */ 379 /* RED-PEN: base can be > 32bit */
375 len += seq_printf(seq, 380 len += seq_printf(seq,
376 "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n", 381 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
377 i, base, base >> (20 - PAGE_SHIFT), size, factor, 382 i, base, base >> (20 - PAGE_SHIFT), size, factor,
378 mtrr_attrib_to_str(type), usage_table[i]); 383 mtrr_attrib_to_str(type), usage_table[i]);
379 } 384 }
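
Both GET_ENTRY paths now read the size into a full unsigned long and copy it to userspace only when it fits the 32-bit field; the bound 1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT) is the first page count whose byte count no longer fits. The arithmetic, checked in isolation (PAGE_SHIFT hard-coded to the i386 value):

#include <stdio.h>

#define PAGE_SHIFT 12                   /* 4 kB pages on i386 */

int main(void)
{
        unsigned int gentry_size;       /* 32-bit field in the ioctl ABI */
        unsigned long limit = 1UL << (8 * sizeof(gentry_size) - PAGE_SHIFT);

        /* 0x100000 pages * 4 kB = 4 GB, where a 32-bit byte count
         * wraps; such entries are now reported as empty instead */
        printf("first hidden size: 0x%lx pages\n", limit);
        return 0;
}
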
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index fff90bda4733..16bb7ea87145 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -59,7 +59,11 @@ struct mtrr_ops * mtrr_if = NULL;
59static void set_mtrr(unsigned int reg, unsigned long base, 59static void set_mtrr(unsigned int reg, unsigned long base,
60 unsigned long size, mtrr_type type); 60 unsigned long size, mtrr_type type);
61 61
62#ifndef CONFIG_X86_64
62extern int arr3_protected; 63extern int arr3_protected;
64#else
65#define arr3_protected 0
66#endif
63 67
64void set_mtrr_ops(struct mtrr_ops * ops) 68void set_mtrr_ops(struct mtrr_ops * ops)
65{ 69{
@@ -168,6 +172,13 @@ static void ipi_handler(void *info)
168 172
169#endif 173#endif
170 174
175static inline int types_compatible(mtrr_type type1, mtrr_type type2) {
176 return type1 == MTRR_TYPE_UNCACHABLE ||
177 type2 == MTRR_TYPE_UNCACHABLE ||
178 (type1 == MTRR_TYPE_WRTHROUGH && type2 == MTRR_TYPE_WRBACK) ||
179 (type1 == MTRR_TYPE_WRBACK && type2 == MTRR_TYPE_WRTHROUGH);
180}
181
171/** 182/**
172 * set_mtrr - update mtrrs on all processors 183 * set_mtrr - update mtrrs on all processors
173 * @reg: mtrr in question 184 * @reg: mtrr in question
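
types_compatible() generalizes the old special case, which only let MTRR_TYPE_UNCACHABLE overlap anything: write-through and write-back may now overlap in either order, since the effective type of such an overlap is the stricter write-through. The same predicate restated as a userspace test, with the type values as defined in asm/mtrr.h:

#include <assert.h>

enum {                          /* values as in asm/mtrr.h */
        MTRR_TYPE_UNCACHABLE = 0,
        MTRR_TYPE_WRCOMB     = 1,
        MTRR_TYPE_WRTHROUGH  = 4,
        MTRR_TYPE_WRBACK     = 6
};

static int compatible(int t1, int t2)
{
        return t1 == MTRR_TYPE_UNCACHABLE || t2 == MTRR_TYPE_UNCACHABLE ||
               (t1 == MTRR_TYPE_WRTHROUGH && t2 == MTRR_TYPE_WRBACK) ||
               (t1 == MTRR_TYPE_WRBACK && t2 == MTRR_TYPE_WRTHROUGH);
}

int main(void)
{
        assert(compatible(MTRR_TYPE_UNCACHABLE, MTRR_TYPE_WRBACK));
        assert(compatible(MTRR_TYPE_WRTHROUGH, MTRR_TYPE_WRBACK));
        assert(!compatible(MTRR_TYPE_WRCOMB, MTRR_TYPE_WRBACK));
        return 0;
}
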
@@ -263,8 +274,8 @@ static void set_mtrr(unsigned int reg, unsigned long base,
263 274
264/** 275/**
265 * mtrr_add_page - Add a memory type region 276 * mtrr_add_page - Add a memory type region
266 * @base: Physical base address of region in pages (4 KB) 277 * @base: Physical base address of region in pages (in units of 4 kB!)
267 * @size: Physical size of region in pages (4 KB) 278 * @size: Physical size of region in pages (4 kB)
268 * @type: Type of MTRR desired 279 * @type: Type of MTRR desired
269 * @increment: If this is true do usage counting on the region 280 * @increment: If this is true do usage counting on the region
270 * 281 *
@@ -300,11 +311,9 @@ static void set_mtrr(unsigned int reg, unsigned long base,
300int mtrr_add_page(unsigned long base, unsigned long size, 311int mtrr_add_page(unsigned long base, unsigned long size,
301 unsigned int type, char increment) 312 unsigned int type, char increment)
302{ 313{
303 int i; 314 int i, replace, error;
304 mtrr_type ltype; 315 mtrr_type ltype;
305 unsigned long lbase; 316 unsigned long lbase, lsize;
306 unsigned int lsize;
307 int error;
308 317
309 if (!mtrr_if) 318 if (!mtrr_if)
310 return -ENXIO; 319 return -ENXIO;
@@ -324,12 +333,18 @@ int mtrr_add_page(unsigned long base, unsigned long size,
324 return -ENOSYS; 333 return -ENOSYS;
325 } 334 }
326 335
336 if (!size) {
337 printk(KERN_WARNING "mtrr: zero sized request\n");
338 return -EINVAL;
339 }
340
327 if (base & size_or_mask || size & size_or_mask) { 341 if (base & size_or_mask || size & size_or_mask) {
328 printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n"); 342 printk(KERN_WARNING "mtrr: base or size exceeds the MTRR width\n");
329 return -EINVAL; 343 return -EINVAL;
330 } 344 }
331 345
332 error = -EINVAL; 346 error = -EINVAL;
347 replace = -1;
333 348
334 /* No CPU hotplug when we change MTRR entries */ 349 /* No CPU hotplug when we change MTRR entries */
335 lock_cpu_hotplug(); 350 lock_cpu_hotplug();
@@ -337,21 +352,28 @@ int mtrr_add_page(unsigned long base, unsigned long size,
337 mutex_lock(&mtrr_mutex); 352 mutex_lock(&mtrr_mutex);
338 for (i = 0; i < num_var_ranges; ++i) { 353 for (i = 0; i < num_var_ranges; ++i) {
339 mtrr_if->get(i, &lbase, &lsize, &ltype); 354 mtrr_if->get(i, &lbase, &lsize, &ltype);
340 if (base >= lbase + lsize) 355 if (!lsize || base > lbase + lsize - 1 || base + size - 1 < lbase)
341 continue;
342 if ((base < lbase) && (base + size <= lbase))
343 continue; 356 continue;
344 /* At this point we know there is some kind of overlap/enclosure */ 357 /* At this point we know there is some kind of overlap/enclosure */
345 if ((base < lbase) || (base + size > lbase + lsize)) { 358 if (base < lbase || base + size - 1 > lbase + lsize - 1) {
359 if (base <= lbase && base + size - 1 >= lbase + lsize - 1) {
360 /* New region encloses an existing region */
361 if (type == ltype) {
362 replace = replace == -1 ? i : -2;
363 continue;
364 }
365 else if (types_compatible(type, ltype))
366 continue;
367 }
346 printk(KERN_WARNING 368 printk(KERN_WARNING
347 "mtrr: 0x%lx000,0x%lx000 overlaps existing" 369 "mtrr: 0x%lx000,0x%lx000 overlaps existing"
348 " 0x%lx000,0x%x000\n", base, size, lbase, 370 " 0x%lx000,0x%lx000\n", base, size, lbase,
349 lsize); 371 lsize);
350 goto out; 372 goto out;
351 } 373 }
352 /* New region is enclosed by an existing region */ 374 /* New region is enclosed by an existing region */
353 if (ltype != type) { 375 if (ltype != type) {
354 if (type == MTRR_TYPE_UNCACHABLE) 376 if (types_compatible(type, ltype))
355 continue; 377 continue;
356 printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", 378 printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
357 base, size, mtrr_attrib_to_str(ltype), 379 base, size, mtrr_attrib_to_str(ltype),
@@ -364,10 +386,18 @@ int mtrr_add_page(unsigned long base, unsigned long size,
364 goto out; 386 goto out;
365 } 387 }
366 /* Search for an empty MTRR */ 388 /* Search for an empty MTRR */
367 i = mtrr_if->get_free_region(base, size); 389 i = mtrr_if->get_free_region(base, size, replace);
368 if (i >= 0) { 390 if (i >= 0) {
369 set_mtrr(i, base, size, type); 391 set_mtrr(i, base, size, type);
370 usage_table[i] = 1; 392 if (likely(replace < 0))
393 usage_table[i] = 1;
394 else {
395 usage_table[i] = usage_table[replace] + !!increment;
396 if (unlikely(replace != i)) {
397 set_mtrr(replace, 0, 0, 0);
398 usage_table[replace] = 0;
399 }
400 }
371 } else 401 } else
372 printk(KERN_INFO "mtrr: no more MTRRs available\n"); 402 printk(KERN_INFO "mtrr: no more MTRRs available\n");
373 error = i; 403 error = i;
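
The rewritten overlap tests in mtrr_add_page() compare inclusive end addresses (base + size - 1), which stays correct even for a region ending exactly at the top of the address space, where the old lbase + lsize form wraps to zero. The two tests in isolation (helper names are ours; everything is in page units, as in the kernel code):

#include <stdbool.h>

static bool overlaps(unsigned long base, unsigned long size,
                     unsigned long lbase, unsigned long lsize)
{
        return lsize != 0 &&
               base <= lbase + lsize - 1 && base + size - 1 >= lbase;
}

static bool encloses(unsigned long base, unsigned long size,
                     unsigned long lbase, unsigned long lsize)
{
        /* the "replace" case: the new region fully covers the old one */
        return base <= lbase && base + size - 1 >= lbase + lsize - 1;
}
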
@@ -455,8 +485,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
455{ 485{
456 int i, max; 486 int i, max;
457 mtrr_type ltype; 487 mtrr_type ltype;
458 unsigned long lbase; 488 unsigned long lbase, lsize;
459 unsigned int lsize;
460 int error = -EINVAL; 489 int error = -EINVAL;
461 490
462 if (!mtrr_if) 491 if (!mtrr_if)
@@ -544,9 +573,11 @@ extern void centaur_init_mtrr(void);
544 573
545static void __init init_ifs(void) 574static void __init init_ifs(void)
546{ 575{
576#ifndef CONFIG_X86_64
547 amd_init_mtrr(); 577 amd_init_mtrr();
548 cyrix_init_mtrr(); 578 cyrix_init_mtrr();
549 centaur_init_mtrr(); 579 centaur_init_mtrr();
580#endif
550} 581}
551 582
552/* The suspend/resume methods are only for CPU without MTRR. CPU using generic 583/* The suspend/resume methods are only for CPU without MTRR. CPU using generic
@@ -555,7 +586,7 @@ static void __init init_ifs(void)
555struct mtrr_value { 586struct mtrr_value {
556 mtrr_type ltype; 587 mtrr_type ltype;
557 unsigned long lbase; 588 unsigned long lbase;
558 unsigned int lsize; 589 unsigned long lsize;
559}; 590};
560 591
561static struct mtrr_value * mtrr_state; 592static struct mtrr_value * mtrr_state;
@@ -565,10 +596,8 @@ static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
565 int i; 596 int i;
566 int size = num_var_ranges * sizeof(struct mtrr_value); 597 int size = num_var_ranges * sizeof(struct mtrr_value);
567 598
568 mtrr_state = kmalloc(size,GFP_ATOMIC); 599 mtrr_state = kzalloc(size,GFP_ATOMIC);
569 if (mtrr_state) 600 if (!mtrr_state)
570 memset(mtrr_state,0,size);
571 else
572 return -ENOMEM; 601 return -ENOMEM;
573 602
574 for (i = 0; i < num_var_ranges; i++) { 603 for (i = 0; i < num_var_ranges; i++) {
diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
index 99c9f2682041..d61ea9db6cfe 100644
--- a/arch/i386/kernel/cpu/mtrr/mtrr.h
+++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -43,15 +43,16 @@ struct mtrr_ops {
43 void (*set_all)(void); 43 void (*set_all)(void);
44 44
45 void (*get)(unsigned int reg, unsigned long *base, 45 void (*get)(unsigned int reg, unsigned long *base,
46 unsigned int *size, mtrr_type * type); 46 unsigned long *size, mtrr_type * type);
47 int (*get_free_region) (unsigned long base, unsigned long size); 47 int (*get_free_region)(unsigned long base, unsigned long size,
48 48 int replace_reg);
49 int (*validate_add_page)(unsigned long base, unsigned long size, 49 int (*validate_add_page)(unsigned long base, unsigned long size,
50 unsigned int type); 50 unsigned int type);
51 int (*have_wrcomb)(void); 51 int (*have_wrcomb)(void);
52}; 52};
53 53
54extern int generic_get_free_region(unsigned long base, unsigned long size); 54extern int generic_get_free_region(unsigned long base, unsigned long size,
55 int replace_reg);
55extern int generic_validate_add_page(unsigned long base, unsigned long size, 56extern int generic_validate_add_page(unsigned long base, unsigned long size,
56 unsigned int type); 57 unsigned int type);
57 58
@@ -62,17 +63,17 @@ extern int positive_have_wrcomb(void);
62/* library functions for processor-specific routines */ 63/* library functions for processor-specific routines */
63struct set_mtrr_context { 64struct set_mtrr_context {
64 unsigned long flags; 65 unsigned long flags;
65 unsigned long deftype_lo;
66 unsigned long deftype_hi;
67 unsigned long cr4val; 66 unsigned long cr4val;
68 unsigned long ccr3; 67 u32 deftype_lo;
68 u32 deftype_hi;
69 u32 ccr3;
69}; 70};
70 71
71struct mtrr_var_range { 72struct mtrr_var_range {
72 unsigned long base_lo; 73 u32 base_lo;
73 unsigned long base_hi; 74 u32 base_hi;
74 unsigned long mask_lo; 75 u32 mask_lo;
75 unsigned long mask_hi; 76 u32 mask_hi;
76}; 77};
77 78
78void set_mtrr_done(struct set_mtrr_context *ctxt); 79void set_mtrr_done(struct set_mtrr_context *ctxt);
@@ -92,6 +93,6 @@ extern struct mtrr_ops * mtrr_if;
92extern unsigned int num_var_ranges; 93extern unsigned int num_var_ranges;
93 94
94void mtrr_state_warn(void); 95void mtrr_state_warn(void);
95char *mtrr_attrib_to_str(int x); 96const char *mtrr_attrib_to_str(int x);
96void mtrr_wrmsr(unsigned, unsigned, unsigned); 97void mtrr_wrmsr(unsigned, unsigned, unsigned);
97 98
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 76aac088a323..6624d8583c42 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -152,9 +152,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
152 seq_printf(m, " [%d]", i); 152 seq_printf(m, " [%d]", i);
153 } 153 }
154 154
155 seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n", 155 seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
156 c->loops_per_jiffy/(500000/HZ), 156 c->loops_per_jiffy/(500000/HZ),
157 (c->loops_per_jiffy/(5000/HZ)) % 100); 157 (c->loops_per_jiffy/(5000/HZ)) % 100);
158 seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size);
158 159
159 return 0; 160 return 0;
160} 161}
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index fde8bea85cee..51130b39cd2e 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -34,7 +34,6 @@
34#include <linux/major.h> 34#include <linux/major.h>
35#include <linux/fs.h> 35#include <linux/fs.h>
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/fs.h>
38#include <linux/device.h> 37#include <linux/device.h>
39#include <linux/cpu.h> 38#include <linux/cpu.h>
40#include <linux/notifier.h> 39#include <linux/notifier.h>
@@ -117,7 +116,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
117 char __user *tmp = buf; 116 char __user *tmp = buf;
118 u32 data[4]; 117 u32 data[4];
119 u32 reg = *ppos; 118 u32 reg = *ppos;
120 int cpu = iminor(file->f_dentry->d_inode); 119 int cpu = iminor(file->f_path.dentry->d_inode);
121 120
122 if (count % 16) 121 if (count % 16)
123 return -EINVAL; /* Invalid chunk size */ 122 return -EINVAL; /* Invalid chunk size */
@@ -135,7 +134,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
135 134
136static int cpuid_open(struct inode *inode, struct file *file) 135static int cpuid_open(struct inode *inode, struct file *file)
137{ 136{
138 unsigned int cpu = iminor(file->f_dentry->d_inode); 137 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
139 struct cpuinfo_x86 *c = &(cpu_data)[cpu]; 138 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
140 139
141 if (cpu >= NR_CPUS || !cpu_online(cpu)) 140 if (cpu >= NR_CPUS || !cpu_online(cpu))
@@ -156,28 +155,27 @@ static struct file_operations cpuid_fops = {
156 .open = cpuid_open, 155 .open = cpuid_open,
157}; 156};
158 157
159static int cpuid_class_device_create(int i) 158static int cpuid_device_create(int i)
160{ 159{
161 int err = 0; 160 int err = 0;
162 struct class_device *class_err; 161 struct device *dev;
163 162
164 class_err = class_device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), NULL, "cpu%d",i); 163 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), "cpu%d",i);
165 if (IS_ERR(class_err)) 164 if (IS_ERR(dev))
166 err = PTR_ERR(class_err); 165 err = PTR_ERR(dev);
167 return err; 166 return err;
168} 167}
169 168
170#ifdef CONFIG_HOTPLUG_CPU
171static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 169static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
172{ 170{
173 unsigned int cpu = (unsigned long)hcpu; 171 unsigned int cpu = (unsigned long)hcpu;
174 172
175 switch (action) { 173 switch (action) {
176 case CPU_ONLINE: 174 case CPU_ONLINE:
177 cpuid_class_device_create(cpu); 175 cpuid_device_create(cpu);
178 break; 176 break;
179 case CPU_DEAD: 177 case CPU_DEAD:
180 class_device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); 178 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
181 break; 179 break;
182 } 180 }
183 return NOTIFY_OK; 181 return NOTIFY_OK;
@@ -187,7 +185,6 @@ static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier =
187{ 185{
188 .notifier_call = cpuid_class_cpu_callback, 186 .notifier_call = cpuid_class_cpu_callback,
189}; 187};
190#endif /* !CONFIG_HOTPLUG_CPU */
191 188
192static int __init cpuid_init(void) 189static int __init cpuid_init(void)
193{ 190{
@@ -206,7 +203,7 @@ static int __init cpuid_init(void)
206 goto out_chrdev; 203 goto out_chrdev;
207 } 204 }
208 for_each_online_cpu(i) { 205 for_each_online_cpu(i) {
209 err = cpuid_class_device_create(i); 206 err = cpuid_device_create(i);
210 if (err != 0) 207 if (err != 0)
211 goto out_class; 208 goto out_class;
212 } 209 }
@@ -218,7 +215,7 @@ static int __init cpuid_init(void)
218out_class: 215out_class:
219 i = 0; 216 i = 0;
220 for_each_online_cpu(i) { 217 for_each_online_cpu(i) {
221 class_device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i)); 218 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i));
222 } 219 }
223 class_destroy(cpuid_class); 220 class_destroy(cpuid_class);
224out_chrdev: 221out_chrdev:
@@ -232,7 +229,7 @@ static void __exit cpuid_exit(void)
232 int cpu = 0; 229 int cpu = 0;
233 230
234 for_each_online_cpu(cpu) 231 for_each_online_cpu(cpu)
235 class_device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); 232 device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
236 class_destroy(cpuid_class); 233 class_destroy(cpuid_class);
237 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 234 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid");
238 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); 235 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
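
The cpuid driver moves from the class_device interface to plain struct device, with device_create()/device_destroy() replacing class_device_create()/class_device_destroy(), and the hotplug notifier is now built unconditionally. The shape of the new call as the API looked at this point, sketched with a placeholder class ("example_class" stands in for cpuid_class):

#include <linux/device.h>
#include <linux/err.h>
#include <linux/major.h>

static struct class *example_class;

static int example_create(unsigned int cpu)
{
        struct device *dev;

        /* no parent device; the format string names the node */
        dev = device_create(example_class, NULL,
                            MKDEV(CPUID_MAJOR, cpu), "cpu%d", cpu);
        return IS_ERR(dev) ? PTR_ERR(dev) : 0;
}
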
diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 144b43288965..a5e0e990ea95 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -31,68 +31,6 @@
31/* This keeps track of which cpu is crashing. */ 31
32static int crashing_cpu; 32static int crashing_cpu;
33 33
34static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
35 size_t data_len)
36{
37 struct elf_note note;
38
39 note.n_namesz = strlen(name) + 1;
40 note.n_descsz = data_len;
41 note.n_type = type;
42 memcpy(buf, &note, sizeof(note));
43 buf += (sizeof(note) +3)/4;
44 memcpy(buf, name, note.n_namesz);
45 buf += (note.n_namesz + 3)/4;
46 memcpy(buf, data, note.n_descsz);
47 buf += (note.n_descsz + 3)/4;
48
49 return buf;
50}
51
52static void final_note(u32 *buf)
53{
54 struct elf_note note;
55
56 note.n_namesz = 0;
57 note.n_descsz = 0;
58 note.n_type = 0;
59 memcpy(buf, &note, sizeof(note));
60}
61
62static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
63{
64 struct elf_prstatus prstatus;
65 u32 *buf;
66
67 if ((cpu < 0) || (cpu >= NR_CPUS))
68 return;
69
70 /* Using ELF notes here is opportunistic.
71 * I need a well defined structure format
72 * for the data I pass, and I need tags
73 * on the data to indicate what information I have
74 * squirrelled away. ELF notes happen to provide
75 * all of that, so there is no need to invent something new.
76 */
77 buf = (u32*)per_cpu_ptr(crash_notes, cpu);
78 if (!buf)
79 return;
80 memset(&prstatus, 0, sizeof(prstatus));
81 prstatus.pr_pid = current->pid;
82 elf_core_copy_regs(&prstatus.pr_reg, regs);
83 buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
84 sizeof(prstatus));
85 final_note(buf);
86}
87
88static void crash_save_self(struct pt_regs *regs)
89{
90 int cpu;
91
92 cpu = safe_smp_processor_id();
93 crash_save_this_cpu(regs, cpu);
94}
95
96#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 34#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
97static atomic_t waiting_for_crash_ipi; 35static atomic_t waiting_for_crash_ipi;
98 36
@@ -121,7 +59,7 @@ static int crash_nmi_callback(struct notifier_block *self,
121 crash_fixup_ss_esp(&fixed_regs, regs); 59 crash_fixup_ss_esp(&fixed_regs, regs);
122 regs = &fixed_regs; 60 regs = &fixed_regs;
123 } 61 }
124 crash_save_this_cpu(regs, cpu); 62 crash_save_cpu(regs, cpu);
125 disable_local_APIC(); 63 disable_local_APIC();
126 atomic_dec(&waiting_for_crash_ipi); 64 atomic_dec(&waiting_for_crash_ipi);
127 /* Assume hlt works */ 65 /* Assume hlt works */
@@ -195,5 +133,5 @@ void machine_crash_shutdown(struct pt_regs *regs)
195#if defined(CONFIG_X86_IO_APIC) 133#if defined(CONFIG_X86_IO_APIC)
196 disable_IO_APIC(); 134 disable_IO_APIC();
197#endif 135#endif
198 crash_save_self(regs); 136 crash_save_cpu(regs, safe_smp_processor_id());
199} 137}
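
The note helpers deleted here were duplicated per architecture and now sit behind the generic crash_save_cpu(). For reference, the packing they performed: a 12-byte note header, then the NUL-terminated name, then the descriptor, each advanced in 4-byte units. A userspace restatement of the removed append_elf_note() (types simplified):

#include <stdint.h>
#include <string.h>

struct note_hdr { uint32_t n_namesz, n_descsz, n_type; };

static uint32_t *append_note(uint32_t *buf, const char *name,
                             uint32_t type, const void *data, size_t len)
{
        struct note_hdr note = {
                .n_namesz = (uint32_t)strlen(name) + 1,
                .n_descsz = (uint32_t)len,
                .n_type   = type,
        };

        memcpy(buf, &note, sizeof(note));
        buf += (sizeof(note) + 3) / 4;  /* 12-byte header */
        memcpy(buf, name, note.n_namesz);
        buf += (note.n_namesz + 3) / 4; /* name, padded to 4 bytes */
        memcpy(buf, data, note.n_descsz);
        buf += (note.n_descsz + 3) / 4; /* payload, padded to 4 bytes */
        return buf;
}
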
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
new file mode 100644
index 000000000000..f391abcf7da9
--- /dev/null
+++ b/arch/i386/kernel/e820.c
@@ -0,0 +1,894 @@
1#include <linux/kernel.h>
2#include <linux/types.h>
3#include <linux/init.h>
4#include <linux/bootmem.h>
5#include <linux/ioport.h>
6#include <linux/string.h>
7#include <linux/kexec.h>
8#include <linux/module.h>
9#include <linux/mm.h>
10#include <linux/efi.h>
11#include <linux/pfn.h>
12#include <linux/uaccess.h>
13
14#include <asm/pgtable.h>
15#include <asm/page.h>
16#include <asm/e820.h>
17
18#ifdef CONFIG_EFI
19int efi_enabled = 0;
20EXPORT_SYMBOL(efi_enabled);
21#endif
22
23struct e820map e820;
24struct change_member {
25 struct e820entry *pbios; /* pointer to original bios entry */
26 unsigned long long addr; /* address for this change point */
27};
28static struct change_member change_point_list[2*E820MAX] __initdata;
29static struct change_member *change_point[2*E820MAX] __initdata;
30static struct e820entry *overlap_list[E820MAX] __initdata;
31static struct e820entry new_bios[E820MAX] __initdata;
32/* For PCI or other memory-mapped resources */
33unsigned long pci_mem_start = 0x10000000;
34#ifdef CONFIG_PCI
35EXPORT_SYMBOL(pci_mem_start);
36#endif
37extern int user_defined_memmap;
38struct resource data_resource = {
39 .name = "Kernel data",
40 .start = 0,
41 .end = 0,
42 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
43};
44
45struct resource code_resource = {
46 .name = "Kernel code",
47 .start = 0,
48 .end = 0,
49 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
50};
51
52static struct resource system_rom_resource = {
53 .name = "System ROM",
54 .start = 0xf0000,
55 .end = 0xfffff,
56 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
57};
58
59static struct resource extension_rom_resource = {
60 .name = "Extension ROM",
61 .start = 0xe0000,
62 .end = 0xeffff,
63 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
64};
65
66static struct resource adapter_rom_resources[] = { {
67 .name = "Adapter ROM",
68 .start = 0xc8000,
69 .end = 0,
70 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
71}, {
72 .name = "Adapter ROM",
73 .start = 0,
74 .end = 0,
75 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
76}, {
77 .name = "Adapter ROM",
78 .start = 0,
79 .end = 0,
80 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
81}, {
82 .name = "Adapter ROM",
83 .start = 0,
84 .end = 0,
85 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
86}, {
87 .name = "Adapter ROM",
88 .start = 0,
89 .end = 0,
90 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
91}, {
92 .name = "Adapter ROM",
93 .start = 0,
94 .end = 0,
95 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
96} };
97
98static struct resource video_rom_resource = {
99 .name = "Video ROM",
100 .start = 0xc0000,
101 .end = 0xc7fff,
102 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
103};
104
105static struct resource video_ram_resource = {
106 .name = "Video RAM area",
107 .start = 0xa0000,
108 .end = 0xbffff,
109 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
110};
111
112static struct resource standard_io_resources[] = { {
113 .name = "dma1",
114 .start = 0x0000,
115 .end = 0x001f,
116 .flags = IORESOURCE_BUSY | IORESOURCE_IO
117}, {
118 .name = "pic1",
119 .start = 0x0020,
120 .end = 0x0021,
121 .flags = IORESOURCE_BUSY | IORESOURCE_IO
122}, {
123 .name = "timer0",
124 .start = 0x0040,
125 .end = 0x0043,
126 .flags = IORESOURCE_BUSY | IORESOURCE_IO
127}, {
128 .name = "timer1",
129 .start = 0x0050,
130 .end = 0x0053,
131 .flags = IORESOURCE_BUSY | IORESOURCE_IO
132}, {
133 .name = "keyboard",
134 .start = 0x0060,
135 .end = 0x006f,
136 .flags = IORESOURCE_BUSY | IORESOURCE_IO
137}, {
138 .name = "dma page reg",
139 .start = 0x0080,
140 .end = 0x008f,
141 .flags = IORESOURCE_BUSY | IORESOURCE_IO
142}, {
143 .name = "pic2",
144 .start = 0x00a0,
145 .end = 0x00a1,
146 .flags = IORESOURCE_BUSY | IORESOURCE_IO
147}, {
148 .name = "dma2",
149 .start = 0x00c0,
150 .end = 0x00df,
151 .flags = IORESOURCE_BUSY | IORESOURCE_IO
152}, {
153 .name = "fpu",
154 .start = 0x00f0,
155 .end = 0x00ff,
156 .flags = IORESOURCE_BUSY | IORESOURCE_IO
157} };
158
159static int romsignature(const unsigned char *x)
160{
161 unsigned short sig;
162 int ret = 0;
163 if (probe_kernel_address((const unsigned short *)x, sig) == 0)
164 ret = (sig == 0xaa55);
165 return ret;
166}
167
168static int __init romchecksum(unsigned char *rom, unsigned long length)
169{
170 unsigned char *p, sum = 0;
171
172 for (p = rom; p < rom + length; p++)
173 sum += *p;
174 return sum == 0;
175}
176
177static void __init probe_roms(void)
178{
179 unsigned long start, length, upper;
180 unsigned char *rom;
181 int i;
182
183 /* video rom */
184 upper = adapter_rom_resources[0].start;
185 for (start = video_rom_resource.start; start < upper; start += 2048) {
186 rom = isa_bus_to_virt(start);
187 if (!romsignature(rom))
188 continue;
189
190 video_rom_resource.start = start;
191
192 /* 0 < length <= 0x7f * 512, historically */
193 length = rom[2] * 512;
194
195 /* if checksum okay, trust length byte */
196 if (length && romchecksum(rom, length))
197 video_rom_resource.end = start + length - 1;
198
199 request_resource(&iomem_resource, &video_rom_resource);
200 break;
201 }
202
203 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
204 if (start < upper)
205 start = upper;
206
207 /* system rom */
208 request_resource(&iomem_resource, &system_rom_resource);
209 upper = system_rom_resource.start;
210
211 /* check for extension rom (ignore length byte!) */
212 rom = isa_bus_to_virt(extension_rom_resource.start);
213 if (romsignature(rom)) {
214 length = extension_rom_resource.end - extension_rom_resource.start + 1;
215 if (romchecksum(rom, length)) {
216 request_resource(&iomem_resource, &extension_rom_resource);
217 upper = extension_rom_resource.start;
218 }
219 }
220
221 /* check for adapter roms on 2k boundaries */
222 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
223 rom = isa_bus_to_virt(start);
224 if (!romsignature(rom))
225 continue;
226
227 /* 0 < length <= 0x7f * 512, historically */
228 length = rom[2] * 512;
229
230 /* but accept any length that fits if checksum okay */
231 if (!length || start + length > upper || !romchecksum(rom, length))
232 continue;
233
234 adapter_rom_resources[i].start = start;
235 adapter_rom_resources[i].end = start + length - 1;
236 request_resource(&iomem_resource, &adapter_rom_resources[i]);
237
238 start = adapter_rom_resources[i++].end & ~2047UL;
239 }
240}
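
probe_roms() trusts two conventions of legacy option ROMs: a 0xAA55 signature word at the start, and a length byte at offset 2 counted in 512-byte blocks, accepted only when the whole image sums to zero modulo 256. A compact userspace model of that validation (assuming the standard BIOS header layout):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool rom_valid(const uint8_t *rom, size_t avail)
{
        size_t length, i;
        uint8_t sum = 0;

        if (avail < 3 || rom[0] != 0x55 || rom[1] != 0xaa)
                return false;           /* no 0xAA55 signature */
        length = rom[2] * 512;          /* length byte, 512-byte units */
        if (length == 0 || length > avail)
                return false;
        for (i = 0; i < length; i++)
                sum += rom[i];          /* checksum covers the image */
        return sum == 0;
}
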
241
242/*
243 * Request address space for all standard RAM and ROM resources
244 * and also for regions reported as reserved by the e820.
245 */
246static void __init
247legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
248{
249 int i;
250
251 probe_roms();
252 for (i = 0; i < e820.nr_map; i++) {
253 struct resource *res;
254#ifndef CONFIG_RESOURCES_64BIT
255 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
256 continue;
257#endif
258 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
259 switch (e820.map[i].type) {
260 case E820_RAM: res->name = "System RAM"; break;
261 case E820_ACPI: res->name = "ACPI Tables"; break;
262 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
263 default: res->name = "reserved";
264 }
265 res->start = e820.map[i].addr;
266 res->end = res->start + e820.map[i].size - 1;
267 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
268 if (request_resource(&iomem_resource, res)) {
269 kfree(res);
270 continue;
271 }
272 if (e820.map[i].type == E820_RAM) {
273 /*
274 * We don't know which RAM region contains kernel data,
275 * so we try it repeatedly and let the resource manager
276 * test it.
277 */
278 request_resource(res, code_resource);
279 request_resource(res, data_resource);
280#ifdef CONFIG_KEXEC
281 request_resource(res, &crashk_res);
282#endif
283 }
284 }
285}
286
287/*
288 * Request address space for all standard resources
289 *
290 * This is called just before pcibios_init(), which is also a
291 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
292 */
293static int __init request_standard_resources(void)
294{
295 int i;
296
297 printk("Setting up standard PCI resources\n");
298 if (efi_enabled)
299 efi_initialize_iomem_resources(&code_resource, &data_resource);
300 else
301 legacy_init_iomem_resources(&code_resource, &data_resource);
302
303 /* EFI systems may still have VGA */
304 request_resource(&iomem_resource, &video_ram_resource);
305
306 /* request I/O space for devices used on all i[345]86 PCs */
307 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
308 request_resource(&ioport_resource, &standard_io_resources[i]);
309 return 0;
310}
311
312subsys_initcall(request_standard_resources);
313
314void __init add_memory_region(unsigned long long start,
315 unsigned long long size, int type)
316{
317 int x;
318
319 if (!efi_enabled) {
320 x = e820.nr_map;
321
322 if (x == E820MAX) {
323 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
324 return;
325 }
326
327 e820.map[x].addr = start;
328 e820.map[x].size = size;
329 e820.map[x].type = type;
330 e820.nr_map++;
331 }
332} /* add_memory_region */
333
334/*
335 * Sanitize the BIOS e820 map.
336 *
337 * Some e820 responses include overlapping entries. The following
338 * replaces the original e820 map with a new one, removing overlaps.
339 *
340 */
341int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
342{
343 struct change_member *change_tmp;
344 unsigned long current_type, last_type;
345 unsigned long long last_addr;
346 int chgidx, still_changing;
347 int overlap_entries;
348 int new_bios_entry;
349 int old_nr, new_nr, chg_nr;
350 int i;
351
352 /*
353 Visually we're performing the following (1,2,3,4 = memory types)...
354
355 Sample memory map (w/overlaps):
356 ____22__________________
357 ______________________4_
358 ____1111________________
359 _44_____________________
360 11111111________________
361 ____________________33__
362 ___________44___________
363 __________33333_________
364 ______________22________
365 ___________________2222_
366 _________111111111______
367 _____________________11_
368 _________________4______
369
370 Sanitized equivalent (no overlap):
371 1_______________________
372 _44_____________________
373 ___1____________________
374 ____22__________________
375 ______11________________
376 _________1______________
377 __________3_____________
378 ___________44___________
379 _____________33_________
380 _______________2________
381 ________________1_______
382 _________________4______
383 ___________________2____
384 ____________________33__
385 ______________________4_
386 */
387 printk("sanitize start\n");
388 /* if there's only one memory region, don't bother */
389 if (*pnr_map < 2) {
390 printk("sanitize bail 0\n");
391 return -1;
392 }
393
394 old_nr = *pnr_map;
395
396 /* bail out if we find any unreasonable addresses in bios map */
397 for (i=0; i<old_nr; i++)
398 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) {
399 printk("sanitize bail 1\n");
400 return -1;
401 }
402
403 /* create pointers for initial change-point information (for sorting) */
404 for (i=0; i < 2*old_nr; i++)
405 change_point[i] = &change_point_list[i];
406
407 /* record all known change-points (starting and ending addresses),
408 omitting those that are for empty memory regions */
409 chgidx = 0;
410 for (i=0; i < old_nr; i++) {
411 if (biosmap[i].size != 0) {
412 change_point[chgidx]->addr = biosmap[i].addr;
413 change_point[chgidx++]->pbios = &biosmap[i];
414 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
415 change_point[chgidx++]->pbios = &biosmap[i];
416 }
417 }
418 chg_nr = chgidx; /* true number of change-points */
419
420 /* sort change-point list by memory addresses (low -> high) */
421 still_changing = 1;
422 while (still_changing) {
423 still_changing = 0;
424 for (i=1; i < chg_nr; i++) {
425 /* if <current_addr> > <last_addr>, swap */
426 /* or, if current=<start_addr> & last=<end_addr>, swap */
427 if ((change_point[i]->addr < change_point[i-1]->addr) ||
428 ((change_point[i]->addr == change_point[i-1]->addr) &&
429 (change_point[i]->addr == change_point[i]->pbios->addr) &&
430 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
431 )
432 {
433 change_tmp = change_point[i];
434 change_point[i] = change_point[i-1];
435 change_point[i-1] = change_tmp;
436 still_changing=1;
437 }
438 }
439 }
440
441 /* create a new bios memory map, removing overlaps */
442 overlap_entries=0; /* number of entries in the overlap table */
443 new_bios_entry=0; /* index for creating new bios map entries */
444 last_type = 0; /* start with undefined memory type */
445 last_addr = 0; /* start with 0 as last starting address */
446 /* loop through change-points, determining effect on the new bios map */
447 for (chgidx=0; chgidx < chg_nr; chgidx++)
448 {
449 /* keep track of all overlapping bios entries */
450 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
451 {
452 /* add map entry to overlap list (> 1 entry implies an overlap) */
453 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
454 }
455 else
456 {
457 /* remove entry from list (order independent, so swap with last) */
458 for (i=0; i<overlap_entries; i++)
459 {
460 if (overlap_list[i] == change_point[chgidx]->pbios)
461 overlap_list[i] = overlap_list[overlap_entries-1];
462 }
463 overlap_entries--;
464 }
465 /* if there are overlapping entries, decide which "type" to use */
466 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
467 current_type = 0;
468 for (i=0; i<overlap_entries; i++)
469 if (overlap_list[i]->type > current_type)
470 current_type = overlap_list[i]->type;
471 /* continue building up new bios map based on this information */
472 if (current_type != last_type) {
473 if (last_type != 0) {
474 new_bios[new_bios_entry].size =
475 change_point[chgidx]->addr - last_addr;
476 /* move forward only if the new size was non-zero */
477 if (new_bios[new_bios_entry].size != 0)
478 if (++new_bios_entry >= E820MAX)
479 break; /* no more space left for new bios entries */
480 }
481 if (current_type != 0) {
482 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
483 new_bios[new_bios_entry].type = current_type;
484 last_addr=change_point[chgidx]->addr;
485 }
486 last_type = current_type;
487 }
488 }
489 new_nr = new_bios_entry; /* retain count for new bios entries */
490
491 /* copy new bios mapping into original location */
492 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
493 *pnr_map = new_nr;
494
495 printk("sanitize end\n");
496 return 0;
497}
498
499/*
500 * Copy the BIOS e820 map into a safe place.
501 *
502 * Sanity-check it while we're at it..
503 *
504 * If we're lucky and live on a modern system, the setup code
505 * will have given us a memory map that we can use to properly
506 * set up memory. If we aren't, we'll fake a memory map.
507 *
508 * We check to see that the memory map contains at least 2 elements
509 * before we'll use it, because the detection code in setup.S may
510 * not be perfect and most every PC known to man has two memory
511 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
512 * thinkpad 560x, for example, does not cooperate with the memory
513 * detection code.)
514 */
515int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
516{
517 /* Only one memory region (or negative)? Ignore it */
518 if (nr_map < 2)
519 return -1;
520
521 do {
522 unsigned long long start = biosmap->addr;
523 unsigned long long size = biosmap->size;
524 unsigned long long end = start + size;
525 unsigned long type = biosmap->type;
526 printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type);
527
528 /* Overflow in 64 bits? Ignore the memory map. */
529 if (start > end)
530 return -1;
531
532 /*
533 * Some BIOSes claim RAM in the 640k - 1M region.
534 * Not right. Fix it up.
535 */
536 if (type == E820_RAM) {
537 printk("copy_e820_map() type is E820_RAM\n");
538 if (start < 0x100000ULL && end > 0xA0000ULL) {
539 printk("copy_e820_map() lies in range...\n");
540 if (start < 0xA0000ULL) {
541 printk("copy_e820_map() start < 0xA0000ULL\n");
542 add_memory_region(start, 0xA0000ULL-start, type);
543 }
544 if (end <= 0x100000ULL) {
545 printk("copy_e820_map() end <= 0x100000ULL\n");
546 continue;
547 }
548 start = 0x100000ULL;
549 size = end - start;
550 }
551 }
552 add_memory_region(start, size, type);
553 } while (biosmap++,--nr_map);
554 return 0;
555}
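
The 640k-1M fixup above splits any RAM entry that claims the legacy hole: the part below 0xA0000 is kept, the hole itself is dropped, and anything above 1 MB is re-added as RAM. For a bogus BIOS entry reporting RAM from 0 to 2 MB, the result looks like this (worked example, not output from the patch):

    in :  RAM  0x00000000 - 0x00200000   (claims the 640k-1M hole)
    out:  RAM  0x00000000 - 0x000A0000   (below the hole)
          RAM  0x00100000 - 0x00200000   (above 1 MB; the hole is gone)
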
556
557/*
558 * Callback for efi_memory_walk.
559 */
560static int __init
561efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
562{
563 unsigned long *max_pfn = arg, pfn;
564
565 if (start < end) {
566 pfn = PFN_UP(end -1);
567 if (pfn > *max_pfn)
568 *max_pfn = pfn;
569 }
570 return 0;
571}
572
573static int __init
574efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
575{
576 memory_present(0, PFN_UP(start), PFN_DOWN(end));
577 return 0;
578}
579
580/*
581 * Find the highest page frame number we have available
582 */
583void __init find_max_pfn(void)
584{
585 int i;
586
587 max_pfn = 0;
588 if (efi_enabled) {
589 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
590 efi_memmap_walk(efi_memory_present_wrapper, NULL);
591 return;
592 }
593
594 for (i = 0; i < e820.nr_map; i++) {
595 unsigned long start, end;
596 /* RAM? */
597 if (e820.map[i].type != E820_RAM)
598 continue;
599 start = PFN_UP(e820.map[i].addr);
600 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
601 if (start >= end)
602 continue;
603 if (end > max_pfn)
604 max_pfn = end;
605 memory_present(0, start, end);
606 }
607}
608
609/*
610 * Free all available memory for boot time allocation. Used
611 * as a callback function by efi_memory_walk()
612 */
613
614static int __init
615free_available_memory(unsigned long start, unsigned long end, void *arg)
616{
617 /* check max_low_pfn */
618 if (start >= (max_low_pfn << PAGE_SHIFT))
619 return 0;
620 if (end >= (max_low_pfn << PAGE_SHIFT))
621 end = max_low_pfn << PAGE_SHIFT;
622 if (start < end)
623 free_bootmem(start, end - start);
624
625 return 0;
626}
627/*
628 * Register fully available low RAM pages with the bootmem allocator.
629 */
630void __init register_bootmem_low_pages(unsigned long max_low_pfn)
631{
632 int i;
633
634 if (efi_enabled) {
635 efi_memmap_walk(free_available_memory, NULL);
636 return;
637 }
638 for (i = 0; i < e820.nr_map; i++) {
639 unsigned long curr_pfn, last_pfn, size;
640 /*
641 * Reserve usable low memory
642 */
643 if (e820.map[i].type != E820_RAM)
644 continue;
645 /*
646 * We are rounding up the start address of usable memory:
647 */
648 curr_pfn = PFN_UP(e820.map[i].addr);
649 if (curr_pfn >= max_low_pfn)
650 continue;
651 /*
652 * ... and at the end of the usable range downwards:
653 */
654 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
655
656 if (last_pfn > max_low_pfn)
657 last_pfn = max_low_pfn;
658
659 /*
660 * .. finally, did all the rounding and playing
661 * around just make the area go away?
662 */
663 if (last_pfn <= curr_pfn)
664 continue;
665
666 size = last_pfn - curr_pfn;
667 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
668 }
669}
670
671void __init e820_register_memory(void)
672{
673 unsigned long gapstart, gapsize, round;
674 unsigned long long last;
675 int i;
676
677 /*
678 * Search for the biggest gap in the low 32 bits of the e820
679 * memory space.
680 */
681 last = 0x100000000ull;
682 gapstart = 0x10000000;
683 gapsize = 0x400000;
684 i = e820.nr_map;
685 while (--i >= 0) {
686 unsigned long long start = e820.map[i].addr;
687 unsigned long long end = start + e820.map[i].size;
688
689 /*
690 * Since "last" is at most 4GB, we know we'll
691 * fit in 32 bits if this condition is true
692 */
693 if (last > end) {
694 unsigned long gap = last - end;
695
696 if (gap > gapsize) {
697 gapsize = gap;
698 gapstart = end;
699 }
700 }
701 if (start < last)
702 last = start;
703 }
704
705 /*
706 * See how much we want to round up: start off with
707 * rounding to the next 1MB area.
708 */
709 round = 0x100000;
710 while ((gapsize >> 4) > round)
711 round += round;
712 /* Fun with two's complement */
713 pci_mem_start = (gapstart + round) & -round;
714
715 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
716 pci_mem_start, gapstart, gapsize);
717}
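
The "fun with two's complement" line computes (gapstart + round) & -round, the smallest multiple of round strictly above gapstart: for a power-of-two round, -round is exactly the mask that clears the low bits. Checked in isolation with made-up values:

#include <stdio.h>

int main(void)
{
        unsigned long gapstart = 0xdff00000ul, round = 0x200000ul;
        unsigned long start = (gapstart + round) & -round;

        /* -round == ~(round - 1), so this rounds up to the next
         * 2 MB boundary past the gap start */
        printf("gap at 0x%lx -> PCI space from 0x%lx\n", gapstart, start);
        return 0;
}
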
718
719void __init print_memory_map(char *who)
720{
721 int i;
722
723 for (i = 0; i < e820.nr_map; i++) {
724 printk(" %s: %016Lx - %016Lx ", who,
725 e820.map[i].addr,
726 e820.map[i].addr + e820.map[i].size);
727 switch (e820.map[i].type) {
728 case E820_RAM: printk("(usable)\n");
729 break;
730 case E820_RESERVED:
731 printk("(reserved)\n");
732 break;
733 case E820_ACPI:
734 printk("(ACPI data)\n");
735 break;
736 case E820_NVS:
737 printk("(ACPI NVS)\n");
738 break;
739 default: printk("type %lu\n", e820.map[i].type);
740 break;
741 }
742 }
743}
744
745static __init __always_inline void efi_limit_regions(unsigned long long size)
746{
747 unsigned long long current_addr = 0;
748 efi_memory_desc_t *md, *next_md;
749 void *p, *p1;
750 int i, j;
751
752 j = 0;
753 p1 = memmap.map;
754 for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
755 md = p;
756 next_md = p1;
757 current_addr = md->phys_addr +
758 PFN_PHYS(md->num_pages);
759 if (is_available_memory(md)) {
760 if (md->phys_addr >= size) continue;
761 memcpy(next_md, md, memmap.desc_size);
762 if (current_addr >= size) {
763 next_md->num_pages -=
764 PFN_UP(current_addr-size);
765 }
766 p1 += memmap.desc_size;
767 next_md = p1;
768 j++;
769 } else if ((md->attribute & EFI_MEMORY_RUNTIME) ==
770 EFI_MEMORY_RUNTIME) {
771 /* In order to make runtime services
772 * available we have to include runtime
773 * memory regions in the memory map */
774 memcpy(next_md, md, memmap.desc_size);
775 p1 += memmap.desc_size;
776 next_md = p1;
777 j++;
778 }
779 }
780 memmap.nr_map = j;
781 memmap.map_end = memmap.map +
782 (memmap.nr_map * memmap.desc_size);
783}
784
785void __init limit_regions(unsigned long long size)
786{
787 unsigned long long current_addr;
788 int i;
789
790 print_memory_map("limit_regions start");
791 if (efi_enabled) {
792 efi_limit_regions(size);
793 return;
794 }
795 for (i = 0; i < e820.nr_map; i++) {
796 current_addr = e820.map[i].addr + e820.map[i].size;
797 if (current_addr < size)
798 continue;
799
800 if (e820.map[i].type != E820_RAM)
801 continue;
802
803 if (e820.map[i].addr >= size) {
804 /*
805 * This region starts past the end of the
806 * requested size, skip it completely.
807 */
808 e820.nr_map = i;
809 } else {
810 e820.nr_map = i + 1;
811 e820.map[i].size -= current_addr - size;
812 }
813 print_memory_map("limit_regions endfor");
814 return;
815 }
816 print_memory_map("limit_regions endfunc");
817}
818
819 /*
820 * This function checks if the entire range <start,end> is mapped with type.
821 *
822 * Note: this function only works correctly if the e820 table is sorted and
823 * non-overlapping, which is the case
824 */
825int __init
826e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
827{
828 u64 start = s;
829 u64 end = e;
830 int i;
831 for (i = 0; i < e820.nr_map; i++) {
832 struct e820entry *ei = &e820.map[i];
833 if (type && ei->type != type)
834 continue;
835 /* does this region (at least partly) overlap the current range? */
836 if (ei->addr >= end || ei->addr + ei->size <= start)
837 continue;
 838		/* if the region covers the beginning of <start,end>, move
 839		 * start to the end of the region, since the range is covered
 840		 * up to there */
841 if (ei->addr <= start)
842 start = ei->addr + ei->size;
843 /* if start is now at or beyond end, we're done, full
844 * coverage */
845 if (start >= end)
846 return 1; /* we're done */
847 }
848 return 0;
849}
850
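An illustrative caller (not from this patch): a driver that wants a fixed MMIO window only if the BIOS reserved it could write the following; the address range is made up for the example.

	if (!e820_all_mapped(0xfec00000UL, 0xfec00000UL + 0x1000, E820_RESERVED))
		printk(KERN_WARNING "MMIO window not reserved in the e820 map\n");

Passing type == 0 skips the type check, so the same call can also verify that a range is described by the map at all.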
851static int __init parse_memmap(char *arg)
852{
853 if (!arg)
854 return -EINVAL;
855
856 if (strcmp(arg, "exactmap") == 0) {
857#ifdef CONFIG_CRASH_DUMP
 858		/* If we are doing a crash dump, we
 859		 * still need to know the real memory
 860		 * size before the original memory map
 861		 * is reset.
 862		 */
863 find_max_pfn();
864 saved_max_pfn = max_pfn;
865#endif
866 e820.nr_map = 0;
867 user_defined_memmap = 1;
868 } else {
 869		/* If the user specifies a memory size, we
870 * limit the BIOS-provided memory map to
871 * that size. exactmap can be used to specify
872 * the exact map. mem=number can be used to
873 * trim the existing memory map.
874 */
875 unsigned long long start_at, mem_size;
876
877 mem_size = memparse(arg, &arg);
878 if (*arg == '@') {
879 start_at = memparse(arg+1, &arg);
880 add_memory_region(start_at, mem_size, E820_RAM);
881 } else if (*arg == '#') {
882 start_at = memparse(arg+1, &arg);
883 add_memory_region(start_at, mem_size, E820_ACPI);
884 } else if (*arg == '$') {
885 start_at = memparse(arg+1, &arg);
886 add_memory_region(start_at, mem_size, E820_RESERVED);
887 } else {
888 limit_regions(mem_size);
889 user_defined_memmap = 1;
890 }
891 }
892 return 0;
893}
894early_param("memmap", parse_memmap);
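The suffix characters parsed above select the e820 type of the user-supplied region; illustrative boot-line uses (all values made up):

	memmap=exactmap		# empty the BIOS map, then rebuild it by hand
	memmap=64M@0x1000000	# add 64MB of E820_RAM at 16MB
	memmap=4M#0x3f000000	# mark 4MB as E820_ACPI data
	memmap=1M$0x40000000	# mark 1MB as E820_RESERVED
	memmap=512M		# no suffix: trim the map to the first 512MB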
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 8b40648d0ef0..b92c7f0a358a 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -194,17 +194,24 @@ inline int efi_set_rtc_mmss(unsigned long nowtime)
194 return 0; 194 return 0;
195} 195}
196/* 196/*
197 * This should only be used during kernel init and before runtime 197 * This is used during kernel init before runtime
198 * services have been remapped, therefore, we'll need to call in physical 198 * services have been remapped, and also during suspend, so
199 * mode. Note, this call isn't used later, so mark it __init. 199 * we need to be able to call it in both physical and virtual mode.
200 */ 200 */
201inline unsigned long __init efi_get_time(void) 201inline unsigned long efi_get_time(void)
202{ 202{
203 efi_status_t status; 203 efi_status_t status;
204 efi_time_t eft; 204 efi_time_t eft;
205 efi_time_cap_t cap; 205 efi_time_cap_t cap;
206 206
207 status = phys_efi_get_time(&eft, &cap); 207 if (efi.get_time) {
208 /* if we are in virtual mode use remapped function */
209 status = efi.get_time(&eft, &cap);
210 } else {
211 /* we are in physical mode */
212 status = phys_efi_get_time(&eft, &cap);
213 }
214
208 if (status != EFI_SUCCESS) 215 if (status != EFI_SUCCESS)
209 printk("Oops: efitime: can't read time status: 0x%lx\n",status); 216 printk("Oops: efitime: can't read time status: 0x%lx\n",status);
210 217
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5a63d6fdb70e..06461b8b715d 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -30,12 +30,13 @@
30 * 18(%esp) - %eax 30 * 18(%esp) - %eax
31 * 1C(%esp) - %ds 31 * 1C(%esp) - %ds
32 * 20(%esp) - %es 32 * 20(%esp) - %es
33 * 24(%esp) - orig_eax 33 * 24(%esp) - %gs
34 * 28(%esp) - %eip 34 * 28(%esp) - orig_eax
35 * 2C(%esp) - %cs 35 * 2C(%esp) - %eip
36 * 30(%esp) - %eflags 36 * 30(%esp) - %cs
37 * 34(%esp) - %oldesp 37 * 34(%esp) - %eflags
38 * 38(%esp) - %oldss 38 * 38(%esp) - %oldesp
39 * 3C(%esp) - %oldss
39 * 40 *
40 * "current" is in register %ebx during any slow entries. 41 * "current" is in register %ebx during any slow entries.
41 */ 42 */
@@ -48,26 +49,24 @@
48#include <asm/smp.h> 49#include <asm/smp.h>
49#include <asm/page.h> 50#include <asm/page.h>
50#include <asm/desc.h> 51#include <asm/desc.h>
52#include <asm/percpu.h>
51#include <asm/dwarf2.h> 53#include <asm/dwarf2.h>
52#include "irq_vectors.h" 54#include "irq_vectors.h"
53 55
54#define nr_syscalls ((syscall_table_size)/4) 56/*
57 * We use macros for low-level operations which need to be overridden
58 * for paravirtualization. The following will never clobber any registers:
59 * INTERRUPT_RETURN (aka. "iret")
60 * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
61 * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
62 *
63 * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
64 * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
65 * Allowing a register to be clobbered can shrink the paravirt replacement
66 * enough to patch inline, increasing performance.
67 */
55 68
56EBX = 0x00 69#define nr_syscalls ((syscall_table_size)/4)
57ECX = 0x04
58EDX = 0x08
59ESI = 0x0C
60EDI = 0x10
61EBP = 0x14
62EAX = 0x18
63DS = 0x1C
64ES = 0x20
65ORIG_EAX = 0x24
66EIP = 0x28
67CS = 0x2C
68EFLAGS = 0x30
69OLDESP = 0x34
70OLDSS = 0x38
71 70
72CF_MASK = 0x00000001 71CF_MASK = 0x00000001
73TF_MASK = 0x00000100 72TF_MASK = 0x00000100
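A sketch of the clobber annotation used above (values assumed to follow the convention in the comment; the header defining them is not part of this hunk): each CLBR_* constant is a bitmask of the registers the patched-in replacement code may destroy. Declaring more clobbers lets a short native sequence be patched inline rather than routed through an indirect call.

	#define CLBR_NONE	0x0
	#define CLBR_EAX	0x1
	#define CLBR_ECX	0x2
	#define CLBR_EDX	0x4
	#define CLBR_ANY	(CLBR_EAX | CLBR_ECX | CLBR_EDX)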
@@ -76,23 +75,16 @@ DF_MASK = 0x00000400
76NT_MASK = 0x00004000 75NT_MASK = 0x00004000
77VM_MASK = 0x00020000 76VM_MASK = 0x00020000
78 77
79/* These are replaces for paravirtualization */
80#define DISABLE_INTERRUPTS cli
81#define ENABLE_INTERRUPTS sti
82#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
83#define INTERRUPT_RETURN iret
84#define GET_CR0_INTO_EAX movl %cr0, %eax
85
86#ifdef CONFIG_PREEMPT 78#ifdef CONFIG_PREEMPT
87#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF 79#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
88#else 80#else
89#define preempt_stop 81#define preempt_stop(clobbers)
90#define resume_kernel restore_nocheck 82#define resume_kernel restore_nocheck
91#endif 83#endif
92 84
93.macro TRACE_IRQS_IRET 85.macro TRACE_IRQS_IRET
94#ifdef CONFIG_TRACE_IRQFLAGS 86#ifdef CONFIG_TRACE_IRQFLAGS
95 testl $IF_MASK,EFLAGS(%esp) # interrupts off? 87 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
96 jz 1f 88 jz 1f
97 TRACE_IRQS_ON 89 TRACE_IRQS_ON
981: 901:
@@ -107,6 +99,9 @@ VM_MASK = 0x00020000
107 99
108#define SAVE_ALL \ 100#define SAVE_ALL \
109 cld; \ 101 cld; \
102 pushl %gs; \
103 CFI_ADJUST_CFA_OFFSET 4;\
104 /*CFI_REL_OFFSET gs, 0;*/\
110 pushl %es; \ 105 pushl %es; \
111 CFI_ADJUST_CFA_OFFSET 4;\ 106 CFI_ADJUST_CFA_OFFSET 4;\
112 /*CFI_REL_OFFSET es, 0;*/\ 107 /*CFI_REL_OFFSET es, 0;*/\
@@ -136,7 +131,9 @@ VM_MASK = 0x00020000
136 CFI_REL_OFFSET ebx, 0;\ 131 CFI_REL_OFFSET ebx, 0;\
137 movl $(__USER_DS), %edx; \ 132 movl $(__USER_DS), %edx; \
138 movl %edx, %ds; \ 133 movl %edx, %ds; \
139 movl %edx, %es; 134 movl %edx, %es; \
135 movl $(__KERNEL_PDA), %edx; \
136 movl %edx, %gs
140 137
141#define RESTORE_INT_REGS \ 138#define RESTORE_INT_REGS \
142 popl %ebx; \ 139 popl %ebx; \
@@ -169,17 +166,22 @@ VM_MASK = 0x00020000
1692: popl %es; \ 1662: popl %es; \
170 CFI_ADJUST_CFA_OFFSET -4;\ 167 CFI_ADJUST_CFA_OFFSET -4;\
171 /*CFI_RESTORE es;*/\ 168 /*CFI_RESTORE es;*/\
172.section .fixup,"ax"; \ 1693: popl %gs; \
1733: movl $0,(%esp); \ 170 CFI_ADJUST_CFA_OFFSET -4;\
174 jmp 1b; \ 171 /*CFI_RESTORE gs;*/\
172.pushsection .fixup,"ax"; \
1754: movl $0,(%esp); \ 1734: movl $0,(%esp); \
174 jmp 1b; \
1755: movl $0,(%esp); \
176 jmp 2b; \ 176 jmp 2b; \
177.previous; \ 1776: movl $0,(%esp); \
178 jmp 3b; \
178.section __ex_table,"a";\ 179.section __ex_table,"a";\
179 .align 4; \ 180 .align 4; \
180 .long 1b,3b; \ 181 .long 1b,4b; \
181 .long 2b,4b; \ 182 .long 2b,5b; \
182.previous 183 .long 3b,6b; \
184.popsection
183 185
184#define RING0_INT_FRAME \ 186#define RING0_INT_FRAME \
185 CFI_STARTPROC simple;\ 187 CFI_STARTPROC simple;\
@@ -198,18 +200,18 @@ VM_MASK = 0x00020000
198#define RING0_PTREGS_FRAME \ 200#define RING0_PTREGS_FRAME \
199 CFI_STARTPROC simple;\ 201 CFI_STARTPROC simple;\
200 CFI_SIGNAL_FRAME;\ 202 CFI_SIGNAL_FRAME;\
201 CFI_DEF_CFA esp, OLDESP-EBX;\ 203 CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\
202 /*CFI_OFFSET cs, CS-OLDESP;*/\ 204 /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\
203 CFI_OFFSET eip, EIP-OLDESP;\ 205 CFI_OFFSET eip, PT_EIP-PT_OLDESP;\
204 /*CFI_OFFSET es, ES-OLDESP;*/\ 206 /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\
205 /*CFI_OFFSET ds, DS-OLDESP;*/\ 207 /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\
206 CFI_OFFSET eax, EAX-OLDESP;\ 208 CFI_OFFSET eax, PT_EAX-PT_OLDESP;\
207 CFI_OFFSET ebp, EBP-OLDESP;\ 209 CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\
208 CFI_OFFSET edi, EDI-OLDESP;\ 210 CFI_OFFSET edi, PT_EDI-PT_OLDESP;\
209 CFI_OFFSET esi, ESI-OLDESP;\ 211 CFI_OFFSET esi, PT_ESI-PT_OLDESP;\
210 CFI_OFFSET edx, EDX-OLDESP;\ 212 CFI_OFFSET edx, PT_EDX-PT_OLDESP;\
211 CFI_OFFSET ecx, ECX-OLDESP;\ 213 CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\
212 CFI_OFFSET ebx, EBX-OLDESP 214 CFI_OFFSET ebx, PT_EBX-PT_OLDESP
213 215
214ENTRY(ret_from_fork) 216ENTRY(ret_from_fork)
215 CFI_STARTPROC 217 CFI_STARTPROC
@@ -237,17 +239,18 @@ ENTRY(ret_from_fork)
237 ALIGN 239 ALIGN
238 RING0_PTREGS_FRAME 240 RING0_PTREGS_FRAME
239ret_from_exception: 241ret_from_exception:
240 preempt_stop 242 preempt_stop(CLBR_ANY)
241ret_from_intr: 243ret_from_intr:
242 GET_THREAD_INFO(%ebp) 244 GET_THREAD_INFO(%ebp)
243check_userspace: 245check_userspace:
244 movl EFLAGS(%esp), %eax # mix EFLAGS and CS 246 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
245 movb CS(%esp), %al 247 movb PT_CS(%esp), %al
246 andl $(VM_MASK | SEGMENT_RPL_MASK), %eax 248 andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
247 cmpl $USER_RPL, %eax 249 cmpl $USER_RPL, %eax
248 jb resume_kernel # not returning to v8086 or userspace 250 jb resume_kernel # not returning to v8086 or userspace
251
249ENTRY(resume_userspace) 252ENTRY(resume_userspace)
250 DISABLE_INTERRUPTS # make sure we don't miss an interrupt 253 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
251 # setting need_resched or sigpending 254 # setting need_resched or sigpending
252 # between sampling and the iret 255 # between sampling and the iret
253 movl TI_flags(%ebp), %ecx 256 movl TI_flags(%ebp), %ecx
@@ -258,14 +261,14 @@ ENTRY(resume_userspace)
258 261
259#ifdef CONFIG_PREEMPT 262#ifdef CONFIG_PREEMPT
260ENTRY(resume_kernel) 263ENTRY(resume_kernel)
261 DISABLE_INTERRUPTS 264 DISABLE_INTERRUPTS(CLBR_ANY)
262 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? 265 cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
263 jnz restore_nocheck 266 jnz restore_nocheck
264need_resched: 267need_resched:
265 movl TI_flags(%ebp), %ecx # need_resched set ? 268 movl TI_flags(%ebp), %ecx # need_resched set ?
266 testb $_TIF_NEED_RESCHED, %cl 269 testb $_TIF_NEED_RESCHED, %cl
267 jz restore_all 270 jz restore_all
268 testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? 271 testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
269 jz restore_all 272 jz restore_all
270 call preempt_schedule_irq 273 call preempt_schedule_irq
271 jmp need_resched 274 jmp need_resched
@@ -287,7 +290,7 @@ sysenter_past_esp:
287 * No need to follow this irqs on/off section: the syscall 290 * No need to follow this irqs on/off section: the syscall
288 * disabled irqs and here we enable it straight after entry: 291 * disabled irqs and here we enable it straight after entry:
289 */ 292 */
290 ENABLE_INTERRUPTS 293 ENABLE_INTERRUPTS(CLBR_NONE)
291 pushl $(__USER_DS) 294 pushl $(__USER_DS)
292 CFI_ADJUST_CFA_OFFSET 4 295 CFI_ADJUST_CFA_OFFSET 4
293 /*CFI_REL_OFFSET ss, 0*/ 296 /*CFI_REL_OFFSET ss, 0*/
@@ -331,20 +334,27 @@ sysenter_past_esp:
331 cmpl $(nr_syscalls), %eax 334 cmpl $(nr_syscalls), %eax
332 jae syscall_badsys 335 jae syscall_badsys
333 call *sys_call_table(,%eax,4) 336 call *sys_call_table(,%eax,4)
334 movl %eax,EAX(%esp) 337 movl %eax,PT_EAX(%esp)
335 DISABLE_INTERRUPTS 338 DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
336 TRACE_IRQS_OFF 339 TRACE_IRQS_OFF
337 movl TI_flags(%ebp), %ecx 340 movl TI_flags(%ebp), %ecx
338 testw $_TIF_ALLWORK_MASK, %cx 341 testw $_TIF_ALLWORK_MASK, %cx
339 jne syscall_exit_work 342 jne syscall_exit_work
340/* if something modifies registers it must also disable sysexit */ 343/* if something modifies registers it must also disable sysexit */
341 movl EIP(%esp), %edx 344 movl PT_EIP(%esp), %edx
342 movl OLDESP(%esp), %ecx 345 movl PT_OLDESP(%esp), %ecx
343 xorl %ebp,%ebp 346 xorl %ebp,%ebp
344 TRACE_IRQS_ON 347 TRACE_IRQS_ON
3481: mov PT_GS(%esp), %gs
345 ENABLE_INTERRUPTS_SYSEXIT 349 ENABLE_INTERRUPTS_SYSEXIT
346 CFI_ENDPROC 350 CFI_ENDPROC
347 351.pushsection .fixup,"ax"
3522: movl $0,PT_GS(%esp)
353 jmp 1b
354.section __ex_table,"a"
355 .align 4
356 .long 1b,2b
357.popsection
348 358
349 # system call handler stub 359 # system call handler stub
350ENTRY(system_call) 360ENTRY(system_call)
@@ -353,7 +363,7 @@ ENTRY(system_call)
353 CFI_ADJUST_CFA_OFFSET 4 363 CFI_ADJUST_CFA_OFFSET 4
354 SAVE_ALL 364 SAVE_ALL
355 GET_THREAD_INFO(%ebp) 365 GET_THREAD_INFO(%ebp)
356 testl $TF_MASK,EFLAGS(%esp) 366 testl $TF_MASK,PT_EFLAGS(%esp)
357 jz no_singlestep 367 jz no_singlestep
358 orl $_TIF_SINGLESTEP,TI_flags(%ebp) 368 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
359no_singlestep: 369no_singlestep:
@@ -365,9 +375,9 @@ no_singlestep:
365 jae syscall_badsys 375 jae syscall_badsys
366syscall_call: 376syscall_call:
367 call *sys_call_table(,%eax,4) 377 call *sys_call_table(,%eax,4)
368 movl %eax,EAX(%esp) # store the return value 378 movl %eax,PT_EAX(%esp) # store the return value
369syscall_exit: 379syscall_exit:
370 DISABLE_INTERRUPTS # make sure we don't miss an interrupt 380 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
371 # setting need_resched or sigpending 381 # setting need_resched or sigpending
372 # between sampling and the iret 382 # between sampling and the iret
373 TRACE_IRQS_OFF 383 TRACE_IRQS_OFF
@@ -376,12 +386,12 @@ syscall_exit:
376 jne syscall_exit_work 386 jne syscall_exit_work
377 387
378restore_all: 388restore_all:
379 movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS 389 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
380 # Warning: OLDSS(%esp) contains the wrong/random values if we 390 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
381 # are returning to the kernel. 391 # are returning to the kernel.
382 # See comments in process.c:copy_thread() for details. 392 # See comments in process.c:copy_thread() for details.
383 movb OLDSS(%esp), %ah 393 movb PT_OLDSS(%esp), %ah
384 movb CS(%esp), %al 394 movb PT_CS(%esp), %al
385 andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax 395 andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
386 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax 396 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
387 CFI_REMEMBER_STATE 397 CFI_REMEMBER_STATE
@@ -390,13 +400,13 @@ restore_nocheck:
390 TRACE_IRQS_IRET 400 TRACE_IRQS_IRET
391restore_nocheck_notrace: 401restore_nocheck_notrace:
392 RESTORE_REGS 402 RESTORE_REGS
393 addl $4, %esp 403 addl $4, %esp # skip orig_eax/error_code
394 CFI_ADJUST_CFA_OFFSET -4 404 CFI_ADJUST_CFA_OFFSET -4
3951: INTERRUPT_RETURN 4051: INTERRUPT_RETURN
396.section .fixup,"ax" 406.section .fixup,"ax"
397iret_exc: 407iret_exc:
398 TRACE_IRQS_ON 408 TRACE_IRQS_ON
399 ENABLE_INTERRUPTS 409 ENABLE_INTERRUPTS(CLBR_NONE)
400 pushl $0 # no error code 410 pushl $0 # no error code
401 pushl $do_iret_error 411 pushl $do_iret_error
402 jmp error_code 412 jmp error_code
@@ -408,33 +418,42 @@ iret_exc:
408 418
409 CFI_RESTORE_STATE 419 CFI_RESTORE_STATE
410ldt_ss: 420ldt_ss:
411 larl OLDSS(%esp), %eax 421 larl PT_OLDSS(%esp), %eax
412 jnz restore_nocheck 422 jnz restore_nocheck
413 testl $0x00400000, %eax # returning to 32bit stack? 423 testl $0x00400000, %eax # returning to 32bit stack?
 414	jnz restore_nocheck	# all right, normal return 424	jnz restore_nocheck	# all right, normal return
425
426#ifdef CONFIG_PARAVIRT
427 /*
428 * The kernel can't run on a non-flat stack if paravirt mode
 429	 * is active. Rather than try to fix up the high bits of
 430	 * ESP, bypass this code entirely. This may break DOSemu
 431	 * and/or Wine support in a paravirt VM, although the option
 432	 * is still available to implement setting the high
 433	 * 16 bits in the INTERRUPT_RETURN paravirt-op.
434 */
435 cmpl $0, paravirt_ops+PARAVIRT_enabled
436 jne restore_nocheck
437#endif
438
415 /* If returning to userspace with 16bit stack, 439 /* If returning to userspace with 16bit stack,
416 * try to fix the higher word of ESP, as the CPU 440 * try to fix the higher word of ESP, as the CPU
417 * won't restore it. 441 * won't restore it.
418 * This is an "official" bug of all the x86-compatible 442 * This is an "official" bug of all the x86-compatible
419 * CPUs, which we can try to work around to make 443 * CPUs, which we can try to work around to make
420 * dosemu and wine happy. */ 444 * dosemu and wine happy. */
421 subl $8, %esp # reserve space for switch16 pointer 445 movl PT_OLDESP(%esp), %eax
422 CFI_ADJUST_CFA_OFFSET 8 446 movl %esp, %edx
423 DISABLE_INTERRUPTS 447 call patch_espfix_desc
448 pushl $__ESPFIX_SS
449 CFI_ADJUST_CFA_OFFSET 4
450 pushl %eax
451 CFI_ADJUST_CFA_OFFSET 4
452 DISABLE_INTERRUPTS(CLBR_EAX)
424 TRACE_IRQS_OFF 453 TRACE_IRQS_OFF
425 movl %esp, %eax 454 lss (%esp), %esp
426 /* Set up the 16bit stack frame with switch32 pointer on top, 455 CFI_ADJUST_CFA_OFFSET -8
427 * and a switch16 pointer on top of the current frame. */ 456 jmp restore_nocheck
428 call setup_x86_bogus_stack
429 CFI_ADJUST_CFA_OFFSET -8 # frame has moved
430 TRACE_IRQS_IRET
431 RESTORE_REGS
432 lss 20+4(%esp), %esp # switch to 16bit stack
4331: INTERRUPT_RETURN
434.section __ex_table,"a"
435 .align 4
436 .long 1b,iret_exc
437.previous
438 CFI_ENDPROC 457 CFI_ENDPROC
439 458
440 # perform work that needs to be done immediately before resumption 459 # perform work that needs to be done immediately before resumption
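The espfix mechanism replacing the old 16-bit-stack path works roughly as follows (inferred from the code above; not spelled out in the patch): an iret to a 16-bit stack segment restores only the low word of ESP, so patch_espfix_desc() biases the base of the __ESPFIX_SS descriptor such that

	espfix_base + (esp & 0xffff) == intended user stack address

and the lss switches onto that segment before the final iret, so the truncated ESP still resolves to the right place.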
@@ -445,7 +464,7 @@ work_pending:
445 jz work_notifysig 464 jz work_notifysig
446work_resched: 465work_resched:
447 call schedule 466 call schedule
448 DISABLE_INTERRUPTS # make sure we don't miss an interrupt 467 DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
449 # setting need_resched or sigpending 468 # setting need_resched or sigpending
450 # between sampling and the iret 469 # between sampling and the iret
451 TRACE_IRQS_OFF 470 TRACE_IRQS_OFF
@@ -458,7 +477,8 @@ work_resched:
458 477
459work_notifysig: # deal with pending signals and 478work_notifysig: # deal with pending signals and
460 # notify-resume requests 479 # notify-resume requests
461 testl $VM_MASK, EFLAGS(%esp) 480#ifdef CONFIG_VM86
481 testl $VM_MASK, PT_EFLAGS(%esp)
462 movl %esp, %eax 482 movl %esp, %eax
463 jne work_notifysig_v86 # returning to kernel-space or 483 jne work_notifysig_v86 # returning to kernel-space or
464 # vm86-space 484 # vm86-space
@@ -468,29 +488,30 @@ work_notifysig: # deal with pending signals and
468 488
469 ALIGN 489 ALIGN
470work_notifysig_v86: 490work_notifysig_v86:
471#ifdef CONFIG_VM86
472 pushl %ecx # save ti_flags for do_notify_resume 491 pushl %ecx # save ti_flags for do_notify_resume
473 CFI_ADJUST_CFA_OFFSET 4 492 CFI_ADJUST_CFA_OFFSET 4
474 call save_v86_state # %eax contains pt_regs pointer 493 call save_v86_state # %eax contains pt_regs pointer
475 popl %ecx 494 popl %ecx
476 CFI_ADJUST_CFA_OFFSET -4 495 CFI_ADJUST_CFA_OFFSET -4
477 movl %eax, %esp 496 movl %eax, %esp
497#else
498 movl %esp, %eax
499#endif
478 xorl %edx, %edx 500 xorl %edx, %edx
479 call do_notify_resume 501 call do_notify_resume
480 jmp resume_userspace_sig 502 jmp resume_userspace_sig
481#endif
482 503
483 # perform syscall exit tracing 504 # perform syscall exit tracing
484 ALIGN 505 ALIGN
485syscall_trace_entry: 506syscall_trace_entry:
486 movl $-ENOSYS,EAX(%esp) 507 movl $-ENOSYS,PT_EAX(%esp)
487 movl %esp, %eax 508 movl %esp, %eax
488 xorl %edx,%edx 509 xorl %edx,%edx
489 call do_syscall_trace 510 call do_syscall_trace
490 cmpl $0, %eax 511 cmpl $0, %eax
491 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, 512 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
492 # so must skip actual syscall 513 # so must skip actual syscall
493 movl ORIG_EAX(%esp), %eax 514 movl PT_ORIG_EAX(%esp), %eax
494 cmpl $(nr_syscalls), %eax 515 cmpl $(nr_syscalls), %eax
495 jnae syscall_call 516 jnae syscall_call
496 jmp syscall_exit 517 jmp syscall_exit
@@ -501,7 +522,7 @@ syscall_exit_work:
501 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl 522 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
502 jz work_pending 523 jz work_pending
503 TRACE_IRQS_ON 524 TRACE_IRQS_ON
504 ENABLE_INTERRUPTS # could let do_syscall_trace() call 525 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
505 # schedule() instead 526 # schedule() instead
506 movl %esp, %eax 527 movl %esp, %eax
507 movl $1, %edx 528 movl $1, %edx
@@ -515,39 +536,38 @@ syscall_fault:
515 CFI_ADJUST_CFA_OFFSET 4 536 CFI_ADJUST_CFA_OFFSET 4
516 SAVE_ALL 537 SAVE_ALL
517 GET_THREAD_INFO(%ebp) 538 GET_THREAD_INFO(%ebp)
518 movl $-EFAULT,EAX(%esp) 539 movl $-EFAULT,PT_EAX(%esp)
519 jmp resume_userspace 540 jmp resume_userspace
520 541
521syscall_badsys: 542syscall_badsys:
522 movl $-ENOSYS,EAX(%esp) 543 movl $-ENOSYS,PT_EAX(%esp)
523 jmp resume_userspace 544 jmp resume_userspace
524 CFI_ENDPROC 545 CFI_ENDPROC
525 546
526#define FIXUP_ESPFIX_STACK \ 547#define FIXUP_ESPFIX_STACK \
 527	movl %esp, %eax; \ 548	/* since we are on the wrong stack, we can't do this in C :( */ \
528 /* switch to 32bit stack using the pointer on top of 16bit stack */ \ 549 movl %gs:PDA_cpu, %ebx; \
529 lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \ 550 PER_CPU(cpu_gdt_descr, %ebx); \
530 /* copy data from 16bit stack to 32bit stack */ \ 551 movl GDS_address(%ebx), %ebx; \
531 call fixup_x86_bogus_stack; \ 552 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
532 /* put ESP to the proper location */ \ 553 addl %esp, %eax; \
533 movl %eax, %esp; 554 pushl $__KERNEL_DS; \
534#define UNWIND_ESPFIX_STACK \ 555 CFI_ADJUST_CFA_OFFSET 4; \
535 pushl %eax; \ 556 pushl %eax; \
536 CFI_ADJUST_CFA_OFFSET 4; \ 557 CFI_ADJUST_CFA_OFFSET 4; \
558 lss (%esp), %esp; \
559 CFI_ADJUST_CFA_OFFSET -8;
560#define UNWIND_ESPFIX_STACK \
537 movl %ss, %eax; \ 561 movl %ss, %eax; \
538 /* see if on 16bit stack */ \ 562 /* see if on espfix stack */ \
539 cmpw $__ESPFIX_SS, %ax; \ 563 cmpw $__ESPFIX_SS, %ax; \
540 je 28f; \ 564 jne 27f; \
54127: popl %eax; \ 565 movl $__KERNEL_DS, %eax; \
542 CFI_ADJUST_CFA_OFFSET -4; \
543.section .fixup,"ax"; \
54428: movl $__KERNEL_DS, %eax; \
545 movl %eax, %ds; \ 566 movl %eax, %ds; \
546 movl %eax, %es; \ 567 movl %eax, %es; \
547 /* switch to 32bit stack */ \ 568 /* switch to normal stack */ \
548 FIXUP_ESPFIX_STACK; \ 569 FIXUP_ESPFIX_STACK; \
549 jmp 27b; \ 57027:;
550.previous
551 571
552/* 572/*
553 * Build the entry stubs and pointer table with 573 * Build the entry stubs and pointer table with
@@ -608,13 +628,16 @@ KPROBE_ENTRY(page_fault)
608 CFI_ADJUST_CFA_OFFSET 4 628 CFI_ADJUST_CFA_OFFSET 4
609 ALIGN 629 ALIGN
610error_code: 630error_code:
631 /* the function address is in %gs's slot on the stack */
632 pushl %es
633 CFI_ADJUST_CFA_OFFSET 4
634 /*CFI_REL_OFFSET es, 0*/
611 pushl %ds 635 pushl %ds
612 CFI_ADJUST_CFA_OFFSET 4 636 CFI_ADJUST_CFA_OFFSET 4
613 /*CFI_REL_OFFSET ds, 0*/ 637 /*CFI_REL_OFFSET ds, 0*/
614 pushl %eax 638 pushl %eax
615 CFI_ADJUST_CFA_OFFSET 4 639 CFI_ADJUST_CFA_OFFSET 4
616 CFI_REL_OFFSET eax, 0 640 CFI_REL_OFFSET eax, 0
617 xorl %eax, %eax
618 pushl %ebp 641 pushl %ebp
619 CFI_ADJUST_CFA_OFFSET 4 642 CFI_ADJUST_CFA_OFFSET 4
620 CFI_REL_OFFSET ebp, 0 643 CFI_REL_OFFSET ebp, 0
@@ -627,7 +650,6 @@ error_code:
627 pushl %edx 650 pushl %edx
628 CFI_ADJUST_CFA_OFFSET 4 651 CFI_ADJUST_CFA_OFFSET 4
629 CFI_REL_OFFSET edx, 0 652 CFI_REL_OFFSET edx, 0
630 decl %eax # eax = -1
631 pushl %ecx 653 pushl %ecx
632 CFI_ADJUST_CFA_OFFSET 4 654 CFI_ADJUST_CFA_OFFSET 4
633 CFI_REL_OFFSET ecx, 0 655 CFI_REL_OFFSET ecx, 0
@@ -635,18 +657,20 @@ error_code:
635 CFI_ADJUST_CFA_OFFSET 4 657 CFI_ADJUST_CFA_OFFSET 4
636 CFI_REL_OFFSET ebx, 0 658 CFI_REL_OFFSET ebx, 0
637 cld 659 cld
638 pushl %es 660 pushl %gs
639 CFI_ADJUST_CFA_OFFSET 4 661 CFI_ADJUST_CFA_OFFSET 4
640 /*CFI_REL_OFFSET es, 0*/ 662 /*CFI_REL_OFFSET gs, 0*/
663 movl $(__KERNEL_PDA), %ecx
664 movl %ecx, %gs
641 UNWIND_ESPFIX_STACK 665 UNWIND_ESPFIX_STACK
642 popl %ecx 666 popl %ecx
643 CFI_ADJUST_CFA_OFFSET -4 667 CFI_ADJUST_CFA_OFFSET -4
644 /*CFI_REGISTER es, ecx*/ 668 /*CFI_REGISTER es, ecx*/
645 movl ES(%esp), %edi # get the function address 669 movl PT_GS(%esp), %edi # get the function address
646 movl ORIG_EAX(%esp), %edx # get the error code 670 movl PT_ORIG_EAX(%esp), %edx # get the error code
647 movl %eax, ORIG_EAX(%esp) 671 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
648 movl %ecx, ES(%esp) 672 mov %ecx, PT_GS(%esp)
649 /*CFI_REL_OFFSET es, ES*/ 673 /*CFI_REL_OFFSET gs, ES*/
650 movl $(__USER_DS), %ecx 674 movl $(__USER_DS), %ecx
651 movl %ecx, %ds 675 movl %ecx, %ds
652 movl %ecx, %es 676 movl %ecx, %es
@@ -682,7 +706,7 @@ ENTRY(device_not_available)
682 GET_CR0_INTO_EAX 706 GET_CR0_INTO_EAX
683 testl $0x4, %eax # EM (math emulation bit) 707 testl $0x4, %eax # EM (math emulation bit)
684 jne device_not_available_emulate 708 jne device_not_available_emulate
685 preempt_stop 709 preempt_stop(CLBR_ANY)
686 call math_state_restore 710 call math_state_restore
687 jmp ret_from_exception 711 jmp ret_from_exception
688device_not_available_emulate: 712device_not_available_emulate:
@@ -754,7 +778,7 @@ KPROBE_ENTRY(nmi)
754 cmpw $__ESPFIX_SS, %ax 778 cmpw $__ESPFIX_SS, %ax
755 popl %eax 779 popl %eax
756 CFI_ADJUST_CFA_OFFSET -4 780 CFI_ADJUST_CFA_OFFSET -4
757 je nmi_16bit_stack 781 je nmi_espfix_stack
758 cmpl $sysenter_entry,(%esp) 782 cmpl $sysenter_entry,(%esp)
759 je nmi_stack_fixup 783 je nmi_stack_fixup
760 pushl %eax 784 pushl %eax
@@ -797,7 +821,7 @@ nmi_debug_stack_check:
797 FIX_STACK(24,nmi_stack_correct, 1) 821 FIX_STACK(24,nmi_stack_correct, 1)
798 jmp nmi_stack_correct 822 jmp nmi_stack_correct
799 823
800nmi_16bit_stack: 824nmi_espfix_stack:
801 /* We have a RING0_INT_FRAME here. 825 /* We have a RING0_INT_FRAME here.
802 * 826 *
803 * create the pointer to lss back 827 * create the pointer to lss back
@@ -806,7 +830,6 @@ nmi_16bit_stack:
806 CFI_ADJUST_CFA_OFFSET 4 830 CFI_ADJUST_CFA_OFFSET 4
807 pushl %esp 831 pushl %esp
808 CFI_ADJUST_CFA_OFFSET 4 832 CFI_ADJUST_CFA_OFFSET 4
809 movzwl %sp, %esp
810 addw $4, (%esp) 833 addw $4, (%esp)
811 /* copy the iret frame of 12 bytes */ 834 /* copy the iret frame of 12 bytes */
812 .rept 3 835 .rept 3
@@ -817,11 +840,11 @@ nmi_16bit_stack:
817 CFI_ADJUST_CFA_OFFSET 4 840 CFI_ADJUST_CFA_OFFSET 4
818 SAVE_ALL 841 SAVE_ALL
819 FIXUP_ESPFIX_STACK # %eax == %esp 842 FIXUP_ESPFIX_STACK # %eax == %esp
820 CFI_ADJUST_CFA_OFFSET -20 # the frame has now moved
821 xorl %edx,%edx # zero error code 843 xorl %edx,%edx # zero error code
822 call do_nmi 844 call do_nmi
823 RESTORE_REGS 845 RESTORE_REGS
824 lss 12+4(%esp), %esp # back to 16bit stack 846 lss 12+4(%esp), %esp # back to espfix stack
847 CFI_ADJUST_CFA_OFFSET -24
8251: INTERRUPT_RETURN 8481: INTERRUPT_RETURN
826 CFI_ENDPROC 849 CFI_ENDPROC
827.section __ex_table,"a" 850.section __ex_table,"a"
@@ -830,6 +853,19 @@ nmi_16bit_stack:
830.previous 853.previous
831KPROBE_END(nmi) 854KPROBE_END(nmi)
832 855
856#ifdef CONFIG_PARAVIRT
857ENTRY(native_iret)
8581: iret
859.section __ex_table,"a"
860 .align 4
861 .long 1b,iret_exc
862.previous
863
864ENTRY(native_irq_enable_sysexit)
865 sti
866 sysexit
867#endif
868
833KPROBE_ENTRY(int3) 869KPROBE_ENTRY(int3)
834 RING0_INT_FRAME 870 RING0_INT_FRAME
835 pushl $-1 # mark this as an int 871 pushl $-1 # mark this as an int
@@ -943,37 +979,6 @@ ENTRY(spurious_interrupt_bug)
943 jmp error_code 979 jmp error_code
944 CFI_ENDPROC 980 CFI_ENDPROC
945 981
946#ifdef CONFIG_STACK_UNWIND
947ENTRY(arch_unwind_init_running)
948 CFI_STARTPROC
949 movl 4(%esp), %edx
950 movl (%esp), %ecx
951 leal 4(%esp), %eax
952 movl %ebx, EBX(%edx)
953 xorl %ebx, %ebx
954 movl %ebx, ECX(%edx)
955 movl %ebx, EDX(%edx)
956 movl %esi, ESI(%edx)
957 movl %edi, EDI(%edx)
958 movl %ebp, EBP(%edx)
959 movl %ebx, EAX(%edx)
960 movl $__USER_DS, DS(%edx)
961 movl $__USER_DS, ES(%edx)
962 movl %ebx, ORIG_EAX(%edx)
963 movl %ecx, EIP(%edx)
964 movl 12(%esp), %ecx
965 movl $__KERNEL_CS, CS(%edx)
966 movl %ebx, EFLAGS(%edx)
967 movl %eax, OLDESP(%edx)
968 movl 8(%esp), %eax
969 movl %ecx, 8(%esp)
970 movl EBX(%edx), %ebx
971 movl $__KERNEL_DS, OLDSS(%edx)
972 jmpl *%eax
973 CFI_ENDPROC
974ENDPROC(arch_unwind_init_running)
975#endif
976
977ENTRY(kernel_thread_helper) 982ENTRY(kernel_thread_helper)
978 pushl $0 # fake return address for unwinder 983 pushl $0 # fake return address for unwinder
979 CFI_STARTPROC 984 CFI_STARTPROC
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index ca31f18d277c..edef5084ce17 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -55,6 +55,12 @@
55 */ 55 */
56ENTRY(startup_32) 56ENTRY(startup_32)
57 57
58#ifdef CONFIG_PARAVIRT
59 movl %cs, %eax
60 testl $0x3, %eax
61 jnz startup_paravirt
62#endif
63
58/* 64/*
59 * Set segments to known values. 65 * Set segments to known values.
60 */ 66 */
@@ -302,6 +308,7 @@ is386: movl $2,%ecx # set MP
302 movl %eax,%cr0 308 movl %eax,%cr0
303 309
304 call check_x87 310 call check_x87
311 call setup_pda
305 lgdt cpu_gdt_descr 312 lgdt cpu_gdt_descr
306 lidt idt_descr 313 lidt idt_descr
307 ljmp $(__KERNEL_CS),$1f 314 ljmp $(__KERNEL_CS),$1f
@@ -312,10 +319,13 @@ is386: movl $2,%ecx # set MP
312 movl %eax,%ds 319 movl %eax,%ds
313 movl %eax,%es 320 movl %eax,%es
314 321
315 xorl %eax,%eax # Clear FS/GS and LDT 322 xorl %eax,%eax # Clear FS and LDT
316 movl %eax,%fs 323 movl %eax,%fs
317 movl %eax,%gs
318 lldt %ax 324 lldt %ax
325
326 movl $(__KERNEL_PDA),%eax
327 mov %eax,%gs
328
319 cld # gcc2 wants the direction flag cleared at all times 329 cld # gcc2 wants the direction flag cleared at all times
320 pushl $0 # fake return address for unwinder 330 pushl $0 # fake return address for unwinder
321#ifdef CONFIG_SMP 331#ifdef CONFIG_SMP
@@ -346,6 +356,23 @@ check_x87:
346 ret 356 ret
347 357
348/* 358/*
359 * Point the GDT at this CPU's PDA. On boot this will be
360 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
361 * that CPU's GDT and PDA.
362 */
363setup_pda:
364 /* get the PDA pointer */
365 movl start_pda, %eax
366
367 /* slot the PDA address into the GDT */
368 mov cpu_gdt_descr+2, %ecx
369 mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
370 shr $16, %eax
371 mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
372 mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */
373 ret
374
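The byte-slotting in setup_pda follows the standard i386 descriptor layout: a 32-bit base lives in bytes 2-4 and 7 of the 8-byte GDT entry. The same operation as a C sketch (hypothetical helper, not part of the patch):

	static void set_desc_base(unsigned char *entry, unsigned long base)
	{
		entry[2] = base & 0xff;			/* base  7:0  */
		entry[3] = (base >> 8) & 0xff;		/* base 15:8  */
		entry[4] = (base >> 16) & 0xff;		/* base 23:16 */
		entry[7] = (base >> 24) & 0xff;		/* base 31:24 */
	}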
375/*
349 * setup_idt 376 * setup_idt
350 * 377 *
 351 * sets up an idt with 256 entries pointing to 378
@@ -465,6 +492,33 @@ ignore_int:
465#endif 492#endif
466 iret 493 iret
467 494
495#ifdef CONFIG_PARAVIRT
496startup_paravirt:
497 cld
498 movl $(init_thread_union+THREAD_SIZE),%esp
499
500 /* We take pains to preserve all the regs. */
501 pushl %edx
502 pushl %ecx
503 pushl %eax
504
505 /* paravirt.o is last in link, and that probe fn never returns */
506 pushl $__start_paravirtprobe
5071:
508 movl 0(%esp), %eax
509 pushl (%eax)
510 movl 8(%esp), %eax
511 call *(%esp)
512 popl %eax
513
514 movl 4(%esp), %eax
515 movl 8(%esp), %ecx
516 movl 12(%esp), %edx
517
518 addl $4, (%esp)
519 jmp 1b
520#endif
521
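startup_paravirt walks the table of probe functions collected in a dedicated section (bounded by __start_paravirtprobe), calling each in turn; a probe that recognises its hypervisor never returns, which is why the loop has no exit. Conceptually, in C (simplified: the real code hand-preserves and passes the boot-time %eax/%ecx/%edx):

	extern void (*__start_paravirtprobe[])(void);

	static void run_paravirt_probes(void)
	{
		void (**probe)(void) = __start_paravirtprobe;

		for (;;)
			(*probe++)();	/* a matching probe never returns */
	}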
468/* 522/*
469 * Real beginning of normal "text" segment 523 * Real beginning of normal "text" segment
470 */ 524 */
@@ -484,6 +538,8 @@ ENTRY(empty_zero_page)
484 * This starts the data section. 538 * This starts the data section.
485 */ 539 */
486.data 540.data
541ENTRY(start_pda)
542 .long boot_pda
487 543
488ENTRY(stack_start) 544ENTRY(stack_start)
489 .long init_thread_union+THREAD_SIZE 545 .long init_thread_union+THREAD_SIZE
@@ -525,7 +581,7 @@ idt_descr:
525 581
526# boot GDT descriptor (later on used by CPU#0): 582# boot GDT descriptor (later on used by CPU#0):
527 .word 0 # 32 bit align gdt_desc.address 583 .word 0 # 32 bit align gdt_desc.address
528cpu_gdt_descr: 584ENTRY(cpu_gdt_descr)
529 .word GDT_ENTRIES*8-1 585 .word GDT_ENTRIES*8-1
530 .long cpu_gdt_table 586 .long cpu_gdt_table
531 587
@@ -584,8 +640,8 @@ ENTRY(cpu_gdt_table)
584 .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */ 640 .quad 0x00009a000000ffff /* 0xc0 APM CS 16 code (16 bit) */
585 .quad 0x004092000000ffff /* 0xc8 APM DS data */ 641 .quad 0x004092000000ffff /* 0xc8 APM DS data */
586 642
587 .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */ 643 .quad 0x00c0920000000000 /* 0xd0 - ESPFIX SS */
588 .quad 0x0000000000000000 /* 0xd8 - unused */ 644 .quad 0x00cf92000000ffff /* 0xd8 - PDA */
589 .quad 0x0000000000000000 /* 0xe0 - unused */ 645 .quad 0x0000000000000000 /* 0xe0 - unused */
590 .quad 0x0000000000000000 /* 0xe8 - unused */ 646 .quad 0x0000000000000000 /* 0xe8 - unused */
591 .quad 0x0000000000000000 /* 0xf0 - unused */ 647 .quad 0x0000000000000000 /* 0xf0 - unused */
diff --git a/arch/i386/kernel/hpet.c b/arch/i386/kernel/hpet.c
index 17647a530b2f..45a8685bb60b 100644
--- a/arch/i386/kernel/hpet.c
+++ b/arch/i386/kernel/hpet.c
@@ -34,6 +34,7 @@ static int __init init_hpet_clocksource(void)
34 unsigned long hpet_period; 34 unsigned long hpet_period;
35 void __iomem* hpet_base; 35 void __iomem* hpet_base;
36 u64 tmp; 36 u64 tmp;
37 int err;
37 38
38 if (!is_hpet_enabled()) 39 if (!is_hpet_enabled())
39 return -ENODEV; 40 return -ENODEV;
@@ -61,7 +62,11 @@ static int __init init_hpet_clocksource(void)
61 do_div(tmp, FSEC_PER_NSEC); 62 do_div(tmp, FSEC_PER_NSEC);
62 clocksource_hpet.mult = (u32)tmp; 63 clocksource_hpet.mult = (u32)tmp;
63 64
64 return clocksource_register(&clocksource_hpet); 65 err = clocksource_register(&clocksource_hpet);
66 if (err)
67 iounmap(hpet_base);
68
69 return err;
65} 70}
66 71
67module_init(init_hpet_clocksource); 72module_init(init_hpet_clocksource);
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index 62996cd17084..c8d45821c788 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -381,7 +381,10 @@ void __init init_ISA_irqs (void)
381 } 381 }
382} 382}
383 383
384void __init init_IRQ(void) 384/* Overridden in paravirt.c */
385void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
386
387void __init native_init_IRQ(void)
385{ 388{
386 int i; 389 int i;
387 390
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 3b7a63e0ed1a..2424cc9c7b3d 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -34,6 +34,7 @@
34#include <linux/pci.h> 34#include <linux/pci.h>
35#include <linux/msi.h> 35#include <linux/msi.h>
36#include <linux/htirq.h> 36#include <linux/htirq.h>
37#include <linux/freezer.h>
37 38
38#include <asm/io.h> 39#include <asm/io.h>
39#include <asm/smp.h> 40#include <asm/smp.h>
@@ -153,14 +154,20 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
153 * the interrupt, and we need to make sure the entry is fully populated 154 * the interrupt, and we need to make sure the entry is fully populated
154 * before that happens. 155 * before that happens.
155 */ 156 */
156static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 157static void
158__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
157{ 159{
158 unsigned long flags;
159 union entry_union eu; 160 union entry_union eu;
160 eu.entry = e; 161 eu.entry = e;
161 spin_lock_irqsave(&ioapic_lock, flags);
162 io_apic_write(apic, 0x11 + 2*pin, eu.w2); 162 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
163 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 163 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
164}
165
166static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
167{
168 unsigned long flags;
169 spin_lock_irqsave(&ioapic_lock, flags);
170 __ioapic_write_entry(apic, pin, e);
164 spin_unlock_irqrestore(&ioapic_lock, flags); 171 spin_unlock_irqrestore(&ioapic_lock, flags);
165} 172}
166 173
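The split follows the usual bare-helper-plus-locking-wrapper pattern: __ioapic_write_entry() assumes ioapic_lock is already held, so a caller can batch the write with related updates in one critical section, which is exactly what the setup_IO_APIC_irqs() and io_apic_set_pci_routing() hunks below do:

	spin_lock_irqsave(&ioapic_lock, flags);
	__ioapic_write_entry(apic, pin, entry);
	set_native_irq_info(irq, TARGET_CPUS);
	spin_unlock_irqrestore(&ioapic_lock, flags);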
@@ -836,8 +843,7 @@ static int __init find_isa_irq_pin(int irq, int type)
836 843
837 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || 844 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
838 mp_bus_id_to_type[lbus] == MP_BUS_EISA || 845 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
839 mp_bus_id_to_type[lbus] == MP_BUS_MCA || 846 mp_bus_id_to_type[lbus] == MP_BUS_MCA
840 mp_bus_id_to_type[lbus] == MP_BUS_NEC98
841 ) && 847 ) &&
842 (mp_irqs[i].mpc_irqtype == type) && 848 (mp_irqs[i].mpc_irqtype == type) &&
843 (mp_irqs[i].mpc_srcbusirq == irq)) 849 (mp_irqs[i].mpc_srcbusirq == irq))
@@ -856,8 +862,7 @@ static int __init find_isa_irq_apic(int irq, int type)
856 862
857 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || 863 if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
858 mp_bus_id_to_type[lbus] == MP_BUS_EISA || 864 mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
859 mp_bus_id_to_type[lbus] == MP_BUS_MCA || 865 mp_bus_id_to_type[lbus] == MP_BUS_MCA
860 mp_bus_id_to_type[lbus] == MP_BUS_NEC98
861 ) && 866 ) &&
862 (mp_irqs[i].mpc_irqtype == type) && 867 (mp_irqs[i].mpc_irqtype == type) &&
863 (mp_irqs[i].mpc_srcbusirq == irq)) 868 (mp_irqs[i].mpc_srcbusirq == irq))
@@ -987,12 +992,6 @@ static int EISA_ELCR(unsigned int irq)
987#define default_MCA_trigger(idx) (1) 992#define default_MCA_trigger(idx) (1)
988#define default_MCA_polarity(idx) (0) 993#define default_MCA_polarity(idx) (0)
989 994
990/* NEC98 interrupts are always polarity zero edge triggered,
991 * when listed as conforming in the MP table. */
992
993#define default_NEC98_trigger(idx) (0)
994#define default_NEC98_polarity(idx) (0)
995
996static int __init MPBIOS_polarity(int idx) 995static int __init MPBIOS_polarity(int idx)
997{ 996{
998 int bus = mp_irqs[idx].mpc_srcbus; 997 int bus = mp_irqs[idx].mpc_srcbus;
@@ -1027,11 +1026,6 @@ static int __init MPBIOS_polarity(int idx)
1027 polarity = default_MCA_polarity(idx); 1026 polarity = default_MCA_polarity(idx);
1028 break; 1027 break;
1029 } 1028 }
1030 case MP_BUS_NEC98: /* NEC 98 pin */
1031 {
1032 polarity = default_NEC98_polarity(idx);
1033 break;
1034 }
1035 default: 1029 default:
1036 { 1030 {
1037 printk(KERN_WARNING "broken BIOS!!\n"); 1031 printk(KERN_WARNING "broken BIOS!!\n");
@@ -1101,11 +1095,6 @@ static int MPBIOS_trigger(int idx)
1101 trigger = default_MCA_trigger(idx); 1095 trigger = default_MCA_trigger(idx);
1102 break; 1096 break;
1103 } 1097 }
1104 case MP_BUS_NEC98: /* NEC 98 pin */
1105 {
1106 trigger = default_NEC98_trigger(idx);
1107 break;
1108 }
1109 default: 1098 default:
1110 { 1099 {
1111 printk(KERN_WARNING "broken BIOS!!\n"); 1100 printk(KERN_WARNING "broken BIOS!!\n");
@@ -1167,7 +1156,6 @@ static int pin_2_irq(int idx, int apic, int pin)
1167 case MP_BUS_ISA: /* ISA pin */ 1156 case MP_BUS_ISA: /* ISA pin */
1168 case MP_BUS_EISA: 1157 case MP_BUS_EISA:
1169 case MP_BUS_MCA: 1158 case MP_BUS_MCA:
1170 case MP_BUS_NEC98:
1171 { 1159 {
1172 irq = mp_irqs[idx].mpc_srcbusirq; 1160 irq = mp_irqs[idx].mpc_srcbusirq;
1173 break; 1161 break;
@@ -1235,7 +1223,7 @@ static inline int IO_APIC_irq_trigger(int irq)
1235} 1223}
1236 1224
1237/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ 1225/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
1238u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; 1226static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
1239 1227
1240static int __assign_irq_vector(int irq) 1228static int __assign_irq_vector(int irq)
1241{ 1229{
@@ -1360,8 +1348,8 @@ static void __init setup_IO_APIC_irqs(void)
1360 if (!apic && (irq < 16)) 1348 if (!apic && (irq < 16))
1361 disable_8259A_irq(irq); 1349 disable_8259A_irq(irq);
1362 } 1350 }
1363 ioapic_write_entry(apic, pin, entry);
1364 spin_lock_irqsave(&ioapic_lock, flags); 1351 spin_lock_irqsave(&ioapic_lock, flags);
1352 __ioapic_write_entry(apic, pin, entry);
1365 set_native_irq_info(irq, TARGET_CPUS); 1353 set_native_irq_info(irq, TARGET_CPUS);
1366 spin_unlock_irqrestore(&ioapic_lock, flags); 1354 spin_unlock_irqrestore(&ioapic_lock, flags);
1367 } 1355 }
@@ -1926,6 +1914,15 @@ static void __init setup_ioapic_ids_from_mpc(void)
1926static void __init setup_ioapic_ids_from_mpc(void) { } 1914static void __init setup_ioapic_ids_from_mpc(void) { }
1927#endif 1915#endif
1928 1916
1917static int no_timer_check __initdata;
1918
1919static int __init notimercheck(char *s)
1920{
1921 no_timer_check = 1;
1922 return 1;
1923}
1924__setup("no_timer_check", notimercheck);
1925
1929/* 1926/*
1930 * There is a nasty bug in some older SMP boards, their mptable lies 1927 * There is a nasty bug in some older SMP boards, their mptable lies
1931 * about the timer IRQ. We do the following to work around the situation: 1928 * about the timer IRQ. We do the following to work around the situation:
@@ -1934,10 +1931,13 @@ static void __init setup_ioapic_ids_from_mpc(void) { }
1934 * - if this function detects that timer IRQs are defunct, then we fall 1931 * - if this function detects that timer IRQs are defunct, then we fall
1935 * back to ISA timer IRQs 1932 * back to ISA timer IRQs
1936 */ 1933 */
1937static int __init timer_irq_works(void) 1934int __init timer_irq_works(void)
1938{ 1935{
1939 unsigned long t1 = jiffies; 1936 unsigned long t1 = jiffies;
1940 1937
1938 if (no_timer_check)
1939 return 1;
1940
1941 local_irq_enable(); 1941 local_irq_enable();
1942 /* Let ten ticks pass... */ 1942 /* Let ten ticks pass... */
1943 mdelay((10 * 1000) / HZ); 1943 mdelay((10 * 1000) / HZ);
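With the hook above, a machine whose timer routing is known-good (or where the ten-tick mdelay() probe is unreliable, e.g. under emulation) can skip the check by appending the new parameter to the kernel command line:

	no_timer_check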
@@ -2161,9 +2161,15 @@ static inline void unlock_ExtINT_logic(void)
2161 unsigned char save_control, save_freq_select; 2161 unsigned char save_control, save_freq_select;
2162 2162
2163 pin = find_isa_irq_pin(8, mp_INT); 2163 pin = find_isa_irq_pin(8, mp_INT);
2164 if (pin == -1) {
2165 WARN_ON_ONCE(1);
2166 return;
2167 }
2164 apic = find_isa_irq_apic(8, mp_INT); 2168 apic = find_isa_irq_apic(8, mp_INT);
2165 if (pin == -1) 2169 if (apic == -1) {
2170 WARN_ON_ONCE(1);
2166 return; 2171 return;
2172 }
2167 2173
2168 entry0 = ioapic_read_entry(apic, pin); 2174 entry0 = ioapic_read_entry(apic, pin);
2169 clear_IO_APIC_pin(apic, pin); 2175 clear_IO_APIC_pin(apic, pin);
@@ -2208,7 +2214,7 @@ int timer_uses_ioapic_pin_0;
2208 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast 2214 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2209 * fanatically on his truly buggy board. 2215 * fanatically on his truly buggy board.
2210 */ 2216 */
2211static inline void check_timer(void) 2217static inline void __init check_timer(void)
2212{ 2218{
2213 int apic1, pin1, apic2, pin2; 2219 int apic1, pin1, apic2, pin2;
2214 int vector; 2220 int vector;
@@ -2479,7 +2485,7 @@ device_initcall(ioapic_init_sysfs);
2479int create_irq(void) 2485int create_irq(void)
2480{ 2486{
2481 /* Allocate an unused irq */ 2487 /* Allocate an unused irq */
2482 int irq, new, vector; 2488 int irq, new, vector = 0;
2483 unsigned long flags; 2489 unsigned long flags;
2484 2490
2485 irq = -ENOSPC; 2491 irq = -ENOSPC;
@@ -2856,8 +2862,8 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
2856 if (!ioapic && (irq < 16)) 2862 if (!ioapic && (irq < 16))
2857 disable_8259A_irq(irq); 2863 disable_8259A_irq(irq);
2858 2864
2859 ioapic_write_entry(ioapic, pin, entry);
2860 spin_lock_irqsave(&ioapic_lock, flags); 2865 spin_lock_irqsave(&ioapic_lock, flags);
2866 __ioapic_write_entry(ioapic, pin, entry);
2861 set_native_irq_info(irq, TARGET_CPUS); 2867 set_native_irq_info(irq, TARGET_CPUS);
2862 spin_unlock_irqrestore(&ioapic_lock, flags); 2868 spin_unlock_irqrestore(&ioapic_lock, flags);
2863 2869
diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index fc79e1e859c4..af1d53344993 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -184,7 +184,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p)
184void __kprobes arch_remove_kprobe(struct kprobe *p) 184void __kprobes arch_remove_kprobe(struct kprobe *p)
185{ 185{
186 mutex_lock(&kprobe_mutex); 186 mutex_lock(&kprobe_mutex);
187 free_insn_slot(p->ainsn.insn); 187 free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1));
188 mutex_unlock(&kprobe_mutex); 188 mutex_unlock(&kprobe_mutex);
189} 189}
190 190
@@ -333,7 +333,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
333 return 1; 333 return 1;
334 334
335ss_probe: 335ss_probe:
336#ifndef CONFIG_PREEMPT 336#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM)
337 if (p->ainsn.boostable == 1 && !p->post_handler){ 337 if (p->ainsn.boostable == 1 && !p->post_handler){
338 /* Boost up -- we can execute copied instructions directly */ 338 /* Boost up -- we can execute copied instructions directly */
339 reset_current_kprobe(); 339 reset_current_kprobe();
diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c
index 445211eb2d57..b410e5fb034f 100644
--- a/arch/i386/kernel/ldt.c
+++ b/arch/i386/kernel/ldt.c
@@ -160,16 +160,14 @@ static int read_default_ldt(void __user * ptr, unsigned long bytecount)
160{ 160{
161 int err; 161 int err;
162 unsigned long size; 162 unsigned long size;
163 void *address;
164 163
165 err = 0; 164 err = 0;
166 address = &default_ldt[0];
167 size = 5*sizeof(struct desc_struct); 165 size = 5*sizeof(struct desc_struct);
168 if (size > bytecount) 166 if (size > bytecount)
169 size = bytecount; 167 size = bytecount;
170 168
171 err = size; 169 err = size;
172 if (copy_to_user(ptr, address, size)) 170 if (clear_user(ptr, size))
173 err = -EFAULT; 171 err = -EFAULT;
174 172
175 return err; 173 return err;
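The substitution is behavior-preserving only because default_ldt consists of zeroed descriptors, so copying from it and zero-filling the destination produce the same bytes; that rationale is inferred from the change, not stated in it.

	/* before: copy 'size' bytes of all-zero descriptors */
	if (copy_to_user(ptr, &default_ldt[0], size))
		err = -EFAULT;

	/* after: write the same zeroes without the source array */
	if (clear_user(ptr, size))
		err = -EFAULT;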
diff --git a/arch/i386/kernel/mca.c b/arch/i386/kernel/mca.c
index eb57a851789d..b83672b89527 100644
--- a/arch/i386/kernel/mca.c
+++ b/arch/i386/kernel/mca.c
@@ -283,10 +283,9 @@ static int __init mca_init(void)
283 bus->f.mca_transform_memory = mca_dummy_transform_memory; 283 bus->f.mca_transform_memory = mca_dummy_transform_memory;
284 284
285 /* get the motherboard device */ 285 /* get the motherboard device */
286 mca_dev = kmalloc(sizeof(struct mca_device), GFP_KERNEL); 286 mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
287 if(unlikely(!mca_dev)) 287 if(unlikely(!mca_dev))
288 goto out_nomem; 288 goto out_nomem;
289 memset(mca_dev, 0, sizeof(struct mca_device));
290 289
291 /* 290 /*
292 * We do not expect many MCA interrupts during initialization, 291 * We do not expect many MCA interrupts during initialization,
@@ -310,11 +309,9 @@ static int __init mca_init(void)
310 mca_dev->slot = MCA_MOTHERBOARD; 309 mca_dev->slot = MCA_MOTHERBOARD;
311 mca_register_device(MCA_PRIMARY_BUS, mca_dev); 310 mca_register_device(MCA_PRIMARY_BUS, mca_dev);
312 311
313 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); 312 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
314 if(unlikely(!mca_dev)) 313 if(unlikely(!mca_dev))
315 goto out_unlock_nomem; 314 goto out_unlock_nomem;
316 memset(mca_dev, 0, sizeof(struct mca_device));
317
318 315
319 /* Put motherboard into video setup mode, read integrated video 316 /* Put motherboard into video setup mode, read integrated video
320 * POS registers, and turn motherboard setup off. 317 * POS registers, and turn motherboard setup off.
@@ -349,10 +346,9 @@ static int __init mca_init(void)
349 } 346 }
350 if(which_scsi) { 347 if(which_scsi) {
351 /* found a scsi card */ 348 /* found a scsi card */
352 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); 349 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
353 if(unlikely(!mca_dev)) 350 if(unlikely(!mca_dev))
354 goto out_unlock_nomem; 351 goto out_unlock_nomem;
355 memset(mca_dev, 0, sizeof(struct mca_device));
356 352
357 for(j = 0; j < 8; j++) 353 for(j = 0; j < 8; j++)
358 mca_dev->pos[j] = pos[j]; 354 mca_dev->pos[j] = pos[j];
@@ -378,10 +374,9 @@ static int __init mca_init(void)
378 if(!mca_read_and_store_pos(pos)) 374 if(!mca_read_and_store_pos(pos))
379 continue; 375 continue;
380 376
381 mca_dev = kmalloc(sizeof(struct mca_device), GFP_ATOMIC); 377 mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
382 if(unlikely(!mca_dev)) 378 if(unlikely(!mca_dev))
383 goto out_unlock_nomem; 379 goto out_unlock_nomem;
384 memset(mca_dev, 0, sizeof(struct mca_device));
385 380
386 for(j=0; j<8; j++) 381 for(j=0; j<8; j++)
387 mca_dev->pos[j]=pos[j]; 382 mca_dev->pos[j]=pos[j];
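Each hunk in this file applies the same mechanical conversion: kzalloc(size, flags) is kmalloc(size, flags) plus a zeroing memset, so the explicit memset becomes redundant and is dropped.

	/* before */
	mca_dev = kmalloc(sizeof(struct mca_device), GFP_KERNEL);
	if (unlikely(!mca_dev))
		goto out_nomem;
	memset(mca_dev, 0, sizeof(struct mca_device));

	/* after */
	mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
	if (unlikely(!mca_dev))
		goto out_nomem;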
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index 23f5984d0654..c8fa13721bcb 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Intel CPU Microcode Update Driver for Linux 2 * Intel CPU Microcode Update Driver for Linux
3 * 3 *
4 * Copyright (C) 2000-2004 Tigran Aivazian 4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com> 5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 * 6 *
 7 * This driver allows upgrading microcode on Intel processors 7 * This driver allows upgrading microcode on Intel processors
@@ -92,7 +92,7 @@
92#include <asm/processor.h> 92#include <asm/processor.h>
93 93
94MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); 94MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
95MODULE_AUTHOR("Tigran Aivazian <tigran@veritas.com>"); 95MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
96MODULE_LICENSE("GPL"); 96MODULE_LICENSE("GPL");
97 97
98#define MICROCODE_VERSION "1.14a" 98#define MICROCODE_VERSION "1.14a"
@@ -703,7 +703,6 @@ static struct sysdev_driver mc_sysdev_driver = {
703 .resume = mc_sysdev_resume, 703 .resume = mc_sysdev_resume,
704}; 704};
705 705
706#ifdef CONFIG_HOTPLUG_CPU
707static __cpuinit int 706static __cpuinit int
708mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) 707mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
709{ 708{
@@ -723,10 +722,9 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
723 return NOTIFY_OK; 722 return NOTIFY_OK;
724} 723}
725 724
726static struct notifier_block mc_cpu_notifier = { 725static struct notifier_block __cpuinitdata mc_cpu_notifier = {
727 .notifier_call = mc_cpu_callback, 726 .notifier_call = mc_cpu_callback,
728}; 727};
729#endif
730 728
731static int __init microcode_init (void) 729static int __init microcode_init (void)
732{ 730{
@@ -754,7 +752,7 @@ static int __init microcode_init (void)
754 register_hotcpu_notifier(&mc_cpu_notifier); 752 register_hotcpu_notifier(&mc_cpu_notifier);
755 753
756 printk(KERN_INFO 754 printk(KERN_INFO
757 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@veritas.com>\n"); 755 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
758 return 0; 756 return 0;
759} 757}
760 758
diff --git a/arch/i386/kernel/module.c b/arch/i386/kernel/module.c
index 470cf97e7cd3..3db0a5442eb1 100644
--- a/arch/i386/kernel/module.c
+++ b/arch/i386/kernel/module.c
@@ -21,6 +21,7 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/string.h> 22#include <linux/string.h>
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/bug.h>
24 25
25#if 0 26#if 0
26#define DEBUGP printk 27#define DEBUGP printk
@@ -108,7 +109,8 @@ int module_finalize(const Elf_Ehdr *hdr,
108 const Elf_Shdr *sechdrs, 109 const Elf_Shdr *sechdrs,
109 struct module *me) 110 struct module *me)
110{ 111{
111 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; 112 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
113 *para = NULL;
112 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 114 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
113 115
114 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 116 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -118,6 +120,8 @@ int module_finalize(const Elf_Ehdr *hdr,
118 alt = s; 120 alt = s;
119 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 121 if (!strcmp(".smp_locks", secstrings + s->sh_name))
120 locks= s; 122 locks= s;
123 if (!strcmp(".parainstructions", secstrings + s->sh_name))
124 para = s;
121 } 125 }
122 126
123 if (alt) { 127 if (alt) {
@@ -132,10 +136,17 @@ int module_finalize(const Elf_Ehdr *hdr,
132 lseg, lseg + locks->sh_size, 136 lseg, lseg + locks->sh_size,
133 tseg, tseg + text->sh_size); 137 tseg, tseg + text->sh_size);
134 } 138 }
135 return 0; 139
140 if (para) {
141 void *pseg = (void *)para->sh_addr;
142 apply_paravirt(pseg, pseg + para->sh_size);
143 }
144
145 return module_bug_finalize(hdr, sechdrs, me);
136} 146}
137 147
138void module_arch_cleanup(struct module *mod) 148void module_arch_cleanup(struct module *mod)
139{ 149{
140 alternatives_smp_module_del(mod); 150 alternatives_smp_module_del(mod);
151 module_bug_cleanup(mod);
141} 152}
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 442aaf8c77eb..49bff3596bff 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -36,7 +36,7 @@
36 36
37/* Have we found an MP table */ 37/* Have we found an MP table */
38int smp_found_config; 38int smp_found_config;
39unsigned int __initdata maxcpus = NR_CPUS; 39unsigned int __cpuinitdata maxcpus = NR_CPUS;
40 40
41/* 41/*
42 * Various Linux-internal data structures created from the 42 * Various Linux-internal data structures created from the
@@ -102,9 +102,9 @@ static int __init mpf_checksum(unsigned char *mp, int len)
102 */ 102 */
103 103
104static int mpc_record; 104static int mpc_record;
105static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata; 105static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
106 106
107static void __devinit MP_processor_info (struct mpc_config_processor *m) 107static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
108{ 108{
109 int ver, apicid; 109 int ver, apicid;
110 physid_mask_t phys_cpu; 110 physid_mask_t phys_cpu;
@@ -249,8 +249,6 @@ static void __init MP_bus_info (struct mpc_config_bus *m)
249 mp_current_pci_id++; 249 mp_current_pci_id++;
250 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { 250 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
251 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; 251 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
252 } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
253 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
254 } else { 252 } else {
255 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 253 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
256 } 254 }
@@ -824,7 +822,7 @@ void __init mp_register_lapic_address(u64 address)
824 Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); 822 Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
825} 823}
826 824
827void __devinit mp_register_lapic (u8 id, u8 enabled) 825void __cpuinit mp_register_lapic (u8 id, u8 enabled)
828{ 826{
829 struct mpc_config_processor processor; 827 struct mpc_config_processor processor;
830 int boot_cpu = 0; 828 int boot_cpu = 0;
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index d535cdbbfd25..4a472a17d1c6 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -172,7 +172,7 @@ static ssize_t msr_read(struct file *file, char __user * buf,
172 u32 __user *tmp = (u32 __user *) buf; 172 u32 __user *tmp = (u32 __user *) buf;
173 u32 data[2]; 173 u32 data[2];
174 u32 reg = *ppos; 174 u32 reg = *ppos;
175 int cpu = iminor(file->f_dentry->d_inode); 175 int cpu = iminor(file->f_path.dentry->d_inode);
176 int err; 176 int err;
177 177
178 if (count % 8) 178 if (count % 8)
@@ -195,15 +195,14 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
195{ 195{
196 const u32 __user *tmp = (const u32 __user *)buf; 196 const u32 __user *tmp = (const u32 __user *)buf;
197 u32 data[2]; 197 u32 data[2];
198 size_t rv;
199 u32 reg = *ppos; 198 u32 reg = *ppos;
200 int cpu = iminor(file->f_dentry->d_inode); 199 int cpu = iminor(file->f_path.dentry->d_inode);
201 int err; 200 int err;
202 201
203 if (count % 8) 202 if (count % 8)
204 return -EINVAL; /* Invalid chunk size */ 203 return -EINVAL; /* Invalid chunk size */
205 204
206 for (rv = 0; count; count -= 8) { 205 for (; count; count -= 8) {
207 if (copy_from_user(&data, tmp, 8)) 206 if (copy_from_user(&data, tmp, 8))
208 return -EFAULT; 207 return -EFAULT;
209 err = do_wrmsr(cpu, reg, data[0], data[1]); 208 err = do_wrmsr(cpu, reg, data[0], data[1]);
@@ -217,7 +216,7 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
217 216
218static int msr_open(struct inode *inode, struct file *file) 217static int msr_open(struct inode *inode, struct file *file)
219{ 218{
220 unsigned int cpu = iminor(file->f_dentry->d_inode); 219 unsigned int cpu = iminor(file->f_path.dentry->d_inode);
221 struct cpuinfo_x86 *c = &(cpu_data)[cpu]; 220 struct cpuinfo_x86 *c = &(cpu_data)[cpu];
222 221
223 if (cpu >= NR_CPUS || !cpu_online(cpu)) 222 if (cpu >= NR_CPUS || !cpu_online(cpu))
@@ -239,18 +238,17 @@ static struct file_operations msr_fops = {
239 .open = msr_open, 238 .open = msr_open,
240}; 239};
241 240
242static int msr_class_device_create(int i) 241static int msr_device_create(int i)
243{ 242{
244 int err = 0; 243 int err = 0;
245 struct class_device *class_err; 244 struct device *dev;
246 245
247 class_err = class_device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), NULL, "msr%d",i); 246 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), "msr%d",i);
248 if (IS_ERR(class_err)) 247 if (IS_ERR(dev))
249 err = PTR_ERR(class_err); 248 err = PTR_ERR(dev);
250 return err; 249 return err;
251} 250}
252 251
253#ifdef CONFIG_HOTPLUG_CPU
254static int msr_class_cpu_callback(struct notifier_block *nfb, 252static int msr_class_cpu_callback(struct notifier_block *nfb,
255 unsigned long action, void *hcpu) 253 unsigned long action, void *hcpu)
256{ 254{
@@ -258,10 +256,10 @@ static int msr_class_cpu_callback(struct notifier_block *nfb,
258 256
259 switch (action) { 257 switch (action) {
260 case CPU_ONLINE: 258 case CPU_ONLINE:
261 msr_class_device_create(cpu); 259 msr_device_create(cpu);
262 break; 260 break;
263 case CPU_DEAD: 261 case CPU_DEAD:
264 class_device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); 262 device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
265 break; 263 break;
266 } 264 }
267 return NOTIFY_OK; 265 return NOTIFY_OK;
@@ -271,7 +269,6 @@ static struct notifier_block __cpuinitdata msr_class_cpu_notifier =
271{ 269{
272 .notifier_call = msr_class_cpu_callback, 270 .notifier_call = msr_class_cpu_callback,
273}; 271};
274#endif
275 272
276static int __init msr_init(void) 273static int __init msr_init(void)
277{ 274{
@@ -290,7 +287,7 @@ static int __init msr_init(void)
290 goto out_chrdev; 287 goto out_chrdev;
291 } 288 }
292 for_each_online_cpu(i) { 289 for_each_online_cpu(i) {
293 err = msr_class_device_create(i); 290 err = msr_device_create(i);
294 if (err != 0) 291 if (err != 0)
295 goto out_class; 292 goto out_class;
296 } 293 }
@@ -302,7 +299,7 @@ static int __init msr_init(void)
302out_class: 299out_class:
303 i = 0; 300 i = 0;
304 for_each_online_cpu(i) 301 for_each_online_cpu(i)
305 class_device_destroy(msr_class, MKDEV(MSR_MAJOR, i)); 302 device_destroy(msr_class, MKDEV(MSR_MAJOR, i));
306 class_destroy(msr_class); 303 class_destroy(msr_class);
307out_chrdev: 304out_chrdev:
308 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 305 unregister_chrdev(MSR_MAJOR, "cpu/msr");
@@ -314,7 +311,7 @@ static void __exit msr_exit(void)
314{ 311{
315 int cpu = 0; 312 int cpu = 0;
316 for_each_online_cpu(cpu) 313 for_each_online_cpu(cpu)
317 class_device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); 314 device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
318 class_destroy(msr_class); 315 class_destroy(msr_class);
319 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 316 unregister_chrdev(MSR_MAJOR, "cpu/msr");
320 unregister_hotcpu_notifier(&msr_class_cpu_notifier); 317 unregister_hotcpu_notifier(&msr_class_cpu_notifier);
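
Dropping the #ifdef CONFIG_HOTPLUG_CPU guard around msr_class_cpu_callback works because the hotcpu notifier helpers degrade to no-ops when CPU hotplug is compiled out, roughly like this (a simplified sketch, not the exact kernel definitions):

#ifdef CONFIG_HOTPLUG_CPU
#define register_hotcpu_notifier(nb)	register_cpu_notifier(nb)
#define unregister_hotcpu_notifier(nb)	unregister_cpu_notifier(nb)
#else
/* the callback is still compiled (catching bit-rot) but never invoked */
#define register_hotcpu_notifier(nb)	do { (void)(nb); } while (0)
#define unregister_hotcpu_notifier(nb)	do { (void)(nb); } while (0)
#endif
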
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index eaafe233a5da..a5e34d655965 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -22,6 +22,7 @@
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/dmi.h> 23#include <linux/dmi.h>
24#include <linux/kprobes.h> 24#include <linux/kprobes.h>
25#include <linux/cpumask.h>
25 26
26#include <asm/smp.h> 27#include <asm/smp.h>
27#include <asm/nmi.h> 28#include <asm/nmi.h>
@@ -42,6 +43,8 @@ int nmi_watchdog_enabled;
42static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner); 43static DEFINE_PER_CPU(unsigned long, perfctr_nmi_owner);
43static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]); 44static DEFINE_PER_CPU(unsigned long, evntsel_nmi_owner[3]);
44 45
46static cpumask_t backtrace_mask = CPU_MASK_NONE;
47
45/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its 48/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and its
46 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) 49 * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now)
47 */ 50 */
@@ -192,6 +195,8 @@ static __cpuinit inline int nmi_known_cpu(void)
192 return 0; 195 return 0;
193} 196}
194 197
198static int endflag __initdata = 0;
199
195#ifdef CONFIG_SMP 200#ifdef CONFIG_SMP
196/* The performance counters used by NMI_LOCAL_APIC don't trigger when 201/* The performance counters used by NMI_LOCAL_APIC don't trigger when
197 * the CPU is idle. To make sure the NMI watchdog really ticks on all 202 * the CPU is idle. To make sure the NMI watchdog really ticks on all
@@ -199,7 +204,6 @@ static __cpuinit inline int nmi_known_cpu(void)
199 */ 204 */
200static __init void nmi_cpu_busy(void *data) 205static __init void nmi_cpu_busy(void *data)
201{ 206{
202 volatile int *endflag = data;
203 local_irq_enable_in_hardirq(); 207 local_irq_enable_in_hardirq();
204 /* Intentionally don't use cpu_relax here. This is 208 /* Intentionally don't use cpu_relax here. This is
205 to make sure that the performance counter really ticks, 209 to make sure that the performance counter really ticks,
@@ -207,14 +211,13 @@ static __init void nmi_cpu_busy(void *data)
207 pause instruction. On a real HT machine this is fine because 211 pause instruction. On a real HT machine this is fine because
208 all other CPUs are busy with "useless" delay loops and don't 212 all other CPUs are busy with "useless" delay loops and don't
209 care if they get somewhat fewer cycles. */ 213 care if they get somewhat fewer cycles. */
210 while (*endflag == 0) 214 while (endflag == 0)
211 barrier(); 215 mb();
212} 216}
213#endif 217#endif
214 218
215static int __init check_nmi_watchdog(void) 219static int __init check_nmi_watchdog(void)
216{ 220{
217 volatile int endflag = 0;
218 unsigned int *prev_nmi_count; 221 unsigned int *prev_nmi_count;
219 int cpu; 222 int cpu;
220 223
@@ -867,14 +870,16 @@ static unsigned int
867 870
868void touch_nmi_watchdog (void) 871void touch_nmi_watchdog (void)
869{ 872{
870 int i; 873 if (nmi_watchdog > 0) {
874 unsigned cpu;
871 875
872 /* 876 /*
873 * Just reset the alert counters, (other CPUs might be 877 * Just reset the alert counters, (other CPUs might be
874 * spinning on locks we hold): 878 * spinning on locks we hold):
875 */ 879 */
876 for_each_possible_cpu(i) 880 for_each_present_cpu (cpu)
877 alert_counter[i] = 0; 881 alert_counter[cpu] = 0;
882 }
878 883
879 /* 884 /*
880 * Tickle the softlockup detector too: 885 * Tickle the softlockup detector too:
@@ -907,6 +912,16 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
907 touched = 1; 912 touched = 1;
908 } 913 }
909 914
915 if (cpu_isset(cpu, backtrace_mask)) {
916 static DEFINE_SPINLOCK(lock); /* Serialise the printks */
917
918 spin_lock(&lock);
919 printk("NMI backtrace for cpu %d\n", cpu);
920 dump_stack();
921 spin_unlock(&lock);
922 cpu_clear(cpu, backtrace_mask);
923 }
924
910 sum = per_cpu(irq_stat, cpu).apic_timer_irqs; 925 sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
911 926
912 /* if the apic timer isn't firing, this cpu isn't doing much */ 927 /* if the apic timer isn't firing, this cpu isn't doing much */
@@ -1033,6 +1048,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
1033 1048
1034#endif 1049#endif
1035 1050
1051void __trigger_all_cpu_backtrace(void)
1052{
1053 int i;
1054
1055 backtrace_mask = cpu_online_map;
1056 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
1057 for (i = 0; i < 10 * 1000; i++) {
1058 if (cpus_empty(backtrace_mask))
1059 break;
1060 mdelay(1);
1061 }
1062}
1063
1036EXPORT_SYMBOL(nmi_active); 1064EXPORT_SYMBOL(nmi_active);
1037EXPORT_SYMBOL(nmi_watchdog); 1065EXPORT_SYMBOL(nmi_watchdog);
1038EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); 1066EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi);
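
The backtrace machinery added above is a simple handshake: a caller publishes the set of online CPUs in backtrace_mask, each CPU notices its bit inside nmi_watchdog_tick(), dumps its stack under a spinlock, and clears the bit, while the trigger polls for up to ten seconds. A hedged sketch of a caller; the sysrq-style hook name is illustrative, not from this patch:

/* Illustrative caller: request a stack trace from every online CPU.
 * Relies only on __trigger_all_cpu_backtrace() as defined above. */
static void example_sysrq_show_all_cpus(void)
{
	printk(KERN_INFO "Sending NMI backtrace request to all CPUs\n");
	__trigger_all_cpu_backtrace();	/* returns once all bits clear, or after ~10s */
}
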
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
new file mode 100644
index 000000000000..3dceab5828f1
--- /dev/null
+++ b/arch/i386/kernel/paravirt.c
@@ -0,0 +1,569 @@
1/* Paravirtualization interfaces
2 Copyright (C) 2006 Rusty Russell IBM Corporation
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18#include <linux/errno.h>
19#include <linux/module.h>
20#include <linux/efi.h>
21#include <linux/bcd.h>
22#include <linux/start_kernel.h>
23
24#include <asm/bug.h>
25#include <asm/paravirt.h>
26#include <asm/desc.h>
27#include <asm/setup.h>
28#include <asm/arch_hooks.h>
29#include <asm/time.h>
30#include <asm/irq.h>
31#include <asm/delay.h>
32#include <asm/fixmap.h>
33#include <asm/apic.h>
34#include <asm/tlbflush.h>
35
36/* nop stub */
37static void native_nop(void)
38{
39}
40
41static void __init default_banner(void)
42{
43 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
44 paravirt_ops.name);
45}
46
47char *memory_setup(void)
48{
49 return paravirt_ops.memory_setup();
50}
51
52/* Simple instruction patching code. */
53#define DEF_NATIVE(name, code) \
54 extern const char start_##name[], end_##name[]; \
55 asm("start_" #name ": " code "; end_" #name ":")
56DEF_NATIVE(cli, "cli");
57DEF_NATIVE(sti, "sti");
58DEF_NATIVE(popf, "push %eax; popf");
59DEF_NATIVE(pushf, "pushf; pop %eax");
60DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli");
61DEF_NATIVE(iret, "iret");
62DEF_NATIVE(sti_sysexit, "sti; sysexit");
63
64static const struct native_insns
65{
66 const char *start, *end;
67} native_insns[] = {
68 [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
69 [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
70 [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
71 [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
72 [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
73 [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
74 [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit },
75};
76
77static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
78{
79 unsigned int insn_len;
80
81 /* Don't touch it if we don't have a replacement */
82 if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start)
83 return len;
84
85 insn_len = native_insns[type].end - native_insns[type].start;
86
87 /* Similarly if we can't fit replacement. */
88 if (len < insn_len)
89 return len;
90
91 memcpy(insns, native_insns[type].start, insn_len);
92 return insn_len;
93}
94
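The table above drives boot-time patching: call sites compiled as indirect calls through paravirt_ops carry a (type, clobbers, length) record, and native_patch() overwrites a site with the short native sequence when one exists and fits. A hedged sketch of the consuming loop; the paravirt_site layout and the nop-padding helper are assumptions for illustration, not definitions from this patch:

static void example_pad_nops(u8 *at, unsigned count);	/* assumed helper */

struct paravirt_site {		/* hypothetical per-site record */
	u8 *instr;		/* address of the patchable call site */
	u8 type;		/* PARAVIRT_* index into native_insns[] */
	u16 clobbers;
	u8 len;			/* bytes available at the site */
};

static void example_apply_paravirt(struct paravirt_site *site,
				   struct paravirt_site *end)
{
	for (; site < end; site++) {
		unsigned used = paravirt_ops.patch(site->type, site->clobbers,
						   site->instr, site->len);
		/* pad any leftover bytes with nops */
		example_pad_nops(site->instr + used, site->len - used);
	}
}
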
95static fastcall unsigned long native_get_debugreg(int regno)
96{
97 unsigned long val = 0; /* Damn you, gcc! */
98
99 switch (regno) {
100 case 0:
101 asm("movl %%db0, %0" :"=r" (val)); break;
102 case 1:
103 asm("movl %%db1, %0" :"=r" (val)); break;
104 case 2:
105 asm("movl %%db2, %0" :"=r" (val)); break;
106 case 3:
107 asm("movl %%db3, %0" :"=r" (val)); break;
108 case 6:
109 asm("movl %%db6, %0" :"=r" (val)); break;
110 case 7:
111 asm("movl %%db7, %0" :"=r" (val)); break;
112 default:
113 BUG();
114 }
115 return val;
116}
117
118static fastcall void native_set_debugreg(int regno, unsigned long value)
119{
120 switch (regno) {
121 case 0:
122 asm("movl %0,%%db0" : /* no output */ :"r" (value));
123 break;
124 case 1:
125 asm("movl %0,%%db1" : /* no output */ :"r" (value));
126 break;
127 case 2:
128 asm("movl %0,%%db2" : /* no output */ :"r" (value));
129 break;
130 case 3:
131 asm("movl %0,%%db3" : /* no output */ :"r" (value));
132 break;
133 case 6:
134 asm("movl %0,%%db6" : /* no output */ :"r" (value));
135 break;
136 case 7:
137 asm("movl %0,%%db7" : /* no output */ :"r" (value));
138 break;
139 default:
140 BUG();
141 }
142}
143
144void init_IRQ(void)
145{
146 paravirt_ops.init_IRQ();
147}
148
149static fastcall void native_clts(void)
150{
151 asm volatile ("clts");
152}
153
154static fastcall unsigned long native_read_cr0(void)
155{
156 unsigned long val;
157 asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
158 return val;
159}
160
161static fastcall void native_write_cr0(unsigned long val)
162{
163 asm volatile("movl %0,%%cr0": :"r" (val));
164}
165
166static fastcall unsigned long native_read_cr2(void)
167{
168 unsigned long val;
169 asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
170 return val;
171}
172
173static fastcall void native_write_cr2(unsigned long val)
174{
175 asm volatile("movl %0,%%cr2": :"r" (val));
176}
177
178static fastcall unsigned long native_read_cr3(void)
179{
180 unsigned long val;
181 asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
182 return val;
183}
184
185static fastcall void native_write_cr3(unsigned long val)
186{
187 asm volatile("movl %0,%%cr3": :"r" (val));
188}
189
190static fastcall unsigned long native_read_cr4(void)
191{
192 unsigned long val;
193 asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
194 return val;
195}
196
197static fastcall unsigned long native_read_cr4_safe(void)
198{
199 unsigned long val;
200 /* This could fault if %cr4 does not exist */
201 asm("1: movl %%cr4, %0 \n"
202 "2: \n"
203 ".section __ex_table,\"a\" \n"
204 ".long 1b,2b \n"
205 ".previous \n"
206 : "=r" (val): "0" (0));
207 return val;
208}
209
210static fastcall void native_write_cr4(unsigned long val)
211{
212 asm volatile("movl %0,%%cr4": :"r" (val));
213}
214
215static fastcall unsigned long native_save_fl(void)
216{
217 unsigned long f;
218 asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
219 return f;
220}
221
222static fastcall void native_restore_fl(unsigned long f)
223{
224 asm volatile("pushl %0 ; popfl": /* no output */
225 :"g" (f)
226 :"memory", "cc");
227}
228
229static fastcall void native_irq_disable(void)
230{
231 asm volatile("cli": : :"memory");
232}
233
234static fastcall void native_irq_enable(void)
235{
236 asm volatile("sti": : :"memory");
237}
238
239static fastcall void native_safe_halt(void)
240{
241 asm volatile("sti; hlt": : :"memory");
242}
243
244static fastcall void native_halt(void)
245{
246 asm volatile("hlt": : :"memory");
247}
248
249static fastcall void native_wbinvd(void)
250{
251 asm volatile("wbinvd": : :"memory");
252}
253
254static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
255{
256 unsigned long long val;
257
258 asm volatile("2: rdmsr ; xorl %0,%0\n"
259 "1:\n\t"
260 ".section .fixup,\"ax\"\n\t"
261 "3: movl %3,%0 ; jmp 1b\n\t"
262 ".previous\n\t"
263 ".section __ex_table,\"a\"\n"
264 " .align 4\n\t"
265 " .long 2b,3b\n\t"
266 ".previous"
267 : "=r" (*err), "=A" (val)
268 : "c" (msr), "i" (-EFAULT));
269
270 return val;
271}
272
273static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
274{
275 int err;
276 asm volatile("2: wrmsr ; xorl %0,%0\n"
277 "1:\n\t"
278 ".section .fixup,\"ax\"\n\t"
279 "3: movl %4,%0 ; jmp 1b\n\t"
280 ".previous\n\t"
281 ".section __ex_table,\"a\"\n"
282 " .align 4\n\t"
283 " .long 2b,3b\n\t"
284 ".previous"
285 : "=a" (err)
286 : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
287 "i" (-EFAULT));
288 return err;
289}
290
291static fastcall unsigned long long native_read_tsc(void)
292{
293 unsigned long long val;
294 asm volatile("rdtsc" : "=A" (val));
295 return val;
296}
297
298static fastcall unsigned long long native_read_pmc(void)
299{
300 unsigned long long val;
301 asm volatile("rdpmc" : "=A" (val));
302 return val;
303}
304
305static fastcall void native_load_tr_desc(void)
306{
307 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
308}
309
310static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr)
311{
312 asm volatile("lgdt %0"::"m" (*dtr));
313}
314
315static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr)
316{
317 asm volatile("lidt %0"::"m" (*dtr));
318}
319
320static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
321{
322 asm ("sgdt %0":"=m" (*dtr));
323}
324
325static fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
326{
327 asm ("sidt %0":"=m" (*dtr));
328}
329
330static fastcall unsigned long native_store_tr(void)
331{
332 unsigned long tr;
333 asm ("str %0":"=r" (tr));
334 return tr;
335}
336
337static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu)
338{
339#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
340 C(0); C(1); C(2);
341#undef C
342}
343
344static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
345{
346 u32 *lp = (u32 *)((char *)dt + entry*8);
347 lp[0] = entry_low;
348 lp[1] = entry_high;
349}
350
351static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
352{
353 native_write_dt_entry(dt, entrynum, low, high);
354}
355
356static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
357{
358 native_write_dt_entry(dt, entrynum, low, high);
359}
360
361static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
362{
363 native_write_dt_entry(dt, entrynum, low, high);
364}
365
366static fastcall void native_load_esp0(struct tss_struct *tss,
367 struct thread_struct *thread)
368{
369 tss->esp0 = thread->esp0;
370
371 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
372 if (unlikely(tss->ss1 != thread->sysenter_cs)) {
373 tss->ss1 = thread->sysenter_cs;
374 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
375 }
376}
377
378static fastcall void native_io_delay(void)
379{
380 asm volatile("outb %al,$0x80");
381}
382
383static fastcall void native_flush_tlb(void)
384{
385 __native_flush_tlb();
386}
387
388/*
389 * Global pages have to be flushed a bit differently. Not a real
390 * performance problem because this does not happen often.
391 */
392static fastcall void native_flush_tlb_global(void)
393{
394 __native_flush_tlb_global();
395}
396
397static fastcall void native_flush_tlb_single(u32 addr)
398{
399 __native_flush_tlb_single(addr);
400}
401
402#ifndef CONFIG_X86_PAE
403static fastcall void native_set_pte(pte_t *ptep, pte_t pteval)
404{
405 *ptep = pteval;
406}
407
408static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
409{
410 *ptep = pteval;
411}
412
413static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
414{
415 *pmdp = pmdval;
416}
417
418#else /* CONFIG_X86_PAE */
419
420static fastcall void native_set_pte(pte_t *ptep, pte_t pte)
421{
422 ptep->pte_high = pte.pte_high;
423 smp_wmb();
424 ptep->pte_low = pte.pte_low;
425}
426
427static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
428{
429 ptep->pte_high = pte.pte_high;
430 smp_wmb();
431 ptep->pte_low = pte.pte_low;
432}
433
434static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
435{
436 ptep->pte_low = 0;
437 smp_wmb();
438 ptep->pte_high = pte.pte_high;
439 smp_wmb();
440 ptep->pte_low = pte.pte_low;
441}
442
443static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
444{
445 set_64bit((unsigned long long *)ptep,pte_val(pteval));
446}
447
448static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
449{
450 set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
451}
452
453static fastcall void native_set_pud(pud_t *pudp, pud_t pudval)
454{
455 *pudp = pudval;
456}
457
458static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
459{
460 ptep->pte_low = 0;
461 smp_wmb();
462 ptep->pte_high = 0;
463}
464
465static fastcall void native_pmd_clear(pmd_t *pmd)
466{
467 u32 *tmp = (u32 *)pmd;
468 *tmp = 0;
469 smp_wmb();
470 *(tmp + 1) = 0;
471}
472#endif /* CONFIG_X86_PAE */
473
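A note on the PAE setters above: the present bit lives in the low word, so store order is what keeps a concurrent hardware page-table walk from ever seeing a half-written but present entry. Summarized (an editorial gloss on the code above, not new kernel text):

/*
 * set_pte:         write pte_high, smp_wmb(), then pte_low; the entry
 *                  only becomes present once both halves are consistent.
 * pte_clear:       clear pte_low, smp_wmb(), then pte_high; the entry
 *                  goes non-present before the high word is touched.
 * set_pte_atomic:  sidesteps ordering with set_64bit(), a single atomic
 *                  64-bit store (cmpxchg8b-based on i386).
 */

On UP builds smp_wmb() reduces to a compiler barrier, which still pins the two 32-bit stores in program order.
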
474/* These are in entry.S */
475extern fastcall void native_iret(void);
476extern fastcall void native_irq_enable_sysexit(void);
477
478static int __init print_banner(void)
479{
480 paravirt_ops.banner();
481 return 0;
482}
483core_initcall(print_banner);
484
485/* We simply declare start_kernel to be the paravirt probe of last resort. */
486paravirt_probe(start_kernel);
487
488struct paravirt_ops paravirt_ops = {
489 .name = "bare hardware",
490 .paravirt_enabled = 0,
491 .kernel_rpl = 0,
492
493 .patch = native_patch,
494 .banner = default_banner,
495 .arch_setup = native_nop,
496 .memory_setup = machine_specific_memory_setup,
497 .get_wallclock = native_get_wallclock,
498 .set_wallclock = native_set_wallclock,
499 .time_init = time_init_hook,
500 .init_IRQ = native_init_IRQ,
501
502 .cpuid = native_cpuid,
503 .get_debugreg = native_get_debugreg,
504 .set_debugreg = native_set_debugreg,
505 .clts = native_clts,
506 .read_cr0 = native_read_cr0,
507 .write_cr0 = native_write_cr0,
508 .read_cr2 = native_read_cr2,
509 .write_cr2 = native_write_cr2,
510 .read_cr3 = native_read_cr3,
511 .write_cr3 = native_write_cr3,
512 .read_cr4 = native_read_cr4,
513 .read_cr4_safe = native_read_cr4_safe,
514 .write_cr4 = native_write_cr4,
515 .save_fl = native_save_fl,
516 .restore_fl = native_restore_fl,
517 .irq_disable = native_irq_disable,
518 .irq_enable = native_irq_enable,
519 .safe_halt = native_safe_halt,
520 .halt = native_halt,
521 .wbinvd = native_wbinvd,
522 .read_msr = native_read_msr,
523 .write_msr = native_write_msr,
524 .read_tsc = native_read_tsc,
525 .read_pmc = native_read_pmc,
526 .load_tr_desc = native_load_tr_desc,
527 .set_ldt = native_set_ldt,
528 .load_gdt = native_load_gdt,
529 .load_idt = native_load_idt,
530 .store_gdt = native_store_gdt,
531 .store_idt = native_store_idt,
532 .store_tr = native_store_tr,
533 .load_tls = native_load_tls,
534 .write_ldt_entry = native_write_ldt_entry,
535 .write_gdt_entry = native_write_gdt_entry,
536 .write_idt_entry = native_write_idt_entry,
537 .load_esp0 = native_load_esp0,
538
539 .set_iopl_mask = native_set_iopl_mask,
540 .io_delay = native_io_delay,
541 .const_udelay = __const_udelay,
542
543#ifdef CONFIG_X86_LOCAL_APIC
544 .apic_write = native_apic_write,
545 .apic_write_atomic = native_apic_write_atomic,
546 .apic_read = native_apic_read,
547#endif
548
549 .flush_tlb_user = native_flush_tlb,
550 .flush_tlb_kernel = native_flush_tlb_global,
551 .flush_tlb_single = native_flush_tlb_single,
552
553 .set_pte = native_set_pte,
554 .set_pte_at = native_set_pte_at,
555 .set_pmd = native_set_pmd,
556 .pte_update = (void *)native_nop,
557 .pte_update_defer = (void *)native_nop,
558#ifdef CONFIG_X86_PAE
559 .set_pte_atomic = native_set_pte_atomic,
560 .set_pte_present = native_set_pte_present,
561 .set_pud = native_set_pud,
562 .pte_clear = native_pte_clear,
563 .pmd_clear = native_pmd_clear,
564#endif
565
566 .irq_enable_sysexit = native_irq_enable_sysexit,
567 .iret = native_iret,
568};
569EXPORT_SYMBOL(paravirt_ops);
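
Everything above is the native backend; a hypervisor port is expected to overwrite selected entries in paravirt_ops before the kernel proper starts. A hedged sketch of that pattern (the example_* names and hypercall details are placeholders, not part of this patch):

static fastcall void example_irq_disable(void);	/* hypercall-backed, assumed */
static fastcall void example_irq_enable(void);

static void __init example_guest_init(void)
{
	paravirt_ops.name = "example hypervisor";
	paravirt_ops.paravirt_enabled = 1;
	paravirt_ops.kernel_rpl = 1;		/* guest kernel runs in ring 1 */
	paravirt_ops.irq_disable = example_irq_disable;
	paravirt_ops.irq_enable = example_irq_enable;
	/* every op left untouched keeps its native_* implementation */
}
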
diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c
index 25fe66853934..41af692c1584 100644
--- a/arch/i386/kernel/pci-dma.c
+++ b/arch/i386/kernel/pci-dma.c
@@ -75,7 +75,7 @@ EXPORT_SYMBOL(dma_free_coherent);
75int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, 75int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
76 dma_addr_t device_addr, size_t size, int flags) 76 dma_addr_t device_addr, size_t size, int flags)
77{ 77{
78 void __iomem *mem_base; 78 void __iomem *mem_base = NULL;
79 int pages = size >> PAGE_SHIFT; 79 int pages = size >> PAGE_SHIFT;
80 int bitmap_size = (pages + 31)/32; 80 int bitmap_size = (pages + 31)/32;
81 81
@@ -92,14 +92,12 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
92 if (!mem_base) 92 if (!mem_base)
93 goto out; 93 goto out;
94 94
95 dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); 95 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
96 if (!dev->dma_mem) 96 if (!dev->dma_mem)
97 goto out; 97 goto out;
98 memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); 98 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
99 dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
100 if (!dev->dma_mem->bitmap) 99 if (!dev->dma_mem->bitmap)
101 goto free1_out; 100 goto free1_out;
102 memset(dev->dma_mem->bitmap, 0, bitmap_size);
103 101
104 dev->dma_mem->virt_base = mem_base; 102 dev->dma_mem->virt_base = mem_base;
105 dev->dma_mem->device_base = device_addr; 103 dev->dma_mem->device_base = device_addr;
@@ -114,6 +112,8 @@ int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
114 free1_out: 112 free1_out:
115 kfree(dev->dma_mem->bitmap); 113 kfree(dev->dma_mem->bitmap);
116 out: 114 out:
115 if (mem_base)
116 iounmap(mem_base);
117 return 0; 117 return 0;
118} 118}
119EXPORT_SYMBOL(dma_declare_coherent_memory); 119EXPORT_SYMBOL(dma_declare_coherent_memory);
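
The kzalloc() conversion above is purely a simplification: allocation and zeroing collapse into one call, and the new out: path now releases the earlier ioremap() on failure. The equivalence being relied on:

/* old style */
p = kmalloc(size, GFP_KERNEL);
if (p)
	memset(p, 0, size);

/* new style, identical result */
p = kzalloc(size, GFP_KERNEL);
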
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index dd53c58f64f1..c641056233a6 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -56,6 +56,7 @@
56 56
57#include <asm/tlbflush.h> 57#include <asm/tlbflush.h>
58#include <asm/cpu.h> 58#include <asm/cpu.h>
59#include <asm/pda.h>
59 60
60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
61 62
@@ -99,22 +100,23 @@ EXPORT_SYMBOL(enable_hlt);
99 */ 100 */
100void default_idle(void) 101void default_idle(void)
101{ 102{
102 local_irq_enable();
103
104 if (!hlt_counter && boot_cpu_data.hlt_works_ok) { 103 if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
105 current_thread_info()->status &= ~TS_POLLING; 104 current_thread_info()->status &= ~TS_POLLING;
106 smp_mb__after_clear_bit(); 105 /*
107 while (!need_resched()) { 106 * TS_POLLING-cleared state must be visible before we
108 local_irq_disable(); 107 * test NEED_RESCHED:
109 if (!need_resched()) 108 */
110 safe_halt(); 109 smp_mb();
111 else 110
112 local_irq_enable(); 111 local_irq_disable();
113 } 112 if (!need_resched())
113 safe_halt(); /* enables interrupts racelessly */
114 else
115 local_irq_enable();
114 current_thread_info()->status |= TS_POLLING; 116 current_thread_info()->status |= TS_POLLING;
115 } else { 117 } else {
116 while (!need_resched()) 118 /* loop is done by the caller */
117 cpu_relax(); 119 cpu_relax();
118 } 120 }
119} 121}
120#ifdef CONFIG_APM_MODULE 122#ifdef CONFIG_APM_MODULE
@@ -128,14 +130,7 @@ EXPORT_SYMBOL(default_idle);
128 */ 130 */
129static void poll_idle (void) 131static void poll_idle (void)
130{ 132{
131 local_irq_enable(); 133 cpu_relax();
132
133 asm volatile(
134 "2:"
135 "testl %0, %1;"
136 "rep; nop;"
137 "je 2b;"
138 : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
139} 134}
140 135
141#ifdef CONFIG_HOTPLUG_CPU 136#ifdef CONFIG_HOTPLUG_CPU
@@ -256,8 +251,7 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
256static void mwait_idle(void) 251static void mwait_idle(void)
257{ 252{
258 local_irq_enable(); 253 local_irq_enable();
259 while (!need_resched()) 254 mwait_idle_with_hints(0, 0);
260 mwait_idle_with_hints(0, 0);
261} 255}
262 256
263void __devinit select_idle_routine(const struct cpuinfo_x86 *c) 257void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
@@ -314,8 +308,8 @@ void show_regs(struct pt_regs * regs)
314 regs->eax,regs->ebx,regs->ecx,regs->edx); 308 regs->eax,regs->ebx,regs->ecx,regs->edx);
315 printk("ESI: %08lx EDI: %08lx EBP: %08lx", 309 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
316 regs->esi, regs->edi, regs->ebp); 310 regs->esi, regs->edi, regs->ebp);
317 printk(" DS: %04x ES: %04x\n", 311 printk(" DS: %04x ES: %04x GS: %04x\n",
318 0xffff & regs->xds,0xffff & regs->xes); 312 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs);
319 313
320 cr0 = read_cr0(); 314 cr0 = read_cr0();
321 cr2 = read_cr2(); 315 cr2 = read_cr2();
@@ -346,6 +340,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
346 340
347 regs.xds = __USER_DS; 341 regs.xds = __USER_DS;
348 regs.xes = __USER_DS; 342 regs.xes = __USER_DS;
343 regs.xgs = __KERNEL_PDA;
349 regs.orig_eax = -1; 344 regs.orig_eax = -1;
350 regs.eip = (unsigned long) kernel_thread_helper; 345 regs.eip = (unsigned long) kernel_thread_helper;
351 regs.xcs = __KERNEL_CS | get_kernel_rpl(); 346 regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -431,7 +426,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
431 p->thread.eip = (unsigned long) ret_from_fork; 426 p->thread.eip = (unsigned long) ret_from_fork;
432 427
433 savesegment(fs,p->thread.fs); 428 savesegment(fs,p->thread.fs);
434 savesegment(gs,p->thread.gs);
435 429
436 tsk = current; 430 tsk = current;
437 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 431 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -508,7 +502,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
508 dump->regs.ds = regs->xds; 502 dump->regs.ds = regs->xds;
509 dump->regs.es = regs->xes; 503 dump->regs.es = regs->xes;
510 savesegment(fs,dump->regs.fs); 504 savesegment(fs,dump->regs.fs);
511 savesegment(gs,dump->regs.gs); 505 dump->regs.gs = regs->xgs;
512 dump->regs.orig_eax = regs->orig_eax; 506 dump->regs.orig_eax = regs->orig_eax;
513 dump->regs.eip = regs->eip; 507 dump->regs.eip = regs->eip;
514 dump->regs.cs = regs->xcs; 508 dump->regs.cs = regs->xcs;
@@ -648,22 +642,27 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
648 642
649 __unlazy_fpu(prev_p); 643 __unlazy_fpu(prev_p);
650 644
645
646 /* we're going to use this soon, after a few expensive things */
647 if (next_p->fpu_counter > 5)
648 prefetch(&next->i387.fxsave);
649
651 /* 650 /*
652 * Reload esp0. 651 * Reload esp0.
653 */ 652 */
654 load_esp0(tss, next); 653 load_esp0(tss, next);
655 654
656 /* 655 /*
657 * Save away %fs and %gs. No need to save %es and %ds, as 656 * Save away %fs. No need to save %gs, as it was saved on the
658 * those are always kernel segments while inside the kernel. 657 * stack on entry. No need to save %es and %ds, as those are
659 * Doing this before setting the new TLS descriptors avoids 658 * always kernel segments while inside the kernel. Doing this
660 * the situation where we temporarily have non-reloadable 659 * before setting the new TLS descriptors avoids the situation
661 * segments in %fs and %gs. This could be an issue if the 660 * where we temporarily have non-reloadable segments in %fs
662 * NMI handler ever used %fs or %gs (it does not today), or 661 * and %gs. This could be an issue if the NMI handler ever
663 * if the kernel is running inside of a hypervisor layer. 662 * used %fs or %gs (it does not today), or if the kernel is
663 * running inside of a hypervisor layer.
664 */ 664 */
665 savesegment(fs, prev->fs); 665 savesegment(fs, prev->fs);
666 savesegment(gs, prev->gs);
667 666
668 /* 667 /*
669 * Load the per-thread Thread-Local Storage descriptor. 668 * Load the per-thread Thread-Local Storage descriptor.
@@ -671,22 +670,14 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
671 load_TLS(next, cpu); 670 load_TLS(next, cpu);
672 671
673 /* 672 /*
674 * Restore %fs and %gs if needed. 673 * Restore %fs if needed.
675 * 674 *
676 * Glibc normally makes %fs be zero, and %gs is one of 675 * Glibc normally makes %fs be zero.
677 * the TLS segments.
678 */ 676 */
679 if (unlikely(prev->fs | next->fs)) 677 if (unlikely(prev->fs | next->fs))
680 loadsegment(fs, next->fs); 678 loadsegment(fs, next->fs);
681 679
682 if (prev->gs | next->gs) 680 write_pda(pcurrent, next_p);
683 loadsegment(gs, next->gs);
684
685 /*
686 * Restore IOPL if needed.
687 */
688 if (unlikely(prev->iopl != next->iopl))
689 set_iopl_mask(next->iopl);
690 681
691 /* 682 /*
692 * Now maybe handle debug registers and/or IO bitmaps 683 * Now maybe handle debug registers and/or IO bitmaps
@@ -697,6 +688,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
697 688
698 disable_tsc(prev_p, next_p); 689 disable_tsc(prev_p, next_p);
699 690
691 /* If the task has used fpu the last 5 timeslices, just do a full
692 * restore of the math state immediately to avoid the trap; the
693 * chances of needing FPU soon are obviously high now
694 */
695 if (next_p->fpu_counter > 5)
696 math_state_restore();
697
700 return prev_p; 698 return prev_p;
701} 699}
702 700
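The default_idle() rework above leans on a store/load ordering pact with the wakeup path: the idle CPU clears TS_POLLING and only then tests need_resched(), with a full barrier between, while the waker sets the resched flag and only then checks TS_POLLING to decide whether an IPI is needed. Sketched side by side (the waker column paraphrases the scheduler; it is not code from this patch):

/*
 *   idle CPU (this patch)              waker (scheduler, paraphrased)
 *   ---------------------              ------------------------------
 *   status &= ~TS_POLLING;             set need_resched flag
 *   smp_mb();                          smp_mb();
 *   if (!need_resched())               if (target still TS_POLLING)
 *           safe_halt();                       skip the IPI;
 *   else                               else
 *           local_irq_enable();                send reschedule IPI;
 *
 * Whichever side runs second is guaranteed to see the other's store,
 * so the CPU can never halt while a wakeup silently falls through.
 */
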
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 775f50e9395b..af8aabe85800 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -45,7 +45,7 @@
45/* 45/*
46 * Offset of eflags on child stack.. 46 * Offset of eflags on child stack..
47 */ 47 */
48#define EFL_OFFSET ((EFL-2)*4-sizeof(struct pt_regs)) 48#define EFL_OFFSET offsetof(struct pt_regs, eflags)
49 49
50static inline struct pt_regs *get_child_regs(struct task_struct *task) 50static inline struct pt_regs *get_child_regs(struct task_struct *task)
51{ 51{
@@ -54,24 +54,24 @@ static inline struct pt_regs *get_child_regs(struct task_struct *task)
54} 54}
55 55
56/* 56/*
57 * this routine will get a word off of the processes privileged stack. 57 * This routine will get a word off of the process's privileged stack.
58 * the offset is how far from the base addr as stored in the TSS. 58 * The offset is bytes into the pt_regs structure on the stack.
59 * this routine assumes that all the privileged stacks are in our 59 * This routine assumes that all the privileged stacks are in our
60 * data space. 60 * data space.
61 */ 61 */
62static inline int get_stack_long(struct task_struct *task, int offset) 62static inline int get_stack_long(struct task_struct *task, int offset)
63{ 63{
64 unsigned char *stack; 64 unsigned char *stack;
65 65
66 stack = (unsigned char *)task->thread.esp0; 66 stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
67 stack += offset; 67 stack += offset;
68 return (*((int *)stack)); 68 return (*((int *)stack));
69} 69}
70 70
71/* 71/*
72 * this routine will put a word on the processes privileged stack. 72 * This routine will put a word on the process's privileged stack.
73 * the offset is how far from the base addr as stored in the TSS. 73 * The offset is bytes into the pt_regs structure on the stack.
74 * this routine assumes that all the privileged stacks are in our 74 * This routine assumes that all the privileged stacks are in our
75 * data space. 75 * data space.
76 */ 76 */
77static inline int put_stack_long(struct task_struct *task, int offset, 77static inline int put_stack_long(struct task_struct *task, int offset,
@@ -79,7 +79,7 @@ static inline int put_stack_long(struct task_struct *task, int offset,
79{ 79{
80 unsigned char * stack; 80 unsigned char * stack;
81 81
82 stack = (unsigned char *) task->thread.esp0; 82 stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs);
83 stack += offset; 83 stack += offset;
84 *(unsigned long *) stack = data; 84 *(unsigned long *) stack = data;
85 return 0; 85 return 0;
@@ -94,13 +94,9 @@ static int putreg(struct task_struct *child,
94 return -EIO; 94 return -EIO;
95 child->thread.fs = value; 95 child->thread.fs = value;
96 return 0; 96 return 0;
97 case GS:
98 if (value && (value & 3) != 3)
99 return -EIO;
100 child->thread.gs = value;
101 return 0;
102 case DS: 97 case DS:
103 case ES: 98 case ES:
99 case GS:
104 if (value && (value & 3) != 3) 100 if (value && (value & 3) != 3)
105 return -EIO; 101 return -EIO;
106 value &= 0xffff; 102 value &= 0xffff;
@@ -116,9 +112,9 @@ static int putreg(struct task_struct *child,
116 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; 112 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
117 break; 113 break;
118 } 114 }
119 if (regno > GS*4) 115 if (regno > ES*4)
120 regno -= 2*4; 116 regno -= 1*4;
121 put_stack_long(child, regno - sizeof(struct pt_regs), value); 117 put_stack_long(child, regno, value);
122 return 0; 118 return 0;
123} 119}
124 120
@@ -131,19 +127,16 @@ static unsigned long getreg(struct task_struct *child,
131 case FS: 127 case FS:
132 retval = child->thread.fs; 128 retval = child->thread.fs;
133 break; 129 break;
134 case GS:
135 retval = child->thread.gs;
136 break;
137 case DS: 130 case DS:
138 case ES: 131 case ES:
132 case GS:
139 case SS: 133 case SS:
140 case CS: 134 case CS:
141 retval = 0xffff; 135 retval = 0xffff;
142 /* fall through */ 136 /* fall through */
143 default: 137 default:
144 if (regno > GS*4) 138 if (regno > ES*4)
145 regno -= 2*4; 139 regno -= 1*4;
146 regno = regno - sizeof(struct pt_regs);
147 retval &= get_stack_long(child, regno); 140 retval &= get_stack_long(child, regno);
148 } 141 }
149 return retval; 142 return retval;
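
The ptrace changes above re-express register offsets as byte offsets into struct pt_regs: thread.esp0 points just past the saved frame, so the frame base is esp0 minus sizeof(struct pt_regs), and EFL_OFFSET reduces to a plain offsetof(). A sketch of the resulting arithmetic:

/* Fetch the traced task's saved eflags using the new convention. */
static long example_get_eflags(struct task_struct *task)
{
	char *frame = (char *)task->thread.esp0 - sizeof(struct pt_regs);

	return *(long *)(frame + offsetof(struct pt_regs, eflags));
}
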
diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c
index 9f6ab1789bb0..34874c398b44 100644
--- a/arch/i386/kernel/quirks.c
+++ b/arch/i386/kernel/quirks.c
@@ -3,10 +3,12 @@
3 */ 3 */
4#include <linux/pci.h> 4#include <linux/pci.h>
5#include <linux/irq.h> 5#include <linux/irq.h>
6#include <asm/pci-direct.h>
7#include <asm/genapic.h>
8#include <asm/cpu.h>
6 9
7#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) 10#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
8 11static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev)
9static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
10{ 12{
11 u8 config, rev; 13 u8 config, rev;
12 u32 word; 14 u32 word;
@@ -14,14 +16,12 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
14 /* BIOS may enable hardware IRQ balancing for 16 /* BIOS may enable hardware IRQ balancing for
15 * E7520/E7320/E7525 (revision ID 0x9 and below) 17 * E7520/E7320/E7525 (revision ID 0x9 and below)
16 * based platforms. 18 * based platforms.
17 * Disable SW irqbalance/affinity on those platforms. 19 * For those platforms, make sure that the genapic is set to 'flat'
18 */ 20 */
19 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); 21 pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
20 if (rev > 0x9) 22 if (rev > 0x9)
21 return; 23 return;
22 24
23 printk(KERN_INFO "Intel E7520/7320/7525 detected.");
24
25 /* enable access to config space*/ 25 /* enable access to config space*/
26 pci_read_config_byte(dev, 0xf4, &config); 26 pci_read_config_byte(dev, 0xf4, &config);
27 pci_write_config_byte(dev, 0xf4, config|0x2); 27 pci_write_config_byte(dev, 0xf4, config|0x2);
@@ -30,6 +30,44 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); 30 raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
31 31
32 if (!(word & (1 << 13))) { 32 if (!(word & (1 << 13))) {
33#ifdef CONFIG_X86_64
34 if (genapic != &apic_flat)
35 panic("APIC mode must be flat on this system\n");
36#elif defined(CONFIG_X86_GENERICARCH)
37 if (genapic != &apic_default)
38 panic("APIC mode must be default(flat) on this system. Use apic=default\n");
39#endif
40 }
41
42 /* put back the original value for config space*/
43 if (!(config & 0x2))
44 pci_write_config_byte(dev, 0xf4, config);
45}
46
47void __init quirk_intel_irqbalance(void)
48{
49 u8 config, rev;
50 u32 word;
51
52 /* BIOS may enable hardware IRQ balancing for
53 * E7520/E7320/E7525 (revision ID 0x9 and below)
54 * based platforms.
55 * Disable SW irqbalance/affinity on those platforms.
56 */
57 rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION);
58 if (rev > 0x9)
59 return;
60
61 printk(KERN_INFO "Intel E7520/7320/7525 detected.");
62
63 /* enable access to config space */
64 config = read_pci_config_byte(0, 0, 0, 0xf4);
65 write_pci_config_byte(0, 0, 0, 0xf4, config|0x2);
66
67 /* read xTPR register */
68 word = read_pci_config_16(0, 0, 0x40, 0x4c);
69
70 if (!(word & (1 << 13))) {
33 printk(KERN_INFO "Disabling irq balancing and affinity\n"); 71 printk(KERN_INFO "Disabling irq balancing and affinity\n");
34#ifdef CONFIG_IRQBALANCE 72#ifdef CONFIG_IRQBALANCE
35 irqbalance_disable(""); 73 irqbalance_disable("");
@@ -38,13 +76,24 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
38#ifdef CONFIG_PROC_FS 76#ifdef CONFIG_PROC_FS
39 no_irq_affinity = 1; 77 no_irq_affinity = 1;
40#endif 78#endif
79#ifdef CONFIG_HOTPLUG_CPU
80 printk(KERN_INFO "Disabling cpu hotplug control\n");
81 enable_cpu_hotplug = 0;
82#endif
83#ifdef CONFIG_X86_64
84 /* force the genapic selection to flat mode so that
85 * interrupts can be redirected to more than one CPU.
86 */
87 genapic_force = &apic_flat;
88#endif
41 } 89 }
42 90
43 /* put back the original value for config space*/ 91 /* put back the original value for config space */
44 if (!(config & 0x2)) 92 if (!(config & 0x2))
45 pci_write_config_byte(dev, 0xf4, config); 93 write_pci_config_byte(0, 0, 0, 0xf4, config);
46} 94}
47DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); 95DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance);
48DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); 96DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance);
49DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); 97DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance);
98
50#endif 99#endif
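
The quirk is now split in two: quirk_intel_irqbalance() uses the raw read_pci_config_*() accessors so it can run before the PCI core comes up (early enough to steer genapic selection), while verify_quirk_intel_irqbalance() stays a PCI fixup that merely re-checks and panics on an inconsistent APIC mode. A hedged sketch of the early call site; the hook name and the vendor probe are assumptions about the caller, not code from this patch:

void __init example_early_quirks(void)
{
	/* bus 0, slot 0, func 0 is the MCH on E7520/E7320/E7525 boards */
	if (read_pci_config_16(0, 0, 0, PCI_VENDOR_ID) == PCI_VENDOR_ID_INTEL)
		quirk_intel_irqbalance();
}
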
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index 84278e0093a2..3514b4153f7f 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -12,6 +12,7 @@
12#include <linux/dmi.h> 12#include <linux/dmi.h>
13#include <linux/ctype.h> 13#include <linux/ctype.h>
14#include <linux/pm.h> 14#include <linux/pm.h>
15#include <linux/reboot.h>
15#include <asm/uaccess.h> 16#include <asm/uaccess.h>
16#include <asm/apic.h> 17#include <asm/apic.h>
17#include <asm/desc.h> 18#include <asm/desc.h>
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 141041dde74d..4b31ad70c1ac 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -63,9 +63,6 @@
63#include <setup_arch.h> 63#include <setup_arch.h>
64#include <bios_ebda.h> 64#include <bios_ebda.h>
65 65
66/* Forward Declaration. */
67void __init find_max_pfn(void);
68
69/* This value is set up by the early boot code to point to the value 66/* This value is set up by the early boot code to point to the value
70 immediately after the boot time page tables. It contains a *physical* 67 immediately after the boot time page tables. It contains a *physical*
71 address, and must not be in the .bss segment! */ 68 address, and must not be in the .bss segment! */
@@ -76,14 +73,11 @@ int disable_pse __devinitdata = 0;
76/* 73/*
77 * Machine setup.. 74 * Machine setup..
78 */ 75 */
79 76extern struct resource code_resource;
80#ifdef CONFIG_EFI 77extern struct resource data_resource;
81int efi_enabled = 0;
82EXPORT_SYMBOL(efi_enabled);
83#endif
84 78
85/* cpu data as detected by the assembly code in head.S */ 79/* cpu data as detected by the assembly code in head.S */
86struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; 80struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
87/* common cpu data for all cpus */ 81/* common cpu data for all cpus */
88struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; 82struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
89EXPORT_SYMBOL(boot_cpu_data); 83EXPORT_SYMBOL(boot_cpu_data);
@@ -99,12 +93,6 @@ unsigned int machine_submodel_id;
99unsigned int BIOS_revision; 93unsigned int BIOS_revision;
100unsigned int mca_pentium_flag; 94unsigned int mca_pentium_flag;
101 95
102/* For PCI or other memory-mapped resources */
103unsigned long pci_mem_start = 0x10000000;
104#ifdef CONFIG_PCI
105EXPORT_SYMBOL(pci_mem_start);
106#endif
107
108/* Boot loader ID as an integer, for the benefit of proc_dointvec */ 96/* Boot loader ID as an integer, for the benefit of proc_dointvec */
109int bootloader_type; 97int bootloader_type;
110 98
@@ -134,7 +122,6 @@ struct ist_info ist_info;
134 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) 122 defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
135EXPORT_SYMBOL(ist_info); 123EXPORT_SYMBOL(ist_info);
136#endif 124#endif
137struct e820map e820;
138 125
139extern void early_cpu_init(void); 126extern void early_cpu_init(void);
140extern int root_mountflags; 127extern int root_mountflags;
@@ -149,516 +136,6 @@ static char command_line[COMMAND_LINE_SIZE];
149 136
150unsigned char __initdata boot_params[PARAM_SIZE]; 137unsigned char __initdata boot_params[PARAM_SIZE];
151 138
152static struct resource data_resource = {
153 .name = "Kernel data",
154 .start = 0,
155 .end = 0,
156 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
157};
158
159static struct resource code_resource = {
160 .name = "Kernel code",
161 .start = 0,
162 .end = 0,
163 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
164};
165
166static struct resource system_rom_resource = {
167 .name = "System ROM",
168 .start = 0xf0000,
169 .end = 0xfffff,
170 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
171};
172
173static struct resource extension_rom_resource = {
174 .name = "Extension ROM",
175 .start = 0xe0000,
176 .end = 0xeffff,
177 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
178};
179
180static struct resource adapter_rom_resources[] = { {
181 .name = "Adapter ROM",
182 .start = 0xc8000,
183 .end = 0,
184 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
185}, {
186 .name = "Adapter ROM",
187 .start = 0,
188 .end = 0,
189 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
190}, {
191 .name = "Adapter ROM",
192 .start = 0,
193 .end = 0,
194 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
195}, {
196 .name = "Adapter ROM",
197 .start = 0,
198 .end = 0,
199 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
200}, {
201 .name = "Adapter ROM",
202 .start = 0,
203 .end = 0,
204 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
205}, {
206 .name = "Adapter ROM",
207 .start = 0,
208 .end = 0,
209 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
210} };
211
212static struct resource video_rom_resource = {
213 .name = "Video ROM",
214 .start = 0xc0000,
215 .end = 0xc7fff,
216 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
217};
218
219static struct resource video_ram_resource = {
220 .name = "Video RAM area",
221 .start = 0xa0000,
222 .end = 0xbffff,
223 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
224};
225
226static struct resource standard_io_resources[] = { {
227 .name = "dma1",
228 .start = 0x0000,
229 .end = 0x001f,
230 .flags = IORESOURCE_BUSY | IORESOURCE_IO
231}, {
232 .name = "pic1",
233 .start = 0x0020,
234 .end = 0x0021,
235 .flags = IORESOURCE_BUSY | IORESOURCE_IO
236}, {
237 .name = "timer0",
238 .start = 0x0040,
239 .end = 0x0043,
240 .flags = IORESOURCE_BUSY | IORESOURCE_IO
241}, {
242 .name = "timer1",
243 .start = 0x0050,
244 .end = 0x0053,
245 .flags = IORESOURCE_BUSY | IORESOURCE_IO
246}, {
247 .name = "keyboard",
248 .start = 0x0060,
249 .end = 0x006f,
250 .flags = IORESOURCE_BUSY | IORESOURCE_IO
251}, {
252 .name = "dma page reg",
253 .start = 0x0080,
254 .end = 0x008f,
255 .flags = IORESOURCE_BUSY | IORESOURCE_IO
256}, {
257 .name = "pic2",
258 .start = 0x00a0,
259 .end = 0x00a1,
260 .flags = IORESOURCE_BUSY | IORESOURCE_IO
261}, {
262 .name = "dma2",
263 .start = 0x00c0,
264 .end = 0x00df,
265 .flags = IORESOURCE_BUSY | IORESOURCE_IO
266}, {
267 .name = "fpu",
268 .start = 0x00f0,
269 .end = 0x00ff,
270 .flags = IORESOURCE_BUSY | IORESOURCE_IO
271} };
272
273#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
274
275static int __init romchecksum(unsigned char *rom, unsigned long length)
276{
277 unsigned char *p, sum = 0;
278
279 for (p = rom; p < rom + length; p++)
280 sum += *p;
281 return sum == 0;
282}
283
284static void __init probe_roms(void)
285{
286 unsigned long start, length, upper;
287 unsigned char *rom;
288 int i;
289
290 /* video rom */
291 upper = adapter_rom_resources[0].start;
292 for (start = video_rom_resource.start; start < upper; start += 2048) {
293 rom = isa_bus_to_virt(start);
294 if (!romsignature(rom))
295 continue;
296
297 video_rom_resource.start = start;
298
299 /* 0 < length <= 0x7f * 512, historically */
300 length = rom[2] * 512;
301
302 /* if checksum okay, trust length byte */
303 if (length && romchecksum(rom, length))
304 video_rom_resource.end = start + length - 1;
305
306 request_resource(&iomem_resource, &video_rom_resource);
307 break;
308 }
309
310 start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
311 if (start < upper)
312 start = upper;
313
314 /* system rom */
315 request_resource(&iomem_resource, &system_rom_resource);
316 upper = system_rom_resource.start;
317
318 /* check for extension rom (ignore length byte!) */
319 rom = isa_bus_to_virt(extension_rom_resource.start);
320 if (romsignature(rom)) {
321 length = extension_rom_resource.end - extension_rom_resource.start + 1;
322 if (romchecksum(rom, length)) {
323 request_resource(&iomem_resource, &extension_rom_resource);
324 upper = extension_rom_resource.start;
325 }
326 }
327
328 /* check for adapter roms on 2k boundaries */
329 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) {
330 rom = isa_bus_to_virt(start);
331 if (!romsignature(rom))
332 continue;
333
334 /* 0 < length <= 0x7f * 512, historically */
335 length = rom[2] * 512;
336
337 /* but accept any length that fits if checksum okay */
338 if (!length || start + length > upper || !romchecksum(rom, length))
339 continue;
340
341 adapter_rom_resources[i].start = start;
342 adapter_rom_resources[i].end = start + length - 1;
343 request_resource(&iomem_resource, &adapter_rom_resources[i]);
344
345 start = adapter_rom_resources[i++].end & ~2047UL;
346 }
347}
348
349static void __init limit_regions(unsigned long long size)
350{
351 unsigned long long current_addr = 0;
352 int i;
353
354 if (efi_enabled) {
355 efi_memory_desc_t *md;
356 void *p;
357
358 for (p = memmap.map, i = 0; p < memmap.map_end;
359 p += memmap.desc_size, i++) {
360 md = p;
361 current_addr = md->phys_addr + (md->num_pages << 12);
362 if (md->type == EFI_CONVENTIONAL_MEMORY) {
363 if (current_addr >= size) {
364 md->num_pages -=
365 (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
366 memmap.nr_map = i + 1;
367 return;
368 }
369 }
370 }
371 }
372 for (i = 0; i < e820.nr_map; i++) {
373 current_addr = e820.map[i].addr + e820.map[i].size;
374 if (current_addr < size)
375 continue;
376
377 if (e820.map[i].type != E820_RAM)
378 continue;
379
380 if (e820.map[i].addr >= size) {
381 /*
382 * This region starts past the end of the
383 * requested size, skip it completely.
384 */
385 e820.nr_map = i;
386 } else {
387 e820.nr_map = i + 1;
388 e820.map[i].size -= current_addr - size;
389 }
390 return;
391 }
392}
393
394void __init add_memory_region(unsigned long long start,
395 unsigned long long size, int type)
396{
397 int x;
398
399 if (!efi_enabled) {
400 x = e820.nr_map;
401
402 if (x == E820MAX) {
403 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
404 return;
405 }
406
407 e820.map[x].addr = start;
408 e820.map[x].size = size;
409 e820.map[x].type = type;
410 e820.nr_map++;
411 }
412} /* add_memory_region */
413
414#define E820_DEBUG 1
415
416static void __init print_memory_map(char *who)
417{
418 int i;
419
420 for (i = 0; i < e820.nr_map; i++) {
421 printk(" %s: %016Lx - %016Lx ", who,
422 e820.map[i].addr,
423 e820.map[i].addr + e820.map[i].size);
424 switch (e820.map[i].type) {
425 case E820_RAM: printk("(usable)\n");
426 break;
427 case E820_RESERVED:
428 printk("(reserved)\n");
429 break;
430 case E820_ACPI:
431 printk("(ACPI data)\n");
432 break;
433 case E820_NVS:
434 printk("(ACPI NVS)\n");
435 break;
436 default: printk("type %lu\n", e820.map[i].type);
437 break;
438 }
439 }
440}
441
442/*
443 * Sanitize the BIOS e820 map.
444 *
445 * Some e820 responses include overlapping entries. The following
446 * replaces the original e820 map with a new one, removing overlaps.
447 *
448 */
449struct change_member {
450 struct e820entry *pbios; /* pointer to original bios entry */
451 unsigned long long addr; /* address for this change point */
452};
453static struct change_member change_point_list[2*E820MAX] __initdata;
454static struct change_member *change_point[2*E820MAX] __initdata;
455static struct e820entry *overlap_list[E820MAX] __initdata;
456static struct e820entry new_bios[E820MAX] __initdata;
457
458int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
459{
460 struct change_member *change_tmp;
461 unsigned long current_type, last_type;
462 unsigned long long last_addr;
463 int chgidx, still_changing;
464 int overlap_entries;
465 int new_bios_entry;
466 int old_nr, new_nr, chg_nr;
467 int i;
468
469 /*
470 Visually we're performing the following (1,2,3,4 = memory types)...
471
472 Sample memory map (w/overlaps):
473 ____22__________________
474 ______________________4_
475 ____1111________________
476 _44_____________________
477 11111111________________
478 ____________________33__
479 ___________44___________
480 __________33333_________
481 ______________22________
482 ___________________2222_
483 _________111111111______
484 _____________________11_
485 _________________4______
486
487 Sanitized equivalent (no overlap):
488 1_______________________
489 _44_____________________
490 ___1____________________
491 ____22__________________
492 ______11________________
493 _________1______________
494 __________3_____________
495 ___________44___________
496 _____________33_________
497 _______________2________
498 ________________1_______
499 _________________4______
500 ___________________2____
501 ____________________33__
502 ______________________4_
503 */
504
505 /* if there's only one memory region, don't bother */
506 if (*pnr_map < 2)
507 return -1;
508
509 old_nr = *pnr_map;
510
511 /* bail out if we find any unreasonable addresses in bios map */
512 for (i=0; i<old_nr; i++)
513 if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
514 return -1;
515
516 /* create pointers for initial change-point information (for sorting) */
517 for (i=0; i < 2*old_nr; i++)
518 change_point[i] = &change_point_list[i];
519
520 /* record all known change-points (starting and ending addresses),
521 omitting those that are for empty memory regions */
522 chgidx = 0;
523 for (i=0; i < old_nr; i++) {
524 if (biosmap[i].size != 0) {
525 change_point[chgidx]->addr = biosmap[i].addr;
526 change_point[chgidx++]->pbios = &biosmap[i];
527 change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
528 change_point[chgidx++]->pbios = &biosmap[i];
529 }
530 }
531 chg_nr = chgidx; /* true number of change-points */
532
533 /* sort change-point list by memory addresses (low -> high) */
534 still_changing = 1;
535 while (still_changing) {
536 still_changing = 0;
537 for (i=1; i < chg_nr; i++) {
538 /* if <current_addr> > <last_addr>, swap */
539 /* or, if current=<start_addr> & last=<end_addr>, swap */
540 if ((change_point[i]->addr < change_point[i-1]->addr) ||
541 ((change_point[i]->addr == change_point[i-1]->addr) &&
542 (change_point[i]->addr == change_point[i]->pbios->addr) &&
543 (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
544 )
545 {
546 change_tmp = change_point[i];
547 change_point[i] = change_point[i-1];
548 change_point[i-1] = change_tmp;
549 still_changing=1;
550 }
551 }
552 }
553
554 /* create a new bios memory map, removing overlaps */
555 overlap_entries=0; /* number of entries in the overlap table */
556 new_bios_entry=0; /* index for creating new bios map entries */
557 last_type = 0; /* start with undefined memory type */
558 last_addr = 0; /* start with 0 as last starting address */
559 /* loop through change-points, determining effect on the new bios map */
560 for (chgidx=0; chgidx < chg_nr; chgidx++)
561 {
562 /* keep track of all overlapping bios entries */
563 if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
564 {
565 /* add map entry to overlap list (> 1 entry implies an overlap) */
566 overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
567 }
568 else
569 {
570 /* remove entry from list (order independent, so swap with last) */
571 for (i=0; i<overlap_entries; i++)
572 {
573 if (overlap_list[i] == change_point[chgidx]->pbios)
574 overlap_list[i] = overlap_list[overlap_entries-1];
575 }
576 overlap_entries--;
577 }
578 /* if there are overlapping entries, decide which "type" to use */
579 /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
580 current_type = 0;
581 for (i=0; i<overlap_entries; i++)
582 if (overlap_list[i]->type > current_type)
583 current_type = overlap_list[i]->type;
584 /* continue building up new bios map based on this information */
585 if (current_type != last_type) {
586 if (last_type != 0) {
587 new_bios[new_bios_entry].size =
588 change_point[chgidx]->addr - last_addr;
589 /* move forward only if the new size was non-zero */
590 if (new_bios[new_bios_entry].size != 0)
591 if (++new_bios_entry >= E820MAX)
592 break; /* no more space left for new bios entries */
593 }
594 if (current_type != 0) {
595 new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
596 new_bios[new_bios_entry].type = current_type;
597 last_addr=change_point[chgidx]->addr;
598 }
599 last_type = current_type;
600 }
601 }
602 new_nr = new_bios_entry; /* retain count for new bios entries */
603
604 /* copy new bios mapping into original location */
605 memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
606 *pnr_map = new_nr;
607
608 return 0;
609}
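
A minimal caller sketch for the routine above (entry values are hypothetical; the count is passed by reference, historically as a char * in this file):

	struct e820entry map[E820MAX];
	char nr = 3;

	map[0].addr = 0x0ULL;     map[0].size = 0xa0000ULL; map[0].type = E820_RAM;
	map[1].addr = 0x9f000ULL; map[1].size = 0x1000ULL;  map[1].type = E820_RESERVED;
	map[2].addr = 0x9e000ULL; map[2].size = 0x2000ULL;  map[2].type = E820_RAM;

	if (sanitize_e820_map(map, &nr) == 0)
		;	/* nr == 2: RAM 0-0x9f000, then RESERVED 0x9f000-0xa0000,
			   since the higher type wins where the regions overlapped */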
610
611/*
612 * Copy the BIOS e820 map into a safe place.
613 *
614 * Sanity-check it while we're at it..
615 *
616 * If we're lucky and live on a modern system, the setup code
617 * will have given us a memory map that we can use to properly
618 * set up memory. If we aren't, we'll fake a memory map.
619 *
620 * We check to see that the memory map contains at least 2 elements
621 * before we'll use it, because the detection code in setup.S may
622 * not be perfect and almost every PC known to man has two memory
623 * regions: one from 0 to 640k, and one from 1mb up. (The IBM
624 * thinkpad 560x, for example, does not cooperate with the memory
625 * detection code.)
626 */
627int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
628{
629 /* Only one memory region (or negative)? Ignore it */
630 if (nr_map < 2)
631 return -1;
632
633 do {
634 unsigned long long start = biosmap->addr;
635 unsigned long long size = biosmap->size;
636 unsigned long long end = start + size;
637 unsigned long type = biosmap->type;
638
639 /* Overflow in 64 bits? Ignore the memory map. */
640 if (start > end)
641 return -1;
642
643 /*
644 * Some BIOSes claim RAM in the 640k - 1M region.
645 * Not right. Fix it up.
646 */
647 if (type == E820_RAM) {
648 if (start < 0x100000ULL && end > 0xA0000ULL) {
649 if (start < 0xA0000ULL)
650 add_memory_region(start, 0xA0000ULL-start, type);
651 if (end <= 0x100000ULL)
652 continue;
653 start = 0x100000ULL;
654 size = end - start;
655 }
656 }
657 add_memory_region(start, size, type);
658 } while (biosmap++,--nr_map);
659 return 0;
660}
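
As a worked example of the 640k-1M fixup: a hypothetical BIOS entry claiming RAM from 0x90000 to 0x200000 is registered as two regions, with the legacy VGA/BIOS hole dropped:

	/* BIOS claimed: addr = 0x90000, size = 0x170000, type = E820_RAM */
	add_memory_region(0x90000ULL,  0x10000ULL,  E820_RAM); /* up to 0xa0000 */
	/* 0xa0000 - 0xfffff is skipped entirely */
	add_memory_region(0x100000ULL, 0x100000ULL, E820_RAM); /* 1MB to 2MB */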
661
662#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 139#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
663struct edd edd; 140struct edd edd;
664#ifdef CONFIG_EDD_MODULE 141#ifdef CONFIG_EDD_MODULE
@@ -682,7 +159,7 @@ static inline void copy_edd(void)
682} 159}
683#endif 160#endif
684 161
685static int __initdata user_defined_memmap = 0; 162int __initdata user_defined_memmap = 0;
686 163
687/* 164/*
688 * "mem=nopentium" disables the 4MB page tables. 165 * "mem=nopentium" disables the 4MB page tables.
@@ -719,51 +196,6 @@ static int __init parse_mem(char *arg)
719} 196}
720early_param("mem", parse_mem); 197early_param("mem", parse_mem);
721 198
722static int __init parse_memmap(char *arg)
723{
724 if (!arg)
725 return -EINVAL;
726
727 if (strcmp(arg, "exactmap") == 0) {
728#ifdef CONFIG_CRASH_DUMP
729 /* If we are doing a crash dump, we
730 * still need to know the real mem
731 * size before the original memory map is
732 * reset.
733 */
734 find_max_pfn();
735 saved_max_pfn = max_pfn;
736#endif
737 e820.nr_map = 0;
738 user_defined_memmap = 1;
739 } else {
740 /* If the user specifies a memory size, we
741 * limit the BIOS-provided memory map to
742 * that size. exactmap can be used to specify
743 * the exact map. mem=number can be used to
744 * trim the existing memory map.
745 */
746 unsigned long long start_at, mem_size;
747
748 mem_size = memparse(arg, &arg);
749 if (*arg == '@') {
750 start_at = memparse(arg+1, &arg);
751 add_memory_region(start_at, mem_size, E820_RAM);
752 } else if (*arg == '#') {
753 start_at = memparse(arg+1, &arg);
754 add_memory_region(start_at, mem_size, E820_ACPI);
755 } else if (*arg == '$') {
756 start_at = memparse(arg+1, &arg);
757 add_memory_region(start_at, mem_size, E820_RESERVED);
758 } else {
759 limit_regions(mem_size);
760 user_defined_memmap = 1;
761 }
762 }
763 return 0;
764}
765early_param("memmap", parse_memmap);
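
In summary, the memmap= forms accepted above (sizes take the usual memparse K/M/G suffixes; the addresses and ranges here are only examples):

	memmap=exactmap             discard the BIOS map and build one by hand
	memmap=512M@0x10000000      add a range as usable RAM    (E820_RAM)
	memmap=64K#0xe0000000       add a range as ACPI data     (E820_ACPI)
	memmap=16M$0x38000000       mark a range reserved        (E820_RESERVED)
	memmap=512M                 no @/#/$: trim the map to this size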
766
767#ifdef CONFIG_PROC_VMCORE 199#ifdef CONFIG_PROC_VMCORE
768/* elfcorehdr= specifies the location of the elf core header 200
769 * stored by the crashed kernel. 201 * stored by the crashed kernel.
@@ -828,90 +260,6 @@ static int __init parse_reservetop(char *arg)
828early_param("reservetop", parse_reservetop); 260early_param("reservetop", parse_reservetop);
829 261
830/* 262/*
831 * Callback for efi_memory_walk.
832 */
833static int __init
834efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
835{
836 unsigned long *max_pfn = arg, pfn;
837
838 if (start < end) {
839		pfn = PFN_UP(end - 1);
840 if (pfn > *max_pfn)
841 *max_pfn = pfn;
842 }
843 return 0;
844}
845
846static int __init
847efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg)
848{
849 memory_present(0, PFN_UP(start), PFN_DOWN(end));
850 return 0;
851}
852
853 /*
854 * This function checks if the entire range <start,end> is mapped with type.
855 *
856 * Note: this function only works correctly if the e820 table is sorted and
857 * non-overlapping, which is the case
858 */
859int __init
860e820_all_mapped(unsigned long s, unsigned long e, unsigned type)
861{
862 u64 start = s;
863 u64 end = e;
864 int i;
865 for (i = 0; i < e820.nr_map; i++) {
866 struct e820entry *ei = &e820.map[i];
867 if (type && ei->type != type)
868 continue;
869		/* does the region overlap the current range at all? */
870 if (ei->addr >= end || ei->addr + ei->size <= start)
871 continue;
872		/* if the region covers the beginning of <start,end>, move
873		 * start past it, since that part is already covered
874 */
875 if (ei->addr <= start)
876 start = ei->addr + ei->size;
877 /* if start is now at or beyond end, we're done, full
878 * coverage */
879 if (start >= end)
880 return 1; /* we're done */
881 }
882 return 0;
883}
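
Note that type == 0 makes the check type-agnostic. A typical, hypothetical use is verifying that a fixed physical window really is RAM before relying on it:

	if (!e820_all_mapped(0x100000, 0x200000, E820_RAM))
		printk(KERN_WARNING "1MB-2MB is not fully usable RAM\n");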
884
885/*
886 * Find the highest page frame number we have available
887 */
888void __init find_max_pfn(void)
889{
890 int i;
891
892 max_pfn = 0;
893 if (efi_enabled) {
894 efi_memmap_walk(efi_find_max_pfn, &max_pfn);
895 efi_memmap_walk(efi_memory_present_wrapper, NULL);
896 return;
897 }
898
899 for (i = 0; i < e820.nr_map; i++) {
900 unsigned long start, end;
901 /* RAM? */
902 if (e820.map[i].type != E820_RAM)
903 continue;
904 start = PFN_UP(e820.map[i].addr);
905 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
906 if (start >= end)
907 continue;
908 if (end > max_pfn)
909 max_pfn = end;
910 memory_present(0, start, end);
911 }
912}
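
The rounding direction is deliberate: PFN_UP on the start and PFN_DOWN on the end mean only fully RAM-backed pages are counted. With 4KB pages (PAGE_SHIFT == 12):

	/* PFN_UP(0x12345)   == 0x13: a partial first page is skipped */
	/* PFN_DOWN(0x12345) == 0x12: a partial last page is skipped  */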
913
914/*
915 * Determine low and high memory ranges: 263 * Determine low and high memory ranges:
916 */ 264 */
917unsigned long __init find_max_low_pfn(void) 265unsigned long __init find_max_low_pfn(void)
@@ -971,68 +319,6 @@ unsigned long __init find_max_low_pfn(void)
971} 319}
972 320
973/* 321/*
974 * Free all available memory for boot time allocation. Used
975 * as a callback function by efi_memory_walk()
976 */
977
978static int __init
979free_available_memory(unsigned long start, unsigned long end, void *arg)
980{
981 /* check max_low_pfn */
982 if (start >= (max_low_pfn << PAGE_SHIFT))
983 return 0;
984 if (end >= (max_low_pfn << PAGE_SHIFT))
985 end = max_low_pfn << PAGE_SHIFT;
986 if (start < end)
987 free_bootmem(start, end - start);
988
989 return 0;
990}
991/*
992 * Register fully available low RAM pages with the bootmem allocator.
993 */
994static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
995{
996 int i;
997
998 if (efi_enabled) {
999 efi_memmap_walk(free_available_memory, NULL);
1000 return;
1001 }
1002 for (i = 0; i < e820.nr_map; i++) {
1003 unsigned long curr_pfn, last_pfn, size;
1004 /*
1005 * Reserve usable low memory
1006 */
1007 if (e820.map[i].type != E820_RAM)
1008 continue;
1009 /*
1010 * We are rounding up the start address of usable memory:
1011 */
1012 curr_pfn = PFN_UP(e820.map[i].addr);
1013 if (curr_pfn >= max_low_pfn)
1014 continue;
1015 /*
1016 * ... and at the end of the usable range downwards:
1017 */
1018 last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size);
1019
1020 if (last_pfn > max_low_pfn)
1021 last_pfn = max_low_pfn;
1022
1023 /*
1024 * .. finally, did all the rounding and playing
1025 * around just make the area go away?
1026 */
1027 if (last_pfn <= curr_pfn)
1028 continue;
1029
1030 size = last_pfn - curr_pfn;
1031 free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size));
1032 }
1033}
1034
1035/*
1036 * workaround for Dell systems that neglect to reserve EBDA 322 * workaround for Dell systems that neglect to reserve EBDA
1037 */ 323 */
1038static void __init reserve_ebda_region(void) 324static void __init reserve_ebda_region(void)
@@ -1118,8 +404,8 @@ void __init setup_bootmem_allocator(void)
1118 * the (very unlikely) case of us accidentally initializing the 404 * the (very unlikely) case of us accidentally initializing the
1119 * bootmem allocator with an invalid RAM area. 405 * bootmem allocator with an invalid RAM area.
1120 */ 406 */
1121 reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) + 407 reserve_bootmem(__pa_symbol(_text), (PFN_PHYS(min_low_pfn) +
1122 bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START)); 408 bootmap_size + PAGE_SIZE-1) - __pa_symbol(_text));
1123 409
1124 /* 410 /*
1125 * reserve physical page 0 - it's a special BIOS page on many boxes, 411 * reserve physical page 0 - it's a special BIOS page on many boxes,
@@ -1162,8 +448,7 @@ void __init setup_bootmem_allocator(void)
1162 if (LOADER_TYPE && INITRD_START) { 448 if (LOADER_TYPE && INITRD_START) {
1163 if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { 449 if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
1164 reserve_bootmem(INITRD_START, INITRD_SIZE); 450 reserve_bootmem(INITRD_START, INITRD_SIZE);
1165 initrd_start = 451 initrd_start = INITRD_START + PAGE_OFFSET;
1166 INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
1167 initrd_end = initrd_start+INITRD_SIZE; 452 initrd_end = initrd_start+INITRD_SIZE;
1168 } 453 }
1169 else { 454 else {
@@ -1200,126 +485,6 @@ void __init remapped_pgdat_init(void)
1200 } 485 }
1201} 486}
1202 487
1203/*
1204 * Request address space for all standard RAM and ROM resources
1205 * and also for regions reported as reserved by the e820.
1206 */
1207static void __init
1208legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource)
1209{
1210 int i;
1211
1212 probe_roms();
1213 for (i = 0; i < e820.nr_map; i++) {
1214 struct resource *res;
1215#ifndef CONFIG_RESOURCES_64BIT
1216 if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
1217 continue;
1218#endif
1219 res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
1220 switch (e820.map[i].type) {
1221 case E820_RAM: res->name = "System RAM"; break;
1222 case E820_ACPI: res->name = "ACPI Tables"; break;
1223 case E820_NVS: res->name = "ACPI Non-volatile Storage"; break;
1224 default: res->name = "reserved";
1225 }
1226 res->start = e820.map[i].addr;
1227 res->end = res->start + e820.map[i].size - 1;
1228 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1229 if (request_resource(&iomem_resource, res)) {
1230 kfree(res);
1231 continue;
1232 }
1233 if (e820.map[i].type == E820_RAM) {
1234 /*
1235 * We don't know which RAM region contains kernel data,
1236 * so we try it repeatedly and let the resource manager
1237 * test it.
1238 */
1239 request_resource(res, code_resource);
1240 request_resource(res, data_resource);
1241#ifdef CONFIG_KEXEC
1242 request_resource(res, &crashk_res);
1243#endif
1244 }
1245 }
1246}
1247
1248/*
1249 * Request address space for all standard resources
1250 *
1251 * This is called just before pcibios_init(), which is also a
1252 * subsys_initcall, but is linked in later (in arch/i386/pci/common.c).
1253 */
1254static int __init request_standard_resources(void)
1255{
1256 int i;
1257
1258 printk("Setting up standard PCI resources\n");
1259 if (efi_enabled)
1260 efi_initialize_iomem_resources(&code_resource, &data_resource);
1261 else
1262 legacy_init_iomem_resources(&code_resource, &data_resource);
1263
1264 /* EFI systems may still have VGA */
1265 request_resource(&iomem_resource, &video_ram_resource);
1266
1267 /* request I/O space for devices used on all i[345]86 PCs */
1268 for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++)
1269 request_resource(&ioport_resource, &standard_io_resources[i]);
1270 return 0;
1271}
1272
1273subsys_initcall(request_standard_resources);
1274
1275static void __init register_memory(void)
1276{
1277 unsigned long gapstart, gapsize, round;
1278 unsigned long long last;
1279 int i;
1280
1281 /*
1282	 * Search for the biggest gap in the low 32 bits of the e820
1283 * memory space.
1284 */
1285 last = 0x100000000ull;
1286 gapstart = 0x10000000;
1287 gapsize = 0x400000;
1288 i = e820.nr_map;
1289 while (--i >= 0) {
1290 unsigned long long start = e820.map[i].addr;
1291 unsigned long long end = start + e820.map[i].size;
1292
1293 /*
1294 * Since "last" is at most 4GB, we know we'll
1295 * fit in 32 bits if this condition is true
1296 */
1297 if (last > end) {
1298 unsigned long gap = last - end;
1299
1300 if (gap > gapsize) {
1301 gapsize = gap;
1302 gapstart = end;
1303 }
1304 }
1305 if (start < last)
1306 last = start;
1307 }
1308
1309 /*
1310 * See how much we want to round up: start off with
1311 * rounding to the next 1MB area.
1312 */
1313 round = 0x100000;
1314 while ((gapsize >> 4) > round)
1315 round += round;
1316 /* Fun with two's complement */
1317 pci_mem_start = (gapstart + round) & -round;
1318
1319 printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n",
1320 pci_mem_start, gapstart, gapsize);
1321}
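
The "fun" is ordinary power-of-two alignment: -round is a mask with the low bits clear, so adding round and then masking lands on the next round-sized boundary above gapstart. With round == 0x400000 (4MB):

	/* -0x400000 == 0xffc00000, so:                       */
	/* (0xe0123456 + 0x400000) & 0xffc00000 == 0xe0400000 */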
1322
1323#ifdef CONFIG_MCA 488#ifdef CONFIG_MCA
1324static void set_mca_bus(int x) 489static void set_mca_bus(int x)
1325{ 490{
@@ -1329,6 +494,12 @@ static void set_mca_bus(int x)
1329static void set_mca_bus(int x) { } 494static void set_mca_bus(int x) { }
1330#endif 495#endif
1331 496
497/* Overridden in paravirt.c if CONFIG_PARAVIRT */
498char * __init __attribute__((weak)) memory_setup(void)
499{
500 return machine_specific_memory_setup();
501}
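
The weak attribute makes this a default definition only: if another object file (paravirt.c, per the comment) supplies a non-weak memory_setup(), the linker silently prefers it. The mechanism in isolation, with hypothetical names:

	/* defaults.c: overridable fallback */
	int __attribute__((weak)) probe(void) { return 0; }

	/* platform.c: a strong definition here wins at link time */
	int probe(void) { return 1; }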
502
1332/* 503/*
1333 * Determine if we were loaded by an EFI loader. If so, then we have also been 504 * Determine if we were loaded by an EFI loader. If so, then we have also been
1334 * passed the efi memmap, systab, etc., so we should use these data structures 505 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -1381,7 +552,7 @@ void __init setup_arch(char **cmdline_p)
1381 efi_init(); 552 efi_init();
1382 else { 553 else {
1383 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 554 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1384 print_memory_map(machine_specific_memory_setup()); 555 print_memory_map(memory_setup());
1385 } 556 }
1386 557
1387 copy_edd(); 558 copy_edd();
@@ -1468,7 +639,7 @@ void __init setup_arch(char **cmdline_p)
1468 get_smp_config(); 639 get_smp_config();
1469#endif 640#endif
1470 641
1471 register_memory(); 642 e820_register_memory();
1472 643
1473#ifdef CONFIG_VT 644#ifdef CONFIG_VT
1474#if defined(CONFIG_VGA_CONSOLE) 645#if defined(CONFIG_VGA_CONSOLE)
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 43002cfb40c4..65d7620eaa09 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -128,7 +128,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ 128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) 129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
130 130
131 GET_SEG(gs); 131 COPY_SEG(gs);
132 GET_SEG(fs); 132 GET_SEG(fs);
133 COPY_SEG(es); 133 COPY_SEG(es);
134 COPY_SEG(ds); 134 COPY_SEG(ds);
@@ -244,9 +244,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
244{ 244{
245 int tmp, err = 0; 245 int tmp, err = 0;
246 246
247 tmp = 0; 247 err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs);
248 savesegment(gs, tmp);
249 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
250 savesegment(fs, tmp); 248 savesegment(fs, tmp);
251 err |= __put_user(tmp, (unsigned int __user *)&sc->fs); 249 err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
252 250
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 31e5c6573aae..5285aff8367f 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -321,7 +321,6 @@ static inline void leave_mm (unsigned long cpu)
321 321
322fastcall void smp_invalidate_interrupt(struct pt_regs *regs) 322fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
323{ 323{
324 struct pt_regs *old_regs = set_irq_regs(regs);
325 unsigned long cpu; 324 unsigned long cpu;
326 325
327 cpu = get_cpu(); 326 cpu = get_cpu();
@@ -352,7 +351,6 @@ fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
352 smp_mb__after_clear_bit(); 351 smp_mb__after_clear_bit();
353out: 352out:
354 put_cpu_no_resched(); 353 put_cpu_no_resched();
355 set_irq_regs(old_regs);
356} 354}
357 355
358static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, 356static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
@@ -607,14 +605,11 @@ void smp_send_stop(void)
607 */ 605 */
608fastcall void smp_reschedule_interrupt(struct pt_regs *regs) 606fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
609{ 607{
610 struct pt_regs *old_regs = set_irq_regs(regs);
611 ack_APIC_irq(); 608 ack_APIC_irq();
612 set_irq_regs(old_regs);
613} 609}
614 610
615fastcall void smp_call_function_interrupt(struct pt_regs *regs) 611fastcall void smp_call_function_interrupt(struct pt_regs *regs)
616{ 612{
617 struct pt_regs *old_regs = set_irq_regs(regs);
618 void (*func) (void *info) = call_data->func; 613 void (*func) (void *info) = call_data->func;
619 void *info = call_data->info; 614 void *info = call_data->info;
620 int wait = call_data->wait; 615 int wait = call_data->wait;
@@ -637,7 +632,6 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
637 mb(); 632 mb();
638 atomic_inc(&call_data->finished); 633 atomic_inc(&call_data->finished);
639 } 634 }
640 set_irq_regs(old_regs);
641} 635}
642 636
643/* 637/*
@@ -699,6 +693,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
699 put_cpu(); 693 put_cpu();
700 return -EBUSY; 694 return -EBUSY;
701 } 695 }
696
697 /* Can deadlock when called with interrupts disabled */
698 WARN_ON(irqs_disabled());
699
702 spin_lock_bh(&call_lock); 700 spin_lock_bh(&call_lock);
703 __smp_call_function_single(cpu, func, info, nonatomic, wait); 701 __smp_call_function_single(cpu, func, info, nonatomic, wait);
704 spin_unlock_bh(&call_lock); 702 spin_unlock_bh(&call_lock);
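
(The WARN_ON added above guards the classic IPI deadlock: if two CPUs each call smp_call_function_single() at the other with interrupts disabled, neither can service the incoming IPI and both spin forever.)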
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 4bb8b77cd65b..dea7ef9d3e82 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -33,6 +33,11 @@
33 * Dave Jones : Report invalid combinations of Athlon CPUs. 33 * Dave Jones : Report invalid combinations of Athlon CPUs.
34* Rusty Russell : Hacked into shape for new "hotplug" boot process. */ 34* Rusty Russell : Hacked into shape for new "hotplug" boot process. */
35 35
36
37/* SMP boot always wants to use a real-time delay to allow sufficient time for
38 * the APs to come online */
39#define USE_REAL_TIME_DELAY
40
36#include <linux/module.h> 41#include <linux/module.h>
37#include <linux/init.h> 42#include <linux/init.h>
38#include <linux/kernel.h> 43#include <linux/kernel.h>
@@ -52,6 +57,8 @@
52#include <asm/desc.h> 57#include <asm/desc.h>
53#include <asm/arch_hooks.h> 58#include <asm/arch_hooks.h>
54#include <asm/nmi.h> 59#include <asm/nmi.h>
60#include <asm/pda.h>
61#include <asm/genapic.h>
55 62
56#include <mach_apic.h> 63#include <mach_apic.h>
57#include <mach_wakecpu.h> 64#include <mach_wakecpu.h>
@@ -62,9 +69,7 @@ static int __devinitdata smp_b_stepping;
62 69
63/* Number of siblings per CPU package */ 70/* Number of siblings per CPU package */
64int smp_num_siblings = 1; 71int smp_num_siblings = 1;
65#ifdef CONFIG_X86_HT
66EXPORT_SYMBOL(smp_num_siblings); 72EXPORT_SYMBOL(smp_num_siblings);
67#endif
68 73
69/* Last level cache ID of each logical CPU */ 74/* Last level cache ID of each logical CPU */
70int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; 75int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
@@ -154,7 +159,7 @@ void __init smp_alloc_memory(void)
154 * a given CPU 159 * a given CPU
155 */ 160 */
156 161
157static void __devinit smp_store_cpu_info(int id) 162static void __cpuinit smp_store_cpu_info(int id)
158{ 163{
159 struct cpuinfo_x86 *c = cpu_data + id; 164 struct cpuinfo_x86 *c = cpu_data + id;
160 165
@@ -222,7 +227,7 @@ static struct {
222 atomic_t count_start; 227 atomic_t count_start;
223 atomic_t count_stop; 228 atomic_t count_stop;
224 unsigned long long values[NR_CPUS]; 229 unsigned long long values[NR_CPUS];
225} tsc __initdata = { 230} tsc __cpuinitdata = {
226 .start_flag = ATOMIC_INIT(0), 231 .start_flag = ATOMIC_INIT(0),
227 .count_start = ATOMIC_INIT(0), 232 .count_start = ATOMIC_INIT(0),
228 .count_stop = ATOMIC_INIT(0), 233 .count_stop = ATOMIC_INIT(0),
@@ -327,7 +332,7 @@ static void __init synchronize_tsc_bp(void)
327 printk("passed.\n"); 332 printk("passed.\n");
328} 333}
329 334
330static void __init synchronize_tsc_ap(void) 335static void __cpuinit synchronize_tsc_ap(void)
331{ 336{
332 int i; 337 int i;
333 338
@@ -359,7 +364,7 @@ extern void calibrate_delay(void);
359 364
360static atomic_t init_deasserted; 365static atomic_t init_deasserted;
361 366
362static void __devinit smp_callin(void) 367static void __cpuinit smp_callin(void)
363{ 368{
364 int cpuid, phys_id; 369 int cpuid, phys_id;
365 unsigned long timeout; 370 unsigned long timeout;
@@ -533,14 +538,14 @@ set_cpu_sibling_map(int cpu)
533/* 538/*
534 * Activate a secondary processor. 539 * Activate a secondary processor.
535 */ 540 */
536static void __devinit start_secondary(void *unused) 541static void __cpuinit start_secondary(void *unused)
537{ 542{
538 /* 543 /*
539 * Dont put anything before smp_callin(), SMP 544 * Don't put *anything* before secondary_cpu_init(), SMP
540 * booting is so fragile that we want to limit the 545 * booting is so fragile that we want to limit the
541 * things done here to the most necessary things. 546 * things done here to the most necessary things.
542 */ 547 */
543 cpu_init(); 548 secondary_cpu_init();
544 preempt_disable(); 549 preempt_disable();
545 smp_callin(); 550 smp_callin();
546 while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) 551 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
@@ -599,13 +604,16 @@ void __devinit initialize_secondary(void)
599 "movl %0,%%esp\n\t" 604 "movl %0,%%esp\n\t"
600 "jmp *%1" 605 "jmp *%1"
601 : 606 :
602 :"r" (current->thread.esp),"r" (current->thread.eip)); 607 :"m" (current->thread.esp),"m" (current->thread.eip));
603} 608}
604 609
610/* Static state in head.S used to set up a CPU */
605extern struct { 611extern struct {
606 void * esp; 612 void * esp;
607 unsigned short ss; 613 unsigned short ss;
608} stack_start; 614} stack_start;
615extern struct i386_pda *start_pda;
616extern struct Xgt_desc_struct cpu_gdt_descr;
609 617
610#ifdef CONFIG_NUMA 618#ifdef CONFIG_NUMA
611 619
@@ -923,7 +931,7 @@ static inline struct task_struct * alloc_idle_task(int cpu)
923#define alloc_idle_task(cpu) fork_idle(cpu) 931#define alloc_idle_task(cpu) fork_idle(cpu)
924#endif 932#endif
925 933
926static int __devinit do_boot_cpu(int apicid, int cpu) 934static int __cpuinit do_boot_cpu(int apicid, int cpu)
927/* 935/*
928 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 936 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
929 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 937 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -936,9 +944,6 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
936 unsigned long start_eip; 944 unsigned long start_eip;
937 unsigned short nmi_high = 0, nmi_low = 0; 945 unsigned short nmi_high = 0, nmi_low = 0;
938 946
939 ++cpucount;
940 alternatives_smp_switch(1);
941
942 /* 947 /*
943 * We can't use kernel_thread since we must avoid 948
944 * rescheduling the child. 949
@@ -946,15 +951,30 @@ static int __devinit do_boot_cpu(int apicid, int cpu)
946 idle = alloc_idle_task(cpu); 951 idle = alloc_idle_task(cpu);
947 if (IS_ERR(idle)) 952 if (IS_ERR(idle))
948 panic("failed fork for CPU %d", cpu); 953 panic("failed fork for CPU %d", cpu);
954
955 /* Pre-allocate and initialize the CPU's GDT and PDA so it
956 doesn't have to do any memory allocation during the
957 delicate CPU-bringup phase. */
958 if (!init_gdt(cpu, idle)) {
959 printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
960 return -1; /* ? */
961 }
962
949 idle->thread.eip = (unsigned long) start_secondary; 963 idle->thread.eip = (unsigned long) start_secondary;
950 /* start_eip had better be page-aligned! */ 964 /* start_eip had better be page-aligned! */
951 start_eip = setup_trampoline(); 965 start_eip = setup_trampoline();
952 966
967 ++cpucount;
968 alternatives_smp_switch(1);
969
953 /* So we see what's up */ 970 /* So we see what's up */
954 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); 971 printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
955 /* Stack for startup_32 can be just as for start_secondary onwards */ 972 /* Stack for startup_32 can be just as for start_secondary onwards */
956 stack_start.esp = (void *) idle->thread.esp; 973 stack_start.esp = (void *) idle->thread.esp;
957 974
975 start_pda = cpu_pda(cpu);
976 cpu_gdt_descr = per_cpu(cpu_gdt_descr, cpu);
977
958 irq_ctx_init(cpu); 978 irq_ctx_init(cpu);
959 979
960 x86_cpu_to_apicid[cpu] = apicid; 980 x86_cpu_to_apicid[cpu] = apicid;
@@ -1049,13 +1069,15 @@ void cpu_exit_clear(void)
1049 1069
1050struct warm_boot_cpu_info { 1070struct warm_boot_cpu_info {
1051 struct completion *complete; 1071 struct completion *complete;
1072 struct work_struct task;
1052 int apicid; 1073 int apicid;
1053 int cpu; 1074 int cpu;
1054}; 1075};
1055 1076
1056static void __cpuinit do_warm_boot_cpu(void *p) 1077static void __cpuinit do_warm_boot_cpu(struct work_struct *work)
1057{ 1078{
1058 struct warm_boot_cpu_info *info = p; 1079 struct warm_boot_cpu_info *info =
1080 container_of(work, struct warm_boot_cpu_info, task);
1059 do_boot_cpu(info->apicid, info->cpu); 1081 do_boot_cpu(info->apicid, info->cpu);
1060 complete(info->complete); 1082 complete(info->complete);
1061} 1083}
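
With the workqueue API used here, INIT_WORK no longer carries a data pointer; the handler recovers its payload via container_of, which reduces to offset arithmetic (a simplified sketch, without the usual type-checking temporary):

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))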
@@ -1064,7 +1086,6 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
1064{ 1086{
1065 DECLARE_COMPLETION_ONSTACK(done); 1087 DECLARE_COMPLETION_ONSTACK(done);
1066 struct warm_boot_cpu_info info; 1088 struct warm_boot_cpu_info info;
1067 struct work_struct task;
1068 int apicid, ret; 1089 int apicid, ret;
1069 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); 1090 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1070 1091
@@ -1089,15 +1110,15 @@ static int __cpuinit __smp_prepare_cpu(int cpu)
1089 info.complete = &done; 1110 info.complete = &done;
1090 info.apicid = apicid; 1111 info.apicid = apicid;
1091 info.cpu = cpu; 1112 info.cpu = cpu;
1092 INIT_WORK(&task, do_warm_boot_cpu, &info); 1113 INIT_WORK(&info.task, do_warm_boot_cpu);
1093 1114
1094 tsc_sync_disabled = 1; 1115 tsc_sync_disabled = 1;
1095 1116
1096 /* init low mem mapping */ 1117 /* init low mem mapping */
1097 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, 1118 clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
1098 KERNEL_PGD_PTRS); 1119 min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
1099 flush_tlb_all(); 1120 flush_tlb_all();
1100 schedule_work(&task); 1121 schedule_work(&info.task);
1101 wait_for_completion(&done); 1122 wait_for_completion(&done);
1102 1123
1103 tsc_sync_disabled = 0; 1124 tsc_sync_disabled = 0;
@@ -1108,34 +1129,15 @@ exit:
1108} 1129}
1109#endif 1130#endif
1110 1131
1111static void smp_tune_scheduling (void) 1132static void smp_tune_scheduling(void)
1112{ 1133{
1113 unsigned long cachesize; /* kB */ 1134 unsigned long cachesize; /* kB */
1114 unsigned long bandwidth = 350; /* MB/s */
1115 /*
1116 * Rough estimation for SMP scheduling, this is the number of
1117 * cycles it takes for a fully memory-limited process to flush
1118 * the SMP-local cache.
1119 *
1120 * (For a P5 this pretty much means we will choose another idle
1121 * CPU almost always at wakeup time (this is due to the small
1122 * L1 cache), on PIIs it's around 50-100 usecs, depending on
1123 * the cache size)
1124 */
1125 1135
1126 if (!cpu_khz) { 1136 if (cpu_khz) {
1127 /*
1128 * this basically disables processor-affinity
1129 * scheduling on SMP without a TSC.
1130 */
1131 return;
1132 } else {
1133 cachesize = boot_cpu_data.x86_cache_size; 1137 cachesize = boot_cpu_data.x86_cache_size;
1134 if (cachesize == -1) { 1138
1135 cachesize = 16; /* Pentiums, 2x8kB cache */ 1139 if (cachesize > 0)
1136 bandwidth = 100; 1140 max_cache_size = cachesize * 1024;
1137 }
1138 max_cache_size = cachesize * 1024;
1139 } 1141 }
1140} 1142}
1141 1143
@@ -1430,7 +1432,7 @@ void __cpu_die(unsigned int cpu)
1430} 1432}
1431#endif /* CONFIG_HOTPLUG_CPU */ 1433#endif /* CONFIG_HOTPLUG_CPU */
1432 1434
1433int __devinit __cpu_up(unsigned int cpu) 1435int __cpuinit __cpu_up(unsigned int cpu)
1434{ 1436{
1435#ifdef CONFIG_HOTPLUG_CPU 1437#ifdef CONFIG_HOTPLUG_CPU
1436 int ret=0; 1438 int ret=0;
@@ -1461,6 +1463,12 @@ int __devinit __cpu_up(unsigned int cpu)
1461 cpu_set(cpu, smp_commenced_mask); 1463 cpu_set(cpu, smp_commenced_mask);
1462 while (!cpu_isset(cpu, cpu_online_map)) 1464 while (!cpu_isset(cpu, cpu_online_map))
1463 cpu_relax(); 1465 cpu_relax();
1466
1467#ifdef CONFIG_X86_GENERICARCH
1468 if (num_online_cpus() > 8 && genapic == &apic_default)
1469 panic("Default flat APIC routing can't be used with > 8 cpus\n");
1470#endif
1471
1464 return 0; 1472 return 0;
1465} 1473}
1466 1474
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index 713ba39d32c6..7de9117b5a3a 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -27,7 +27,11 @@
27 * Should the kernel map a VDSO page into processes and pass its 27 * Should the kernel map a VDSO page into processes and pass its
28 * address down to glibc upon exec()? 28 * address down to glibc upon exec()?
29 */ 29 */
30#ifdef CONFIG_PARAVIRT
31unsigned int __read_mostly vdso_enabled = 0;
32#else
30unsigned int __read_mostly vdso_enabled = 1; 33unsigned int __read_mostly vdso_enabled = 1;
34#endif
31 35
32EXPORT_SYMBOL_GPL(vdso_enabled); 36EXPORT_SYMBOL_GPL(vdso_enabled);
33 37
@@ -132,7 +136,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
132 goto up_fail; 136 goto up_fail;
133 } 137 }
134 138
135 vma = kmem_cache_zalloc(vm_area_cachep, SLAB_KERNEL); 139 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
136 if (!vma) { 140 if (!vma) {
137 ret = -ENOMEM; 141 ret = -ENOMEM;
138 goto up_fail; 142 goto up_fail;
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 78af572fd17c..c505b16c0990 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -56,6 +56,7 @@
56#include <asm/uaccess.h> 56#include <asm/uaccess.h>
57#include <asm/processor.h> 57#include <asm/processor.h>
58#include <asm/timer.h> 58#include <asm/timer.h>
59#include <asm/time.h>
59 60
60#include "mach_time.h" 61#include "mach_time.h"
61 62
@@ -116,10 +117,7 @@ static int set_rtc_mmss(unsigned long nowtime)
116 /* gets recalled with irq locally disabled */ 117 /* gets recalled with irq locally disabled */
117 /* XXX - does irqsave resolve this? -johnstul */ 118 /* XXX - does irqsave resolve this? -johnstul */
118 spin_lock_irqsave(&rtc_lock, flags); 119 spin_lock_irqsave(&rtc_lock, flags);
119 if (efi_enabled) 120 retval = set_wallclock(nowtime);
120 retval = efi_set_rtc_mmss(nowtime);
121 else
122 retval = mach_set_rtc_mmss(nowtime);
123 spin_unlock_irqrestore(&rtc_lock, flags); 121 spin_unlock_irqrestore(&rtc_lock, flags);
124 122
125 return retval; 123 return retval;
@@ -223,10 +221,7 @@ unsigned long get_cmos_time(void)
223 221
224 spin_lock_irqsave(&rtc_lock, flags); 222 spin_lock_irqsave(&rtc_lock, flags);
225 223
226 if (efi_enabled) 224 retval = get_wallclock();
227 retval = efi_get_time();
228 else
229 retval = mach_get_cmos_time();
230 225
231 spin_unlock_irqrestore(&rtc_lock, flags); 226 spin_unlock_irqrestore(&rtc_lock, flags);
232 227
@@ -370,7 +365,7 @@ static void __init hpet_time_init(void)
370 printk("Using HPET for base-timer\n"); 365 printk("Using HPET for base-timer\n");
371 } 366 }
372 367
373 time_init_hook(); 368 do_time_init();
374} 369}
375#endif 370#endif
376 371
@@ -392,5 +387,5 @@ void __init time_init(void)
392 387
393 do_settimeofday(&ts); 388 do_settimeofday(&ts);
394 389
395 time_init_hook(); 390 do_time_init();
396} 391}
diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c
index 1a2a979cf6a3..1e4702dfcd01 100644
--- a/arch/i386/kernel/time_hpet.c
+++ b/arch/i386/kernel/time_hpet.c
@@ -132,14 +132,20 @@ int __init hpet_enable(void)
132 * the single HPET timer for system time. 132 * the single HPET timer for system time.
133 */ 133 */
134#ifdef CONFIG_HPET_EMULATE_RTC 134#ifdef CONFIG_HPET_EMULATE_RTC
135 if (!(id & HPET_ID_NUMBER)) 135 if (!(id & HPET_ID_NUMBER)) {
136 iounmap(hpet_virt_address);
137 hpet_virt_address = NULL;
136 return -1; 138 return -1;
139 }
137#endif 140#endif
138 141
139 142
140 hpet_period = hpet_readl(HPET_PERIOD); 143 hpet_period = hpet_readl(HPET_PERIOD);
141 if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) 144 if ((hpet_period < HPET_MIN_PERIOD) || (hpet_period > HPET_MAX_PERIOD)) {
145 iounmap(hpet_virt_address);
146 hpet_virt_address = NULL;
142 return -1; 147 return -1;
148 }
143 149
144 /* 150 /*
145 * 64 bit math 151 * 64 bit math
@@ -156,8 +162,11 @@ int __init hpet_enable(void)
156 162
157 hpet_use_timer = id & HPET_ID_LEGSUP; 163 hpet_use_timer = id & HPET_ID_LEGSUP;
158 164
159 if (hpet_timer_stop_set_go(hpet_tick)) 165 if (hpet_timer_stop_set_go(hpet_tick)) {
166 iounmap(hpet_virt_address);
167 hpet_virt_address = NULL;
160 return -1; 168 return -1;
169 }
161 170
162 use_hpet = 1; 171 use_hpet = 1;
163 172
diff --git a/arch/i386/kernel/topology.c b/arch/i386/kernel/topology.c
index 07d6da36a825..79cf608e14ca 100644
--- a/arch/i386/kernel/topology.c
+++ b/arch/i386/kernel/topology.c
@@ -40,14 +40,18 @@ int arch_register_cpu(int num)
40 * restrictions and assumptions in the kernel. This basically 40 * restrictions and assumptions in the kernel. This basically
41 * doesn't add a control file; one cannot attempt to offline 41 * doesn't add a control file; one cannot attempt to offline
42 * the BSP. 42 * the BSP.
43 *
44 * Also, certain PCI quirks require that hotplug control not be
45 * enabled for all CPUs.
43 */ 46 */
44 if (!num) 47 if (num && enable_cpu_hotplug)
45 cpu_devices[num].cpu.no_control = 1; 48 cpu_devices[num].cpu.hotpluggable = 1;
46 49
47 return register_cpu(&cpu_devices[num].cpu, num); 50 return register_cpu(&cpu_devices[num].cpu, num);
48} 51}
49 52
50#ifdef CONFIG_HOTPLUG_CPU 53#ifdef CONFIG_HOTPLUG_CPU
54int enable_cpu_hotplug = 1;
51 55
52void arch_unregister_cpu(int num) { 56void arch_unregister_cpu(int num) {
53 return unregister_cpu(&cpu_devices[num].cpu); 57 return unregister_cpu(&cpu_devices[num].cpu);
diff --git a/arch/i386/kernel/trampoline.S b/arch/i386/kernel/trampoline.S
index fcce0e61b0e7..2f1814c5cfd7 100644
--- a/arch/i386/kernel/trampoline.S
+++ b/arch/i386/kernel/trampoline.S
@@ -38,6 +38,11 @@
38 38
39.data 39.data
40 40
41/* We can free up the trampoline after bootup if CPU hotplug is not supported. */
42#ifndef CONFIG_HOTPLUG_CPU
43.section ".init.data","aw",@progbits
44#endif
45
41.code16 46.code16
42 47
43ENTRY(trampoline_data) 48ENTRY(trampoline_data)
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index fe9c5e8e7e6f..0efad8aeb41a 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -29,6 +29,8 @@
29#include <linux/kexec.h> 29#include <linux/kexec.h>
30#include <linux/unwind.h> 30#include <linux/unwind.h>
31#include <linux/uaccess.h> 31#include <linux/uaccess.h>
32#include <linux/nmi.h>
33#include <linux/bug.h>
32 34
33#ifdef CONFIG_EISA 35#ifdef CONFIG_EISA
34#include <linux/ioport.h> 36#include <linux/ioport.h>
@@ -61,9 +63,6 @@ int panic_on_unrecovered_nmi;
61 63
62asmlinkage int system_call(void); 64asmlinkage int system_call(void);
63 65
64struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
65 { 0, 0 }, { 0, 0 } };
66
67/* Do we ignore FPU interrupts ? */ 66/* Do we ignore FPU interrupts ? */
68char ignore_fpu_irq = 0; 67char ignore_fpu_irq = 0;
69 68
@@ -94,12 +93,7 @@ asmlinkage void alignment_check(void);
94asmlinkage void spurious_interrupt_bug(void); 93asmlinkage void spurious_interrupt_bug(void);
95asmlinkage void machine_check(void); 94asmlinkage void machine_check(void);
96 95
97static int kstack_depth_to_print = 24; 96int kstack_depth_to_print = 24;
98#ifdef CONFIG_STACK_UNWIND
99static int call_trace = 1;
100#else
101#define call_trace (-1)
102#endif
103ATOMIC_NOTIFIER_HEAD(i386die_chain); 97ATOMIC_NOTIFIER_HEAD(i386die_chain);
104 98
105int register_die_notifier(struct notifier_block *nb) 99int register_die_notifier(struct notifier_block *nb)
@@ -153,25 +147,7 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
153 return ebp; 147 return ebp;
154} 148}
155 149
156struct ops_and_data { 150#define MSG(msg) ops->warning(data, msg)
157 struct stacktrace_ops *ops;
158 void *data;
159};
160
161static asmlinkage int
162dump_trace_unwind(struct unwind_frame_info *info, void *data)
163{
164 struct ops_and_data *oad = (struct ops_and_data *)data;
165 int n = 0;
166
167 while (unwind(info) == 0 && UNW_PC(info)) {
168 n++;
169 oad->ops->address(oad->data, UNW_PC(info));
170 if (arch_unw_user_mode(info))
171 break;
172 }
173 return n;
174}
175 151
176void dump_trace(struct task_struct *task, struct pt_regs *regs, 152void dump_trace(struct task_struct *task, struct pt_regs *regs,
177 unsigned long *stack, 153 unsigned long *stack,
@@ -182,39 +158,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
182 if (!task) 158 if (!task)
183 task = current; 159 task = current;
184 160
185 if (call_trace >= 0) {
186 int unw_ret = 0;
187 struct unwind_frame_info info;
188 struct ops_and_data oad = { .ops = ops, .data = data };
189
190 if (regs) {
191 if (unwind_init_frame_info(&info, task, regs) == 0)
192 unw_ret = dump_trace_unwind(&info, &oad);
193 } else if (task == current)
194 unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad);
195 else {
196 if (unwind_init_blocked(&info, task) == 0)
197 unw_ret = dump_trace_unwind(&info, &oad);
198 }
199 if (unw_ret > 0) {
200 if (call_trace == 1 && !arch_unw_user_mode(&info)) {
201 ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n",
202 UNW_PC(&info));
203 if (UNW_SP(&info) >= PAGE_OFFSET) {
204 ops->warning(data, "Leftover inexact backtrace:\n");
205 stack = (void *)UNW_SP(&info);
206 if (!stack)
207 return;
208 ebp = UNW_FP(&info);
209 } else
210 ops->warning(data, "Full inexact backtrace again:\n");
211 } else if (call_trace >= 1)
212 return;
213 else
214 ops->warning(data, "Full inexact backtrace again:\n");
215 } else
216 ops->warning(data, "Inexact backtrace:\n");
217 }
218 if (!stack) { 161 if (!stack) {
219 unsigned long dummy; 162 unsigned long dummy;
220 stack = &dummy; 163 stack = &dummy;
@@ -247,6 +190,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
247 stack = (unsigned long*)context->previous_esp; 190 stack = (unsigned long*)context->previous_esp;
248 if (!stack) 191 if (!stack)
249 break; 192 break;
193 touch_nmi_watchdog();
250 } 194 }
251} 195}
252EXPORT_SYMBOL(dump_trace); 196EXPORT_SYMBOL(dump_trace);
@@ -379,7 +323,7 @@ void show_registers(struct pt_regs *regs)
379 * time of the fault.. 323 * time of the fault..
380 */ 324 */
381 if (in_kernel) { 325 if (in_kernel) {
382 u8 __user *eip; 326 u8 *eip;
383 int code_bytes = 64; 327 int code_bytes = 64;
384 unsigned char c; 328 unsigned char c;
385 329
@@ -388,18 +332,20 @@ void show_registers(struct pt_regs *regs)
388 332
389 printk(KERN_EMERG "Code: "); 333 printk(KERN_EMERG "Code: ");
390 334
391 eip = (u8 __user *)regs->eip - 43; 335 eip = (u8 *)regs->eip - 43;
392 if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { 336 if (eip < (u8 *)PAGE_OFFSET ||
337 probe_kernel_address(eip, c)) {
393 /* try starting at EIP */ 338 /* try starting at EIP */
394 eip = (u8 __user *)regs->eip; 339 eip = (u8 *)regs->eip;
395 code_bytes = 32; 340 code_bytes = 32;
396 } 341 }
397 for (i = 0; i < code_bytes; i++, eip++) { 342 for (i = 0; i < code_bytes; i++, eip++) {
398 if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { 343 if (eip < (u8 *)PAGE_OFFSET ||
344 probe_kernel_address(eip, c)) {
399 printk(" Bad EIP value."); 345 printk(" Bad EIP value.");
400 break; 346 break;
401 } 347 }
402 if (eip == (u8 __user *)regs->eip) 348 if (eip == (u8 *)regs->eip)
403 printk("<%02x> ", c); 349 printk("<%02x> ", c);
404 else 350 else
405 printk("%02x ", c); 351 printk("%02x ", c);
@@ -408,43 +354,22 @@ void show_registers(struct pt_regs *regs)
408 printk("\n"); 354 printk("\n");
409} 355}
410 356
411static void handle_BUG(struct pt_regs *regs) 357int is_valid_bugaddr(unsigned long eip)
412{ 358{
413 unsigned long eip = regs->eip;
414 unsigned short ud2; 359 unsigned short ud2;
415 360
416 if (eip < PAGE_OFFSET) 361 if (eip < PAGE_OFFSET)
417 return; 362 return 0;
418 if (probe_kernel_address((unsigned short __user *)eip, ud2)) 363 if (probe_kernel_address((unsigned short *)eip, ud2))
419 return; 364 return 0;
420 if (ud2 != 0x0b0f)
421 return;
422
423 printk(KERN_EMERG "------------[ cut here ]------------\n");
424 365
425#ifdef CONFIG_DEBUG_BUGVERBOSE 366 return ud2 == 0x0b0f;
426 do {
427 unsigned short line;
428 char *file;
429 char c;
430
431 if (probe_kernel_address((unsigned short __user *)(eip + 2),
432 line))
433 break;
434 if (__get_user(file, (char * __user *)(eip + 4)) ||
435 (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
436 file = "<bad filename>";
437
438 printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
439 return;
440 } while (0);
441#endif
442 printk(KERN_EMERG "Kernel BUG at [verbose debug info unavailable]\n");
443} 367}
444 368
445/* This is gone through when something in the kernel 369/*
446 * has done something bad and is about to be terminated. 370 * This is gone through when something in the kernel has done something bad and
447*/ 371 * is about to be terminated.
372 */
448void die(const char * str, struct pt_regs * regs, long err) 373void die(const char * str, struct pt_regs * regs, long err)
449{ 374{
450 static struct { 375 static struct {
@@ -452,7 +377,7 @@ void die(const char * str, struct pt_regs * regs, long err)
452 u32 lock_owner; 377 u32 lock_owner;
453 int lock_owner_depth; 378 int lock_owner_depth;
454 } die = { 379 } die = {
455 .lock = SPIN_LOCK_UNLOCKED, 380 .lock = __SPIN_LOCK_UNLOCKED(die.lock),
456 .lock_owner = -1, 381 .lock_owner = -1,
457 .lock_owner_depth = 0 382 .lock_owner_depth = 0
458 }; 383 };
@@ -476,7 +401,8 @@ void die(const char * str, struct pt_regs * regs, long err)
476 unsigned long esp; 401 unsigned long esp;
477 unsigned short ss; 402 unsigned short ss;
478 403
479 handle_BUG(regs); 404 report_bug(regs->eip);
405
480 printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); 406 printk(KERN_EMERG "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
481#ifdef CONFIG_PREEMPT 407#ifdef CONFIG_PREEMPT
482 printk(KERN_EMERG "PREEMPT "); 408 printk(KERN_EMERG "PREEMPT ");
@@ -707,8 +633,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
707{ 633{
708 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on " 634 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
709 "CPU %d.\n", reason, smp_processor_id()); 635 "CPU %d.\n", reason, smp_processor_id());
710 printk(KERN_EMERG "You probably have a hardware problem with your RAM " 636 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
711 "chips\n");
712 if (panic_on_unrecovered_nmi) 637 if (panic_on_unrecovered_nmi)
713 panic("NMI: Not continuing"); 638 panic("NMI: Not continuing");
714 639
@@ -773,7 +698,6 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
773 printk(" on CPU%d, eip %08lx, registers:\n", 698 printk(" on CPU%d, eip %08lx, registers:\n",
774 smp_processor_id(), regs->eip); 699 smp_processor_id(), regs->eip);
775 show_registers(regs); 700 show_registers(regs);
776 printk(KERN_EMERG "console shuts up ...\n");
777 console_silent(); 701 console_silent();
778 spin_unlock(&nmi_print_lock); 702 spin_unlock(&nmi_print_lock);
779 bust_spinlocks(0); 703 bust_spinlocks(0);
@@ -1088,49 +1012,24 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
1088#endif 1012#endif
1089} 1013}
1090 1014
1091fastcall void setup_x86_bogus_stack(unsigned char * stk) 1015fastcall unsigned long patch_espfix_desc(unsigned long uesp,
1016 unsigned long kesp)
1092{ 1017{
1093 unsigned long *switch16_ptr, *switch32_ptr;
1094 struct pt_regs *regs;
1095 unsigned long stack_top, stack_bot;
1096 unsigned short iret_frame16_off;
1097 int cpu = smp_processor_id(); 1018 int cpu = smp_processor_id();
1098 /* reserve the space on 32bit stack for the magic switch16 pointer */ 1019 struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
1099 memmove(stk, stk + 8, sizeof(struct pt_regs)); 1020 struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
1100 switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs)); 1021 unsigned long base = (kesp - uesp) & -THREAD_SIZE;
1101 regs = (struct pt_regs *)stk; 1022 unsigned long new_kesp = kesp - base;
1102 /* now the switch32 on 16bit stack */ 1023 unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
1103 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu); 1024 __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
1104 stack_top = stack_bot + CPU_16BIT_STACK_SIZE; 1025 /* Set up base for espfix segment */
1105 switch32_ptr = (unsigned long *)(stack_top - 8); 1026 desc &= 0x00f0ff0000000000ULL;
1106 iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20; 1027 desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
1107 /* copy iret frame on 16bit stack */ 1028 ((((__u64)base) << 32) & 0xff00000000000000ULL) |
1108 memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20); 1029 ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
1109 /* fill in the switch pointers */ 1030 (lim_pages & 0xffff);
1110 switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off; 1031 *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
1111 switch16_ptr[1] = __ESPFIX_SS; 1032 return new_kesp;
1112 switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
1113 8 - CPU_16BIT_STACK_SIZE;
1114 switch32_ptr[1] = __KERNEL_DS;
1115}
1116
1117fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
1118{
1119 unsigned long *switch32_ptr;
1120 unsigned char *stack16, *stack32;
1121 unsigned long stack_top, stack_bot;
1122 int len;
1123 int cpu = smp_processor_id();
1124 stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
1125 stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
1126 switch32_ptr = (unsigned long *)(stack_top - 8);
1127 /* copy the data from 16bit stack to 32bit stack */
1128 len = CPU_16BIT_STACK_SIZE - 8 - sp;
1129 stack16 = (unsigned char *)(stack_bot + sp);
1130 stack32 = (unsigned char *)
1131 (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
1132 memcpy(stack32, stack16, len);
1133 return stack32;
1134} 1033}
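
For reference, the masks above follow the standard x86 segment-descriptor layout; within the 8-byte descriptor the rewritten fields sit at:

	/* bits  0..15 : limit 15:0   (lim_pages & 0xffff)                       */
	/* bits 16..39 : base  23:0   (base << 16, mask 0x000000ffffff0000ULL)   */
	/* bits 48..51 : limit 19:16  (lim  << 32, mask 0x000f000000000000ULL)   */
	/* bits 56..63 : base  31:24  (base << 32, mask 0xff00000000000000ULL)   */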
1135 1034
1136/* 1035/*
@@ -1143,7 +1042,7 @@ fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
1143 * Must be called with kernel preemption disabled (in this case, 1042 * Must be called with kernel preemption disabled (in this case,
1144 * local interrupts are disabled at the call-site in entry.S). 1043 * local interrupts are disabled at the call-site in entry.S).
1145 */ 1044 */
1146asmlinkage void math_state_restore(struct pt_regs regs) 1045asmlinkage void math_state_restore(void)
1147{ 1046{
1148 struct thread_info *thread = current_thread_info(); 1047 struct thread_info *thread = current_thread_info();
1149 struct task_struct *tsk = thread->task; 1048 struct task_struct *tsk = thread->task;
@@ -1153,6 +1052,7 @@ asmlinkage void math_state_restore(struct pt_regs regs)
1153 init_fpu(tsk); 1052 init_fpu(tsk);
1154 restore_fpu(tsk); 1053 restore_fpu(tsk);
1155 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 1054 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
1055 tsk->fpu_counter++;
1156} 1056}
1157 1057
1158#ifndef CONFIG_MATH_EMULATION 1058#ifndef CONFIG_MATH_EMULATION
@@ -1291,19 +1191,3 @@ static int __init kstack_setup(char *s)
1291 return 1; 1191 return 1;
1292} 1192}
1293__setup("kstack=", kstack_setup); 1193__setup("kstack=", kstack_setup);
1294
1295#ifdef CONFIG_STACK_UNWIND
1296static int __init call_trace_setup(char *s)
1297{
1298 if (strcmp(s, "old") == 0)
1299 call_trace = -1;
1300 else if (strcmp(s, "both") == 0)
1301 call_trace = 0;
1302 else if (strcmp(s, "newfallback") == 0)
1303 call_trace = 1;
1304 else if (strcmp(s, "new") == 2)
1305 call_trace = 2;
1306 return 1;
1307}
1308__setup("call_trace=", call_trace_setup);
1309#endif
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index fbc95828cd74..2cfc7b09b925 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -13,7 +13,6 @@
13 13
14#include <asm/delay.h> 14#include <asm/delay.h>
15#include <asm/tsc.h> 15#include <asm/tsc.h>
16#include <asm/delay.h>
17#include <asm/io.h> 16#include <asm/io.h>
18 17
19#include "mach_timer.h" 18#include "mach_timer.h"
@@ -25,7 +24,7 @@
25 */ 24 */
26unsigned int tsc_khz; 25unsigned int tsc_khz;
27 26
28int tsc_disable __cpuinitdata = 0; 27int tsc_disable;
29 28
30#ifdef CONFIG_X86_TSC 29#ifdef CONFIG_X86_TSC
31static int __init tsc_setup(char *str) 30static int __init tsc_setup(char *str)
@@ -217,7 +216,7 @@ static unsigned int cpufreq_delayed_issched = 0;
217static unsigned int cpufreq_init = 0; 216static unsigned int cpufreq_init = 0;
218static struct work_struct cpufreq_delayed_get_work; 217static struct work_struct cpufreq_delayed_get_work;
219 218
220static void handle_cpufreq_delayed_get(void *v) 219static void handle_cpufreq_delayed_get(struct work_struct *work)
221{ 220{
222 unsigned int cpu; 221 unsigned int cpu;
223 222
@@ -306,7 +305,7 @@ static int __init cpufreq_tsc(void)
306{ 305{
307 int ret; 306 int ret;
308 307
309 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); 308 INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get);
310 ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, 309 ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
311 CPUFREQ_TRANSITION_NOTIFIER); 310 CPUFREQ_TRANSITION_NOTIFIER);
312 if (!ret) 311 if (!ret)
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index cbcd61d6120b..be2f96e67f78 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -43,6 +43,7 @@
43#include <linux/highmem.h> 43#include <linux/highmem.h>
44#include <linux/ptrace.h> 44#include <linux/ptrace.h>
45#include <linux/audit.h> 45#include <linux/audit.h>
46#include <linux/stddef.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48#include <asm/io.h> 49#include <asm/io.h>
@@ -72,10 +73,10 @@
72/* 73/*
73 * 8- and 16-bit register defines.. 74 * 8- and 16-bit register defines..
74 */ 75 */
75#define AL(regs) (((unsigned char *)&((regs)->eax))[0]) 76#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0])
76#define AH(regs) (((unsigned char *)&((regs)->eax))[1]) 77#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1])
77#define IP(regs) (*(unsigned short *)&((regs)->eip)) 78#define IP(regs) (*(unsigned short *)&((regs)->pt.eip))
78#define SP(regs) (*(unsigned short *)&((regs)->esp)) 79#define SP(regs) (*(unsigned short *)&((regs)->pt.esp))
79 80
80/* 81/*
81 * virtual flags (16 and 32-bit versions) 82 * virtual flags (16 and 32-bit versions)
@@ -89,10 +90,37 @@
89#define SAFE_MASK (0xDD5) 90#define SAFE_MASK (0xDD5)
90#define RETURN_MASK (0xDFF) 91#define RETURN_MASK (0xDFF)
91 92
92#define VM86_REGS_PART2 orig_eax 93/* convert kernel_vm86_regs to vm86_regs */
93#define VM86_REGS_SIZE1 \ 94static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
94 ( (unsigned)( & (((struct kernel_vm86_regs *)0)->VM86_REGS_PART2) ) ) 95 const struct kernel_vm86_regs *regs)
95#define VM86_REGS_SIZE2 (sizeof(struct kernel_vm86_regs) - VM86_REGS_SIZE1) 96{
97 int ret = 0;
98
99 /* kernel_vm86_regs is missing xfs, so copy everything up to
100 (but not including) xgs, and then the rest after xgs. */
101 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs));
102 ret += copy_to_user(&user->__null_gs, &regs->pt.xgs,
103 sizeof(struct kernel_vm86_regs) -
104 offsetof(struct kernel_vm86_regs, pt.xgs));
105
106 return ret;
107}
108
109/* convert vm86_regs to kernel_vm86_regs */
110static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
111 const struct vm86_regs __user *user,
112 unsigned extra)
113{
114 int ret = 0;
115
116 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs));
117 ret += copy_from_user(&regs->pt.xgs, &user->__null_gs,
118 sizeof(struct kernel_vm86_regs) -
119 offsetof(struct kernel_vm86_regs, pt.xgs) +
120 extra);
121
122 return ret;
123}
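
Both helpers copy in two spans around the point where the two layouts diverge (kernel_vm86_regs lacks an xfs slot, so everything from xgs onward sits at a different offset in vm86_regs). Splitting a struct copy at a member boundary with offsetof is the general pattern; a hypothetical standalone version:

	struct src_layout { long a, b;         long gs; long tail; };
	struct dst_layout { long a, b; long f; long gs; long tail; };

	static void convert(struct dst_layout *d, const struct src_layout *s)
	{
		/* common prefix first, then everything from gs onward,
		   which lands at a different offset in the destination */
		memcpy(d, s, offsetof(struct src_layout, gs));
		memcpy(&d->gs, &s->gs,
		       sizeof(struct src_layout) - offsetof(struct src_layout, gs));
	}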
96 124
97struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); 125struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs));
98struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) 126struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
@@ -112,10 +140,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
 		printk("no vm86_info: BAD\n");
 		do_exit(SIGSEGV);
 	}
-	set_flags(regs->eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
-	tmp = copy_to_user(&current->thread.vm86_info->regs,regs, VM86_REGS_SIZE1);
-	tmp += copy_to_user(&current->thread.vm86_info->regs.VM86_REGS_PART2,
-		&regs->VM86_REGS_PART2, VM86_REGS_SIZE2);
+	set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask);
+	tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
 	tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
 	if (tmp) {
 		printk("vm86: could not access userspace vm86_info\n");
@@ -129,9 +155,11 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
 	current->thread.saved_esp0 = 0;
 	put_cpu();
 
-	loadsegment(fs, current->thread.saved_fs);
-	loadsegment(gs, current->thread.saved_gs);
 	ret = KVM86->regs32;
+
+	loadsegment(fs, current->thread.saved_fs);
+	ret->xgs = current->thread.saved_gs;
+
 	return ret;
 }
 
@@ -183,9 +211,9 @@ asmlinkage int sys_vm86old(struct pt_regs regs)
 	tsk = current;
 	if (tsk->thread.saved_esp0)
 		goto out;
-	tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1);
-	tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2,
-		(long)&info.vm86plus - (long)&info.regs.VM86_REGS_PART2);
+	tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
+				       offsetof(struct kernel_vm86_struct, vm86plus) -
+				       sizeof(info.regs));
 	ret = -EFAULT;
 	if (tmp)
 		goto out;
@@ -233,9 +261,9 @@ asmlinkage int sys_vm86(struct pt_regs regs)
 	if (tsk->thread.saved_esp0)
 		goto out;
 	v86 = (struct vm86plus_struct __user *)regs.ecx;
-	tmp = copy_from_user(&info, v86, VM86_REGS_SIZE1);
-	tmp += copy_from_user(&info.regs.VM86_REGS_PART2, &v86->regs.VM86_REGS_PART2,
-		(long)&info.regs32 - (long)&info.regs.VM86_REGS_PART2);
+	tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
+				       offsetof(struct kernel_vm86_struct, regs32) -
+				       sizeof(info.regs));
 	ret = -EFAULT;
 	if (tmp)
 		goto out;
@@ -252,15 +280,15 @@ out:
 static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
 {
 	struct tss_struct *tss;
-	long eax;
 /*
 * make sure the vm86() system call doesn't try to do anything silly
 */
-	info->regs.__null_ds = 0;
-	info->regs.__null_es = 0;
+	info->regs.pt.xds = 0;
+	info->regs.pt.xes = 0;
+	info->regs.pt.xgs = 0;
 
-/* we are clearing fs,gs later just before "jmp resume_userspace",
- * because starting with Linux 2.1.x they aren't no longer saved/restored
+/* we are clearing fs later just before "jmp resume_userspace",
+ * because it is not saved/restored.
 */
 
 /*
@@ -268,10 +296,10 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 * has set it up safely, so this makes sure interrupt etc flags are
 * inherited from protected mode.
 */
-	VEFLAGS = info->regs.eflags;
-	info->regs.eflags &= SAFE_MASK;
-	info->regs.eflags |= info->regs32->eflags & ~SAFE_MASK;
-	info->regs.eflags |= VM_MASK;
+	VEFLAGS = info->regs.pt.eflags;
+	info->regs.pt.eflags &= SAFE_MASK;
+	info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK;
+	info->regs.pt.eflags |= VM_MASK;
 
 	switch (info->cpu_type) {
 	case CPU_286:
@@ -294,7 +322,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	info->regs32->eax = 0;
 	tsk->thread.saved_esp0 = tsk->thread.esp0;
 	savesegment(fs, tsk->thread.saved_fs);
-	savesegment(gs, tsk->thread.saved_gs);
+	tsk->thread.saved_gs = info->regs32->xgs;
 
 	tss = &per_cpu(init_tss, get_cpu());
 	tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
@@ -306,19 +334,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	tsk->thread.screen_bitmap = info->screen_bitmap;
 	if (info->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
-	__asm__ __volatile__("xorl %eax,%eax; movl %eax,%fs; movl %eax,%gs\n\t");
-	__asm__ __volatile__("movl %%eax, %0\n" :"=r"(eax));
 
 	/*call audit_syscall_exit since we do not exit via the normal paths */
 	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(eax), eax);
+		audit_syscall_exit(AUDITSC_RESULT(0), 0);
 
 	__asm__ __volatile__(
 		"movl %0,%%esp\n\t"
 		"movl %1,%%ebp\n\t"
+		"mov %2, %%fs\n\t"
 		"jmp resume_userspace"
 		: /* no outputs */
 		:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
 	/* we never return here */
 }
 
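The rewritten exit path above drops the hard-coded xorl/movl segment clearing and instead loads %fs from a third input operand inside the same asm that switches stacks, so the compiler materializes the zero selector in a register of its choosing and nothing can clobber it in between. A rough standalone illustration of that operand style (i386 only; the helper name is hypothetical):

	/* Load a caller-chosen selector into %fs via an "r" input operand
	 * rather than a hard-coded xor/mov sequence; the compiler picks
	 * the scratch register. A selector of 0 is the null selector. */
	static inline void set_fs_selector(unsigned int sel)
	{
		__asm__ __volatile__("mov %0, %%fs" : : "r" (sel));
	}
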
@@ -348,12 +375,12 @@ static inline void clear_IF(struct kernel_vm86_regs * regs)
 
 static inline void clear_TF(struct kernel_vm86_regs * regs)
 {
-	regs->eflags &= ~TF_MASK;
+	regs->pt.eflags &= ~TF_MASK;
 }
 
 static inline void clear_AC(struct kernel_vm86_regs * regs)
 {
-	regs->eflags &= ~AC_MASK;
+	regs->pt.eflags &= ~AC_MASK;
 }
 
 /* It is correct to call set_IF(regs) from the set_vflags_*
@@ -370,7 +397,7 @@ static inline void clear_AC(struct kernel_vm86_regs * regs)
 static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs)
 {
 	set_flags(VEFLAGS, eflags, current->thread.v86mask);
-	set_flags(regs->eflags, eflags, SAFE_MASK);
+	set_flags(regs->pt.eflags, eflags, SAFE_MASK);
 	if (eflags & IF_MASK)
 		set_IF(regs);
 	else
@@ -380,7 +407,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs
 static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
 {
 	set_flags(VFLAGS, flags, current->thread.v86mask);
-	set_flags(regs->eflags, flags, SAFE_MASK);
+	set_flags(regs->pt.eflags, flags, SAFE_MASK);
 	if (flags & IF_MASK)
 		set_IF(regs);
 	else
@@ -389,7 +416,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg
 
 static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
 {
-	unsigned long flags = regs->eflags & RETURN_MASK;
+	unsigned long flags = regs->pt.eflags & RETURN_MASK;
 
 	if (VEFLAGS & VIF_MASK)
 		flags |= IF_MASK;
@@ -493,7 +520,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 	unsigned long __user *intr_ptr;
 	unsigned long segoffs;
 
-	if (regs->cs == BIOSSEG)
+	if (regs->pt.xcs == BIOSSEG)
 		goto cannot_handle;
 	if (is_revectored(i, &KVM86->int_revectored))
 		goto cannot_handle;
@@ -505,9 +532,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 	if ((segoffs >> 16) == BIOSSEG)
 		goto cannot_handle;
 	pushw(ssp, sp, get_vflags(regs), cannot_handle);
-	pushw(ssp, sp, regs->cs, cannot_handle);
+	pushw(ssp, sp, regs->pt.xcs, cannot_handle);
 	pushw(ssp, sp, IP(regs), cannot_handle);
-	regs->cs = segoffs >> 16;
+	regs->pt.xcs = segoffs >> 16;
 	SP(regs) -= 6;
 	IP(regs) = segoffs & 0xffff;
 	clear_TF(regs);
@@ -524,7 +551,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno
 	if (VMPI.is_vm86pus) {
 		if ( (trapno==3) || (trapno==1) )
 			return_to_32bit(regs, VM86_TRAP + (trapno << 8));
-		do_int(regs, trapno, (unsigned char __user *) (regs->ss << 4), SP(regs));
+		do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs));
 		return 0;
 	}
 	if (trapno !=1)
@@ -560,10 +587,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
 		handle_vm86_trap(regs, 0, 1); \
 		return; } while (0)
 
-	orig_flags = *(unsigned short *)&regs->eflags;
+	orig_flags = *(unsigned short *)&regs->pt.eflags;
 
-	csp = (unsigned char __user *) (regs->cs << 4);
-	ssp = (unsigned char __user *) (regs->ss << 4);
+	csp = (unsigned char __user *) (regs->pt.xcs << 4);
+	ssp = (unsigned char __user *) (regs->pt.xss << 4);
 	sp = SP(regs);
 	ip = IP(regs);
 
@@ -650,7 +677,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
 		SP(regs) += 6;
 	}
 	IP(regs) = newip;
-	regs->cs = newcs;
+	regs->pt.xcs = newcs;
 	CHECK_IF_IN_TRAP;
 	if (data32) {
 		set_vflags_long(newflags, regs);
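Taken together, the vm86.c hunks are mechanical fallout of one structural change: struct kernel_vm86_regs no longer open-codes the general registers but embeds a struct pt_regs, so every regs->eflags, regs->cs and regs->ss access becomes regs->pt.eflags, regs->pt.xcs and regs->pt.xss. Roughly, the resulting layout looks like this (an abridged sketch of the 2.6.20-era include/asm-i386/vm86.h; treat field spellings as approximate):

	struct kernel_vm86_regs {
		/* normal regs, with special meaning for the segment
		 * descriptors; this is the part shared with pt_regs */
		struct pt_regs pt;
		/* these are specific to v86 mode: */
		unsigned short es, __esh;
		unsigned short ds, __dsh;
		unsigned short fs, __fsh;
		unsigned short gs, __gsh;
	};

Sharing the pt_regs prefix is what lets the vm86 exit path hand the register block straight to the common "jmp resume_userspace" return code.
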
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index c6f84a0322ba..a53c8b1854b5 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -1,18 +1,32 @@
 /* ld script to make i386 Linux kernel
  * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
+ *
+ * Don't define absolute symbols until and unless you know that the symbol
+ * value should remain constant even if the kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If a symbol value should
+ * change when the kernel is relocated, make the symbol section-relative and
+ * put it inside the section definition.
  */
 
+/* Don't define absolute symbols until and unless you know that the symbol
+ * value should remain constant even if the kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If a symbol value should
+ * change when the kernel is relocated, make the symbol section-relative and
+ * put it inside the section definition.
+ */
 #define LOAD_OFFSET __PAGE_OFFSET
 
 #include <asm-generic/vmlinux.lds.h>
 #include <asm/thread_info.h>
 #include <asm/page.h>
 #include <asm/cache.h>
+#include <asm/boot.h>
 
 OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
 OUTPUT_ARCH(i386)
 ENTRY(phys_startup_32)
 jiffies = jiffies_64;
+_proxy_pda = 0;
 
 PHDRS {
 	text PT_LOAD FLAGS(5);	/* R_E */
@@ -21,34 +35,37 @@ PHDRS {
 }
 SECTIONS
 {
-  . = __KERNEL_START;
+  . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
   phys_startup_32 = startup_32 - LOAD_OFFSET;
   /* read-only */
-  _text = .;			/* Text and read-only data */
   .text : AT(ADDR(.text) - LOAD_OFFSET) {
+	_text = .;			/* Text and read-only data */
 	*(.text)
 	SCHED_TEXT
 	LOCK_TEXT
 	KPROBES_TEXT
 	*(.fixup)
 	*(.gnu.warning)
-	} :text = 0x9090
-
-  _etext = .;			/* End of text section */
+	_etext = .;			/* End of text section */
+	} :text = 0x9090
 
   . = ALIGN(16);		/* Exception table */
-  __start___ex_table = .;
-  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
-  __stop___ex_table = .;
+  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+	__start___ex_table = .;
+	*(__ex_table)
+	__stop___ex_table = .;
+  }
 
   RODATA
 
+  BUG_TABLE
+
   . = ALIGN(4);
-  __tracedata_start = .;
   .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) {
+	__tracedata_start = .;
 	*(.tracedata)
+	__tracedata_end = .;
   }
-  __tracedata_end = .;
 
   /* writeable */
   . = ALIGN(4096);
@@ -57,11 +74,19 @@ SECTIONS
 	CONSTRUCTORS
 	} :data
 
+  .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) {
+	__start_paravirtprobe = .;
+	*(.paravirtprobe)
+	__stop_paravirtprobe = .;
+  }
+
   . = ALIGN(4096);
-  __nosave_begin = .;
-  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
-  . = ALIGN(4096);
-  __nosave_end = .;
+  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+	__nosave_begin = .;
+	*(.data.nosave)
+	. = ALIGN(4096);
+	__nosave_end = .;
+  }
 
   . = ALIGN(4096);
   .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
@@ -75,17 +100,10 @@ SECTIONS
 
   /* rarely changed data like cpu maps */
   . = ALIGN(32);
-  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) }
-  _edata = .;			/* End of data section */
-
-#ifdef CONFIG_STACK_UNWIND
-  . = ALIGN(4);
-  .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) {
-	__start_unwind = .;
-	*(.eh_frame)
-	__end_unwind = .;
+  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+	*(.data.read_mostly)
+	_edata = .;		/* End of data section */
   }
-#endif
 
   . = ALIGN(THREAD_SIZE);	/* init_task */
   .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
@@ -94,88 +112,102 @@ SECTIONS
 
   /* might get freed after init */
   . = ALIGN(4096);
-  __smp_alt_begin = .;
-  __smp_alt_instructions = .;
   .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) {
+	__smp_alt_begin = .;
+	__smp_alt_instructions = .;
 	*(.smp_altinstructions)
+	__smp_alt_instructions_end = .;
   }
-  __smp_alt_instructions_end = .;
   . = ALIGN(4);
-  __smp_locks = .;
   .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+	__smp_locks = .;
 	*(.smp_locks)
+	__smp_locks_end = .;
   }
-  __smp_locks_end = .;
   .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) {
 	*(.smp_altinstr_replacement)
+	__smp_alt_end = .;
   }
+  /* will be freed after init
+   * Following ALIGN() is required to make sure no other data falls on the
+   * same page where __smp_alt_end is pointing as that page might be freed
+   * after boot. Always make sure that ALIGN() directive is present after
+   * the section which contains __smp_alt_end.
+   */
   . = ALIGN(4096);
-  __smp_alt_end = .;
 
   /* will be freed after init */
   . = ALIGN(4096);		/* Init code and data */
-  __init_begin = .;
   .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+	__init_begin = .;
 	_sinittext = .;
 	*(.init.text)
 	_einittext = .;
   }
   .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
   . = ALIGN(16);
-  __setup_start = .;
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
-  __setup_end = .;
-  __initcall_start = .;
+  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+	__setup_start = .;
+	*(.init.setup)
+	__setup_end = .;
+  }
   .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+	__initcall_start = .;
 	INITCALLS
+	__initcall_end = .;
   }
-  __initcall_end = .;
-  __con_initcall_start = .;
   .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+	__con_initcall_start = .;
 	*(.con_initcall.init)
+	__con_initcall_end = .;
   }
-  __con_initcall_end = .;
   SECURITY_INIT
   . = ALIGN(4);
-  __alt_instructions = .;
   .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+	__alt_instructions = .;
 	*(.altinstructions)
+	__alt_instructions_end = .;
   }
-  __alt_instructions_end = .;
   .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
 	*(.altinstr_replacement)
   }
+  . = ALIGN(4);
+  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+	__start_parainstructions = .;
+	*(.parainstructions)
+	__stop_parainstructions = .;
+  }
   /* .exit.text is discard at runtime, not link time, to deal with references
      from .altinstructions and .eh_frame */
   .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
   .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
   . = ALIGN(4096);
-  __initramfs_start = .;
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
-  __initramfs_end = .;
+  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+	__initramfs_start = .;
+	*(.init.ramfs)
+	__initramfs_end = .;
+  }
   . = ALIGN(L1_CACHE_BYTES);
-  __per_cpu_start = .;
-  .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
-  __per_cpu_end = .;
+  .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
+	__per_cpu_start = .;
+	*(.data.percpu)
+	__per_cpu_end = .;
+  }
   . = ALIGN(4096);
-  __init_end = .;
   /* freed after init ends here */
 
-  __bss_start = .;		/* BSS */
-  .bss.page_aligned : AT(ADDR(.bss.page_aligned) - LOAD_OFFSET) {
-	*(.bss.page_aligned)
-  }
   .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+	__init_end = .;
+	__bss_start = .;		/* BSS */
+	*(.bss.page_aligned)
 	*(.bss)
+	. = ALIGN(4);
+	__bss_stop = .;
+	_end = . ;
+	/* This is where the kernel creates the early boot page tables */
+	. = ALIGN(4096);
+	pg0 = . ;
   }
-  . = ALIGN(4);
-  __bss_stop = .;
-
-  _end = . ;
-
-  /* This is where the kernel creates the early boot page tables */
-  . = ALIGN(4096);
-  pg0 = .;
 
   /* Sections to be discarded */
   /DISCARD/ : {
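The comment added at the top of the script states the rule behind almost every hunk above: a symbol assignment at SECTIONS scope produces an absolute symbol that a boot-time relocator will not adjust, while the same assignment placed inside an output section produces a section-relative symbol that moves with the kernel image. Schematically (a minimal sketch, not from the tree; section and symbol names are hypothetical):

	SECTIONS
	{
	  __absolute_start = .;		/* absolute: NOT relocated with the image */
	  .example : {
		__relative_start = .;	/* section-relative: moves with .example */
		*(.example)
		__relative_end = .;
	  }
	}

That is why markers such as __start___ex_table, __setup_start and __bss_start are hoisted into their section bodies here: once the kernel can be loaded at a non-default physical address, every boundary symbol has to relocate along with the data it brackets.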