aboutsummaryrefslogtreecommitdiffstats
path: root/arch/i386/kernel
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-02-14 12:46:06 -0500
committer Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-02-14 12:46:06 -0500
commit 414f827c46973ba39320cfb43feb55a0eeb9b4e8 (patch)
tree 45e860974ef698e71370a0ebdddcff4f14fbdf9e /arch/i386/kernel
parent 86a71dbd3e81e8870d0f0e56b87875f57e58222b (diff)
parent 126b1922367fbe5513daa675a2abd13ed3917f4e (diff)
Merge branch 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6
* 'for-linus' of git://one.firstfloor.org/home/andi/git/linux-2.6: (94 commits)
  [PATCH] x86-64: Remove mk_pte_phys()
  [PATCH] i386: Fix broken CONFIG_COMPAT_VDSO on i386
  [PATCH] i386: fix 32-bit ioctls on x64_32
  [PATCH] x86: Unify pcspeaker platform device code between i386/x86-64
  [PATCH] i386: Remove extern declaration from mm/discontig.c, put in header.
  [PATCH] i386: Rename cpu_gdt_descr and remove extern declaration from smpboot.c
  [PATCH] i386: Move mce_disabled to asm/mce.h
  [PATCH] i386: paravirt unhandled fallthrough
  [PATCH] x86_64: Wire up compat epoll_pwait
  [PATCH] x86: Don't require the vDSO for handling a.out signals
  [PATCH] i386: Fix Cyrix MediaGX detection
  [PATCH] i386: Fix warning in cpu initialization
  [PATCH] i386: Fix warning in microcode.c
  [PATCH] x86: Enable NMI watchdog for AMD Family 0x10 CPUs
  [PATCH] x86: Add new CPUID bits for AMD Family 10 CPUs in /proc/cpuinfo
  [PATCH] i386: Remove fastcall in paravirt.[ch]
  [PATCH] x86-64: Fix wrong gcc check in bitops.h
  [PATCH] x86-64: survive having no irq mapping for a vector
  [PATCH] i386: geode configuration fixes
  [PATCH] i386: add option to show more code in oops reports
  ...
Diffstat (limited to 'arch/i386/kernel')
-rw-r--r-- arch/i386/kernel/Makefile | 3
-rw-r--r-- arch/i386/kernel/apic.c | 6
-rw-r--r-- arch/i386/kernel/apm.c | 26
-rw-r--r-- arch/i386/kernel/asm-offsets.c | 2
-rw-r--r-- arch/i386/kernel/cpu/common.c | 14
-rw-r--r-- arch/i386/kernel/cpu/cyrix.c | 52
-rw-r--r-- arch/i386/kernel/cpu/mcheck/mce.c | 1
-rw-r--r-- arch/i386/kernel/cpu/mcheck/mce.h | 2
-rw-r--r-- arch/i386/kernel/cpu/mcheck/p4.c | 2
-rw-r--r-- arch/i386/kernel/cpu/mtrr/if.c | 30
-rw-r--r-- arch/i386/kernel/cpu/mtrr/main.c | 6
-rw-r--r-- arch/i386/kernel/cpu/mtrr/mtrr.h | 2
-rw-r--r-- arch/i386/kernel/cpu/proc.c | 14
-rw-r--r-- arch/i386/kernel/cpu/transmeta.c | 5
-rw-r--r-- arch/i386/kernel/cpuid.c | 7
-rw-r--r-- arch/i386/kernel/e820.c | 18
-rw-r--r-- arch/i386/kernel/entry.S | 78
-rw-r--r-- arch/i386/kernel/head.S | 38
-rw-r--r-- arch/i386/kernel/io_apic.c | 4
-rw-r--r-- arch/i386/kernel/irq.c | 3
-rw-r--r-- arch/i386/kernel/kprobes.c | 6
-rw-r--r-- arch/i386/kernel/microcode.c | 2
-rw-r--r-- arch/i386/kernel/msr.c | 13
-rw-r--r-- arch/i386/kernel/nmi.c | 98
-rw-r--r-- arch/i386/kernel/paravirt.c | 116
-rw-r--r-- arch/i386/kernel/pcspeaker.c | 20
-rw-r--r-- arch/i386/kernel/process.c | 99
-rw-r--r-- arch/i386/kernel/ptrace.c | 16
-rw-r--r-- arch/i386/kernel/setup.c | 35
-rw-r--r-- arch/i386/kernel/signal.c | 16
-rw-r--r-- arch/i386/kernel/smp.c | 7
-rw-r--r-- arch/i386/kernel/smpboot.c | 16
-rw-r--r-- arch/i386/kernel/sysenter.c | 2
-rw-r--r-- arch/i386/kernel/time.c | 14
-rw-r--r-- arch/i386/kernel/traps.c | 27
-rw-r--r-- arch/i386/kernel/tsc.c | 26
-rw-r--r-- arch/i386/kernel/vm86.c | 33
-rw-r--r-- arch/i386/kernel/vmi.c | 949
-rw-r--r-- arch/i386/kernel/vmitime.c | 499
-rw-r--r-- arch/i386/kernel/vmlinux.lds.S | 7
40 files changed, 2029 insertions, 285 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 1e8988e558c5..cbe4e601885c 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -40,8 +40,9 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
40obj-$(CONFIG_HPET_TIMER) += hpet.o 40obj-$(CONFIG_HPET_TIMER) += hpet.o
41obj-$(CONFIG_K8_NB) += k8.o 41obj-$(CONFIG_K8_NB) += k8.o
42 42
43# Make sure this is linked after any other paravirt_ops structs: see head.S 43obj-$(CONFIG_VMI) += vmi.o vmitime.o
44obj-$(CONFIG_PARAVIRT) += paravirt.o 44obj-$(CONFIG_PARAVIRT) += paravirt.o
45obj-y += pcspeaker.o
45 46
46EXTRA_AFLAGS := -traditional 47EXTRA_AFLAGS := -traditional
47 48
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 776d9be26af9..f4159e0a7ae9 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -36,6 +36,7 @@
36#include <asm/hpet.h> 36#include <asm/hpet.h>
37#include <asm/i8253.h> 37#include <asm/i8253.h>
38#include <asm/nmi.h> 38#include <asm/nmi.h>
39#include <asm/idle.h>
39 40
40#include <mach_apic.h> 41#include <mach_apic.h>
41#include <mach_apicdef.h> 42#include <mach_apicdef.h>
@@ -1255,6 +1256,7 @@ fastcall void smp_apic_timer_interrupt(struct pt_regs *regs)
1255 * Besides, if we don't timer interrupts ignore the global 1256 * Besides, if we don't timer interrupts ignore the global
1256 * interrupt lock, which is the WrongThing (tm) to do. 1257 * interrupt lock, which is the WrongThing (tm) to do.
1257 */ 1258 */
1259 exit_idle();
1258 irq_enter(); 1260 irq_enter();
1259 smp_local_timer_interrupt(); 1261 smp_local_timer_interrupt();
1260 irq_exit(); 1262 irq_exit();
@@ -1305,6 +1307,7 @@ fastcall void smp_spurious_interrupt(struct pt_regs *regs)
1305{ 1307{
1306 unsigned long v; 1308 unsigned long v;
1307 1309
1310 exit_idle();
1308 irq_enter(); 1311 irq_enter();
1309 /* 1312 /*
1310 * Check if this really is a spurious interrupt and ACK it 1313 * Check if this really is a spurious interrupt and ACK it
@@ -1329,6 +1332,7 @@ fastcall void smp_error_interrupt(struct pt_regs *regs)
1329{ 1332{
1330 unsigned long v, v1; 1333 unsigned long v, v1;
1331 1334
1335 exit_idle();
1332 irq_enter(); 1336 irq_enter();
1333 /* First tickle the hardware, only then report what went on. -- REW */ 1337 /* First tickle the hardware, only then report what went on. -- REW */
1334 v = apic_read(APIC_ESR); 1338 v = apic_read(APIC_ESR);
@@ -1395,7 +1399,7 @@ int __init APIC_init_uniprocessor (void)
1395 if (!skip_ioapic_setup && nr_ioapics) 1399 if (!skip_ioapic_setup && nr_ioapics)
1396 setup_IO_APIC(); 1400 setup_IO_APIC();
1397#endif 1401#endif
1398 setup_boot_APIC_clock(); 1402 setup_boot_clock();
1399 1403
1400 return 0; 1404 return 0;
1401} 1405}
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index db99a8948dae..f9ba0af7ee1f 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -211,6 +211,7 @@
211#include <linux/slab.h> 211#include <linux/slab.h>
212#include <linux/stat.h> 212#include <linux/stat.h>
213#include <linux/proc_fs.h> 213#include <linux/proc_fs.h>
214#include <linux/seq_file.h>
214#include <linux/miscdevice.h> 215#include <linux/miscdevice.h>
215#include <linux/apm_bios.h> 216#include <linux/apm_bios.h>
216#include <linux/init.h> 217#include <linux/init.h>
@@ -1636,9 +1637,8 @@ static int do_open(struct inode * inode, struct file * filp)
1636 return 0; 1637 return 0;
1637} 1638}
1638 1639
1639static int apm_get_info(char *buf, char **start, off_t fpos, int length) 1640static int proc_apm_show(struct seq_file *m, void *v)
1640{ 1641{
1641 char * p;
1642 unsigned short bx; 1642 unsigned short bx;
1643 unsigned short cx; 1643 unsigned short cx;
1644 unsigned short dx; 1644 unsigned short dx;
@@ -1650,8 +1650,6 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length)
1650 int time_units = -1; 1650 int time_units = -1;
1651 char *units = "?"; 1651 char *units = "?";
1652 1652
1653 p = buf;
1654
1655 if ((num_online_cpus() == 1) && 1653 if ((num_online_cpus() == 1) &&
1656 !(error = apm_get_power_status(&bx, &cx, &dx))) { 1654 !(error = apm_get_power_status(&bx, &cx, &dx))) {
1657 ac_line_status = (bx >> 8) & 0xff; 1655 ac_line_status = (bx >> 8) & 0xff;
@@ -1705,7 +1703,7 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length)
1705 -1: Unknown 1703 -1: Unknown
1706 8) min = minutes; sec = seconds */ 1704 8) min = minutes; sec = seconds */
1707 1705
1708 p += sprintf(p, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", 1706 seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n",
1709 driver_version, 1707 driver_version,
1710 (apm_info.bios.version >> 8) & 0xff, 1708 (apm_info.bios.version >> 8) & 0xff,
1711 apm_info.bios.version & 0xff, 1709 apm_info.bios.version & 0xff,
@@ -1716,10 +1714,22 @@ static int apm_get_info(char *buf, char **start, off_t fpos, int length)
1716 percentage, 1714 percentage,
1717 time_units, 1715 time_units,
1718 units); 1716 units);
1717 return 0;
1718}
1719 1719
1720 return p - buf; 1720static int proc_apm_open(struct inode *inode, struct file *file)
1721{
1722 return single_open(file, proc_apm_show, NULL);
1721} 1723}
1722 1724
1725static const struct file_operations apm_file_ops = {
1726 .owner = THIS_MODULE,
1727 .open = proc_apm_open,
1728 .read = seq_read,
1729 .llseek = seq_lseek,
1730 .release = single_release,
1731};
1732
1723static int apm(void *unused) 1733static int apm(void *unused)
1724{ 1734{
1725 unsigned short bx; 1735 unsigned short bx;
@@ -2341,9 +2351,9 @@ static int __init apm_init(void)
2341 set_base(gdt[APM_DS >> 3], 2351 set_base(gdt[APM_DS >> 3],
2342 __va((unsigned long)apm_info.bios.dseg << 4)); 2352 __va((unsigned long)apm_info.bios.dseg << 4));
2343 2353
2344 apm_proc = create_proc_info_entry("apm", 0, NULL, apm_get_info); 2354 apm_proc = create_proc_entry("apm", 0, NULL);
2345 if (apm_proc) 2355 if (apm_proc)
2346 apm_proc->owner = THIS_MODULE; 2356 apm_proc->proc_fops = &apm_file_ops;
2347 2357
2348 kapmd_task = kthread_create(apm, NULL, "kapmd"); 2358 kapmd_task = kthread_create(apm, NULL, "kapmd");
2349 if (IS_ERR(kapmd_task)) { 2359 if (IS_ERR(kapmd_task)) {
diff --git a/arch/i386/kernel/asm-offsets.c b/arch/i386/kernel/asm-offsets.c
index 1b2f3cd33270..c37535163bfc 100644
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -72,7 +72,7 @@ void foo(void)
72 OFFSET(PT_EAX, pt_regs, eax); 72 OFFSET(PT_EAX, pt_regs, eax);
73 OFFSET(PT_DS, pt_regs, xds); 73 OFFSET(PT_DS, pt_regs, xds);
74 OFFSET(PT_ES, pt_regs, xes); 74 OFFSET(PT_ES, pt_regs, xes);
75 OFFSET(PT_GS, pt_regs, xgs); 75 OFFSET(PT_FS, pt_regs, xfs);
76 OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); 76 OFFSET(PT_ORIG_EAX, pt_regs, orig_eax);
77 OFFSET(PT_EIP, pt_regs, eip); 77 OFFSET(PT_EIP, pt_regs, eip);
78 OFFSET(PT_CS, pt_regs, xcs); 78 OFFSET(PT_CS, pt_regs, xcs);
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 8a8bbdaaf38a..dcbbd0a8bfc2 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -605,7 +605,7 @@ void __init early_cpu_init(void)
605struct pt_regs * __devinit idle_regs(struct pt_regs *regs) 605struct pt_regs * __devinit idle_regs(struct pt_regs *regs)
606{ 606{
607 memset(regs, 0, sizeof(struct pt_regs)); 607 memset(regs, 0, sizeof(struct pt_regs));
608 regs->xgs = __KERNEL_PDA; 608 regs->xfs = __KERNEL_PDA;
609 return regs; 609 return regs;
610} 610}
611 611
@@ -662,12 +662,12 @@ struct i386_pda boot_pda = {
662 .pcurrent = &init_task, 662 .pcurrent = &init_task,
663}; 663};
664 664
665static inline void set_kernel_gs(void) 665static inline void set_kernel_fs(void)
666{ 666{
667 /* Set %gs for this CPU's PDA. Memory clobber is to create a 667 /* Set %fs for this CPU's PDA. Memory clobber is to create a
668 barrier with respect to any PDA operations, so the compiler 668 barrier with respect to any PDA operations, so the compiler
669 doesn't move any before here. */ 669 doesn't move any before here. */
670 asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory"); 670 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
671} 671}
672 672
673/* Initialize the CPU's GDT and PDA. The boot CPU does this for 673/* Initialize the CPU's GDT and PDA. The boot CPU does this for
@@ -718,7 +718,7 @@ void __cpuinit cpu_set_gdt(int cpu)
718 the boot CPU, this will transition from the boot gdt+pda to 718 the boot CPU, this will transition from the boot gdt+pda to
719 the real ones). */ 719 the real ones). */
720 load_gdt(cpu_gdt_descr); 720 load_gdt(cpu_gdt_descr);
721 set_kernel_gs(); 721 set_kernel_fs();
722} 722}
723 723
724/* Common CPU init for both boot and secondary CPUs */ 724/* Common CPU init for both boot and secondary CPUs */
@@ -764,8 +764,8 @@ static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
764 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); 764 __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
765#endif 765#endif
766 766
767 /* Clear %fs. */ 767 /* Clear %gs. */
768 asm volatile ("mov %0, %%fs" : : "r" (0)); 768 asm volatile ("mov %0, %%gs" : : "r" (0));
769 769
770 /* Clear all 6 debug registers: */ 770 /* Clear all 6 debug registers: */
771 set_debugreg(0, 0); 771 set_debugreg(0, 0);
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index c0c3b59de32c..de27bd07bc9c 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -6,6 +6,7 @@
6#include <asm/io.h> 6#include <asm/io.h>
7#include <asm/processor.h> 7#include <asm/processor.h>
8#include <asm/timer.h> 8#include <asm/timer.h>
9#include <asm/pci-direct.h>
9 10
10#include "cpu.h" 11#include "cpu.h"
11 12
@@ -161,19 +162,19 @@ static void __cpuinit set_cx86_inc(void)
161static void __cpuinit geode_configure(void) 162static void __cpuinit geode_configure(void)
162{ 163{
163 unsigned long flags; 164 unsigned long flags;
164 u8 ccr3, ccr4; 165 u8 ccr3;
165 local_irq_save(flags); 166 local_irq_save(flags);
166 167
167 /* Suspend on halt power saving and enable #SUSP pin */ 168 /* Suspend on halt power saving and enable #SUSP pin */
168 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); 169 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
169 170
170 ccr3 = getCx86(CX86_CCR3); 171 ccr3 = getCx86(CX86_CCR3);
171 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* Enable */ 172 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
172
173 ccr4 = getCx86(CX86_CCR4);
174 ccr4 |= 0x38; /* FPU fast, DTE cache, Mem bypass */
175 173
176 setCx86(CX86_CCR3, ccr3); 174
175 /* FPU fast, DTE cache, Mem bypass */
176 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
177 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
177 178
178 set_cx86_memwb(); 179 set_cx86_memwb();
179 set_cx86_reorder(); 180 set_cx86_reorder();
@@ -183,14 +184,6 @@ static void __cpuinit geode_configure(void)
183} 184}
184 185
185 186
186#ifdef CONFIG_PCI
187static struct pci_device_id __cpuinitdata cyrix_55x0[] = {
188 { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510) },
189 { PCI_DEVICE(PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520) },
190 { },
191};
192#endif
193
194static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) 187static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
195{ 188{
196 unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0; 189 unsigned char dir0, dir0_msn, dir0_lsn, dir1 = 0;
@@ -258,6 +251,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
258 251
259 case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */ 252 case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */
260#ifdef CONFIG_PCI 253#ifdef CONFIG_PCI
254 {
255 u32 vendor, device;
261 /* It isn't really a PCI quirk directly, but the cure is the 256 /* It isn't really a PCI quirk directly, but the cure is the
262 same. The MediaGX has deep magic SMM stuff that handles the 257 same. The MediaGX has deep magic SMM stuff that handles the
263 SB emulation. It thows away the fifo on disable_dma() which 258 SB emulation. It thows away the fifo on disable_dma() which
@@ -273,22 +268,34 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
273 printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n"); 268 printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
274 isa_dma_bridge_buggy = 2; 269 isa_dma_bridge_buggy = 2;
275 270
271 /* We do this before the PCI layer is running. However we
272 are safe here as we know the bridge must be a Cyrix
273 companion and must be present */
274 vendor = read_pci_config_16(0, 0, 0x12, PCI_VENDOR_ID);
275 device = read_pci_config_16(0, 0, 0x12, PCI_DEVICE_ID);
276 276
277 /* 277 /*
278 * The 5510/5520 companion chips have a funky PIT. 278 * The 5510/5520 companion chips have a funky PIT.
279 */ 279 */
280 if (pci_dev_present(cyrix_55x0)) 280 if (vendor == PCI_VENDOR_ID_CYRIX &&
281 (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520))
281 pit_latch_buggy = 1; 282 pit_latch_buggy = 1;
283 }
282#endif 284#endif
283 c->x86_cache_size=16; /* Yep 16K integrated cache thats it */ 285 c->x86_cache_size=16; /* Yep 16K integrated cache thats it */
284 286
285 /* GXm supports extended cpuid levels 'ala' AMD */ 287 /* GXm supports extended cpuid levels 'ala' AMD */
286 if (c->cpuid_level == 2) { 288 if (c->cpuid_level == 2) {
287 /* Enable cxMMX extensions (GX1 Datasheet 54) */ 289 /* Enable cxMMX extensions (GX1 Datasheet 54) */
288 setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); 290 setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
289 291
290 /* GXlv/GXm/GX1 */ 292 /*
291 if((dir1 >= 0x50 && dir1 <= 0x54) || dir1 >= 0x63) 293 * GXm : 0x30 ... 0x5f GXm datasheet 51
294 * GXlv: 0x6x GXlv datasheet 54
295 * ? : 0x7x
296 * GX1 : 0x8x GX1 datasheet 56
297 */
298 if((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <=dir1 && dir1 <= 0x8f))
292 geode_configure(); 299 geode_configure();
293 get_model_name(c); /* get CPU marketing name */ 300 get_model_name(c); /* get CPU marketing name */
294 return; 301 return;
@@ -415,15 +422,14 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c)
415 422
416 if (dir0 == 5 || dir0 == 3) 423 if (dir0 == 5 || dir0 == 3)
417 { 424 {
418 unsigned char ccr3, ccr4; 425 unsigned char ccr3;
419 unsigned long flags; 426 unsigned long flags;
420 printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n"); 427 printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
421 local_irq_save(flags); 428 local_irq_save(flags);
422 ccr3 = getCx86(CX86_CCR3); 429 ccr3 = getCx86(CX86_CCR3);
423 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 430 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
424 ccr4 = getCx86(CX86_CCR4); 431 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */
425 setCx86(CX86_CCR4, ccr4 | 0x80); /* enable cpuid */ 432 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
426 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
427 local_irq_restore(flags); 433 local_irq_restore(flags);
428 } 434 }
429 } 435 }
diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c
index d555bec0db99..4f10c62d180c 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.c
+++ b/arch/i386/kernel/cpu/mcheck/mce.c
@@ -12,6 +12,7 @@
12 12
13#include <asm/processor.h> 13#include <asm/processor.h>
14#include <asm/system.h> 14#include <asm/system.h>
15#include <asm/mce.h>
15 16
16#include "mce.h" 17#include "mce.h"
17 18
diff --git a/arch/i386/kernel/cpu/mcheck/mce.h b/arch/i386/kernel/cpu/mcheck/mce.h
index 84fd4cf7d0fb..81fb6e2d35f3 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.h
+++ b/arch/i386/kernel/cpu/mcheck/mce.h
@@ -1,4 +1,5 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <asm/mce.h>
2 3
3void amd_mcheck_init(struct cpuinfo_x86 *c); 4void amd_mcheck_init(struct cpuinfo_x86 *c);
4void intel_p4_mcheck_init(struct cpuinfo_x86 *c); 5void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
@@ -9,6 +10,5 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c);
9/* Call the installed machine check handler for this CPU setup. */ 10/* Call the installed machine check handler for this CPU setup. */
10extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code); 11extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code);
11 12
12extern int mce_disabled;
13extern int nr_mce_banks; 13extern int nr_mce_banks;
14 14
diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
index 504434a46011..8359c19d3a23 100644
--- a/arch/i386/kernel/cpu/mcheck/p4.c
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -12,6 +12,7 @@
12#include <asm/system.h> 12#include <asm/system.h>
13#include <asm/msr.h> 13#include <asm/msr.h>
14#include <asm/apic.h> 14#include <asm/apic.h>
15#include <asm/idle.h>
15 16
16#include <asm/therm_throt.h> 17#include <asm/therm_throt.h>
17 18
@@ -59,6 +60,7 @@ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_therm
59 60
60fastcall void smp_thermal_interrupt(struct pt_regs *regs) 61fastcall void smp_thermal_interrupt(struct pt_regs *regs)
61{ 62{
63 exit_idle();
62 irq_enter(); 64 irq_enter();
63 vendor_thermal_interrupt(regs); 65 vendor_thermal_interrupt(regs);
64 irq_exit(); 66 irq_exit();
diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
index ee771f305f96..c7d8f1756745 100644
--- a/arch/i386/kernel/cpu/mtrr/if.c
+++ b/arch/i386/kernel/cpu/mtrr/if.c
@@ -211,6 +211,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
211 default: 211 default:
212 return -ENOTTY; 212 return -ENOTTY;
213 case MTRRIOC_ADD_ENTRY: 213 case MTRRIOC_ADD_ENTRY:
214#ifdef CONFIG_COMPAT
215 case MTRRIOC32_ADD_ENTRY:
216#endif
214 if (!capable(CAP_SYS_ADMIN)) 217 if (!capable(CAP_SYS_ADMIN))
215 return -EPERM; 218 return -EPERM;
216 err = 219 err =
@@ -218,21 +221,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
218 file, 0); 221 file, 0);
219 break; 222 break;
220 case MTRRIOC_SET_ENTRY: 223 case MTRRIOC_SET_ENTRY:
224#ifdef CONFIG_COMPAT
225 case MTRRIOC32_SET_ENTRY:
226#endif
221 if (!capable(CAP_SYS_ADMIN)) 227 if (!capable(CAP_SYS_ADMIN))
222 return -EPERM; 228 return -EPERM;
223 err = mtrr_add(sentry.base, sentry.size, sentry.type, 0); 229 err = mtrr_add(sentry.base, sentry.size, sentry.type, 0);
224 break; 230 break;
225 case MTRRIOC_DEL_ENTRY: 231 case MTRRIOC_DEL_ENTRY:
232#ifdef CONFIG_COMPAT
233 case MTRRIOC32_DEL_ENTRY:
234#endif
226 if (!capable(CAP_SYS_ADMIN)) 235 if (!capable(CAP_SYS_ADMIN))
227 return -EPERM; 236 return -EPERM;
228 err = mtrr_file_del(sentry.base, sentry.size, file, 0); 237 err = mtrr_file_del(sentry.base, sentry.size, file, 0);
229 break; 238 break;
230 case MTRRIOC_KILL_ENTRY: 239 case MTRRIOC_KILL_ENTRY:
240#ifdef CONFIG_COMPAT
241 case MTRRIOC32_KILL_ENTRY:
242#endif
231 if (!capable(CAP_SYS_ADMIN)) 243 if (!capable(CAP_SYS_ADMIN))
232 return -EPERM; 244 return -EPERM;
233 err = mtrr_del(-1, sentry.base, sentry.size); 245 err = mtrr_del(-1, sentry.base, sentry.size);
234 break; 246 break;
235 case MTRRIOC_GET_ENTRY: 247 case MTRRIOC_GET_ENTRY:
248#ifdef CONFIG_COMPAT
249 case MTRRIOC32_GET_ENTRY:
250#endif
236 if (gentry.regnum >= num_var_ranges) 251 if (gentry.regnum >= num_var_ranges)
237 return -EINVAL; 252 return -EINVAL;
238 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); 253 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
@@ -249,6 +264,9 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
249 264
250 break; 265 break;
251 case MTRRIOC_ADD_PAGE_ENTRY: 266 case MTRRIOC_ADD_PAGE_ENTRY:
267#ifdef CONFIG_COMPAT
268 case MTRRIOC32_ADD_PAGE_ENTRY:
269#endif
252 if (!capable(CAP_SYS_ADMIN)) 270 if (!capable(CAP_SYS_ADMIN))
253 return -EPERM; 271 return -EPERM;
254 err = 272 err =
@@ -256,21 +274,33 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
256 file, 1); 274 file, 1);
257 break; 275 break;
258 case MTRRIOC_SET_PAGE_ENTRY: 276 case MTRRIOC_SET_PAGE_ENTRY:
277#ifdef CONFIG_COMPAT
278 case MTRRIOC32_SET_PAGE_ENTRY:
279#endif
259 if (!capable(CAP_SYS_ADMIN)) 280 if (!capable(CAP_SYS_ADMIN))
260 return -EPERM; 281 return -EPERM;
261 err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0); 282 err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0);
262 break; 283 break;
263 case MTRRIOC_DEL_PAGE_ENTRY: 284 case MTRRIOC_DEL_PAGE_ENTRY:
285#ifdef CONFIG_COMPAT
286 case MTRRIOC32_DEL_PAGE_ENTRY:
287#endif
264 if (!capable(CAP_SYS_ADMIN)) 288 if (!capable(CAP_SYS_ADMIN))
265 return -EPERM; 289 return -EPERM;
266 err = mtrr_file_del(sentry.base, sentry.size, file, 1); 290 err = mtrr_file_del(sentry.base, sentry.size, file, 1);
267 break; 291 break;
268 case MTRRIOC_KILL_PAGE_ENTRY: 292 case MTRRIOC_KILL_PAGE_ENTRY:
293#ifdef CONFIG_COMPAT
294 case MTRRIOC32_KILL_PAGE_ENTRY:
295#endif
269 if (!capable(CAP_SYS_ADMIN)) 296 if (!capable(CAP_SYS_ADMIN))
270 return -EPERM; 297 return -EPERM;
271 err = mtrr_del_page(-1, sentry.base, sentry.size); 298 err = mtrr_del_page(-1, sentry.base, sentry.size);
272 break; 299 break;
273 case MTRRIOC_GET_PAGE_ENTRY: 300 case MTRRIOC_GET_PAGE_ENTRY:
301#ifdef CONFIG_COMPAT
302 case MTRRIOC32_GET_PAGE_ENTRY:
303#endif
274 if (gentry.regnum >= num_var_ranges) 304 if (gentry.regnum >= num_var_ranges)
275 return -EINVAL; 305 return -EINVAL;
276 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type); 306 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index 16bb7ea87145..0acfb6a5a220 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -50,7 +50,7 @@ u32 num_var_ranges = 0;
50unsigned int *usage_table; 50unsigned int *usage_table;
51static DEFINE_MUTEX(mtrr_mutex); 51static DEFINE_MUTEX(mtrr_mutex);
52 52
53u32 size_or_mask, size_and_mask; 53u64 size_or_mask, size_and_mask;
54 54
55static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; 55static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {};
56 56
@@ -662,8 +662,8 @@ void __init mtrr_bp_init(void)
662 boot_cpu_data.x86_mask == 0x4)) 662 boot_cpu_data.x86_mask == 0x4))
663 phys_addr = 36; 663 phys_addr = 36;
664 664
665 size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1); 665 size_or_mask = ~((1ULL << (phys_addr - PAGE_SHIFT)) - 1);
666 size_and_mask = ~size_or_mask & 0xfff00000; 666 size_and_mask = ~size_or_mask & 0xfffff00000ULL;
667 } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR && 667 } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
668 boot_cpu_data.x86 == 6) { 668 boot_cpu_data.x86 == 6) {
669 /* VIA C* family have Intel style MTRRs, but 669 /* VIA C* family have Intel style MTRRs, but
diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
index d61ea9db6cfe..289dfe6030e3 100644
--- a/arch/i386/kernel/cpu/mtrr/mtrr.h
+++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -84,7 +84,7 @@ void get_mtrr_state(void);
84 84
85extern void set_mtrr_ops(struct mtrr_ops * ops); 85extern void set_mtrr_ops(struct mtrr_ops * ops);
86 86
87extern u32 size_or_mask, size_and_mask; 87extern u64 size_or_mask, size_and_mask;
88extern struct mtrr_ops * mtrr_if; 88extern struct mtrr_ops * mtrr_if;
89 89
90#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) 90#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
index 6624d8583c42..47e3ebbfb28d 100644
--- a/arch/i386/kernel/cpu/proc.c
+++ b/arch/i386/kernel/cpu/proc.c
@@ -29,7 +29,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
29 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 29 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
30 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, 30 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
31 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, 31 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
32 NULL, "fxsr_opt", "rdtscp", NULL, NULL, "lm", "3dnowext", "3dnow", 32 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", "3dnowext", "3dnow",
33 33
34 /* Transmeta-defined */ 34 /* Transmeta-defined */
35 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, 35 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
@@ -47,7 +47,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
47 /* Intel-defined (#2) */ 47 /* Intel-defined (#2) */
48 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", 48 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
49 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, 49 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
50 NULL, NULL, "dca", NULL, NULL, NULL, NULL, NULL, 50 NULL, NULL, "dca", NULL, NULL, NULL, NULL, "popcnt",
51 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 51 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
52 52
53 /* VIA/Cyrix/Centaur-defined */ 53 /* VIA/Cyrix/Centaur-defined */
@@ -57,8 +57,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
57 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 57 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
58 58
59 /* AMD-defined (#2) */ 59 /* AMD-defined (#2) */
60 "lahf_lm", "cmp_legacy", "svm", NULL, "cr8legacy", NULL, NULL, NULL, 60 "lahf_lm", "cmp_legacy", "svm", "extapic", "cr8legacy", "abm",
61 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 61 "sse4a", "misalignsse",
62 "3dnowprefetch", "osvw", "ibs", NULL, NULL, NULL, NULL, NULL,
62 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 64 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
64 }; 65 };
@@ -69,8 +70,11 @@ static int show_cpuinfo(struct seq_file *m, void *v)
69 "ttp", /* thermal trip */ 70 "ttp", /* thermal trip */
70 "tm", 71 "tm",
71 "stc", 72 "stc",
73 "100mhzsteps",
74 "hwpstate",
72 NULL, 75 NULL,
73 /* nothing */ /* constant_tsc - moved to flags */ 76 NULL, /* constant_tsc - moved to flags */
77 /* nothing */
74 }; 78 };
75 struct cpuinfo_x86 *c = v; 79 struct cpuinfo_x86 *c = v;
76 int i, n = c - cpu_data; 80 int i, n = c - cpu_data;
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
index 4056fb7d2cdf..5678d46863c6 100644
--- a/arch/i386/kernel/cpu/transmeta.c
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -9,7 +9,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
9{ 9{
10 unsigned int cap_mask, uk, max, dummy; 10 unsigned int cap_mask, uk, max, dummy;
11 unsigned int cms_rev1, cms_rev2; 11 unsigned int cms_rev1, cms_rev2;
12 unsigned int cpu_rev, cpu_freq, cpu_flags, new_cpu_rev; 12 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
13 char cpu_info[65]; 13 char cpu_info[65];
14 14
15 get_model_name(c); /* Same as AMD/Cyrix */ 15 get_model_name(c); /* Same as AMD/Cyrix */
@@ -72,6 +72,9 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
72 wrmsr(0x80860004, ~0, uk); 72 wrmsr(0x80860004, ~0, uk);
73 c->x86_capability[0] = cpuid_edx(0x00000001); 73 c->x86_capability[0] = cpuid_edx(0x00000001);
74 wrmsr(0x80860004, cap_mask, uk); 74 wrmsr(0x80860004, cap_mask, uk);
75
76 /* All Transmeta CPUs have a constant TSC */
77 set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
75 78
76 /* If we can run i686 user-space code, call us an i686 */ 79 /* If we can run i686 user-space code, call us an i686 */
77#define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV) 80#define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV)
diff --git a/arch/i386/kernel/cpuid.c b/arch/i386/kernel/cpuid.c
index 4da75fa3208d..eeae0d992337 100644
--- a/arch/i386/kernel/cpuid.c
+++ b/arch/i386/kernel/cpuid.c
@@ -48,7 +48,6 @@ static struct class *cpuid_class;
48#ifdef CONFIG_SMP 48#ifdef CONFIG_SMP
49 49
50struct cpuid_command { 50struct cpuid_command {
51 int cpu;
52 u32 reg; 51 u32 reg;
53 u32 *data; 52 u32 *data;
54}; 53};
@@ -57,8 +56,7 @@ static void cpuid_smp_cpuid(void *cmd_block)
57{ 56{
58 struct cpuid_command *cmd = (struct cpuid_command *)cmd_block; 57 struct cpuid_command *cmd = (struct cpuid_command *)cmd_block;
59 58
60 if (cmd->cpu == smp_processor_id()) 59 cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
61 cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2],
62 &cmd->data[3]); 60 &cmd->data[3]);
63} 61}
64 62
@@ -70,11 +68,10 @@ static inline void do_cpuid(int cpu, u32 reg, u32 * data)
70 if (cpu == smp_processor_id()) { 68 if (cpu == smp_processor_id()) {
71 cpuid(reg, &data[0], &data[1], &data[2], &data[3]); 69 cpuid(reg, &data[0], &data[1], &data[2], &data[3]);
72 } else { 70 } else {
73 cmd.cpu = cpu;
74 cmd.reg = reg; 71 cmd.reg = reg;
75 cmd.data = data; 72 cmd.data = data;
76 73
77 smp_call_function(cpuid_smp_cpuid, &cmd, 1, 1); 74 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1);
78 } 75 }
79 preempt_enable(); 76 preempt_enable();
80} 77}
diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c
index f391abcf7da9..70f39560846a 100644
--- a/arch/i386/kernel/e820.c
+++ b/arch/i386/kernel/e820.c
@@ -14,6 +14,7 @@
14#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15#include <asm/page.h> 15#include <asm/page.h>
16#include <asm/e820.h> 16#include <asm/e820.h>
17#include <asm/setup.h>
17 18
18#ifdef CONFIG_EFI 19#ifdef CONFIG_EFI
19int efi_enabled = 0; 20int efi_enabled = 0;
@@ -156,21 +157,22 @@ static struct resource standard_io_resources[] = { {
156 .flags = IORESOURCE_BUSY | IORESOURCE_IO 157 .flags = IORESOURCE_BUSY | IORESOURCE_IO
157} }; 158} };
158 159
159static int romsignature(const unsigned char *x) 160#define ROMSIGNATURE 0xaa55
161
162static int __init romsignature(const unsigned char *rom)
160{ 163{
161 unsigned short sig; 164 unsigned short sig;
162 int ret = 0; 165
163 if (probe_kernel_address((const unsigned short *)x, sig) == 0) 166 return probe_kernel_address((const unsigned short *)rom, sig) == 0 &&
164 ret = (sig == 0xaa55); 167 sig == ROMSIGNATURE;
165 return ret;
166} 168}
167 169
168static int __init romchecksum(unsigned char *rom, unsigned long length) 170static int __init romchecksum(unsigned char *rom, unsigned long length)
169{ 171{
170 unsigned char *p, sum = 0; 172 unsigned char sum;
171 173
172 for (p = rom; p < rom + length; p++) 174 for (sum = 0; length; length--)
173 sum += *p; 175 sum += *rom++;
174 return sum == 0; 176 return sum == 0;
175} 177}
176 178
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5e47683fc63a..18bddcb8e9e8 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -30,7 +30,7 @@
30 * 18(%esp) - %eax 30 * 18(%esp) - %eax
31 * 1C(%esp) - %ds 31 * 1C(%esp) - %ds
32 * 20(%esp) - %es 32 * 20(%esp) - %es
33 * 24(%esp) - %gs 33 * 24(%esp) - %fs
34 * 28(%esp) - orig_eax 34 * 28(%esp) - orig_eax
35 * 2C(%esp) - %eip 35 * 2C(%esp) - %eip
36 * 30(%esp) - %cs 36 * 30(%esp) - %cs
@@ -99,9 +99,9 @@ VM_MASK = 0x00020000
99 99
100#define SAVE_ALL \ 100#define SAVE_ALL \
101 cld; \ 101 cld; \
102 pushl %gs; \ 102 pushl %fs; \
103 CFI_ADJUST_CFA_OFFSET 4;\ 103 CFI_ADJUST_CFA_OFFSET 4;\
104 /*CFI_REL_OFFSET gs, 0;*/\ 104 /*CFI_REL_OFFSET fs, 0;*/\
105 pushl %es; \ 105 pushl %es; \
106 CFI_ADJUST_CFA_OFFSET 4;\ 106 CFI_ADJUST_CFA_OFFSET 4;\
107 /*CFI_REL_OFFSET es, 0;*/\ 107 /*CFI_REL_OFFSET es, 0;*/\
@@ -133,7 +133,7 @@ VM_MASK = 0x00020000
133 movl %edx, %ds; \ 133 movl %edx, %ds; \
134 movl %edx, %es; \ 134 movl %edx, %es; \
135 movl $(__KERNEL_PDA), %edx; \ 135 movl $(__KERNEL_PDA), %edx; \
136 movl %edx, %gs 136 movl %edx, %fs
137 137
138#define RESTORE_INT_REGS \ 138#define RESTORE_INT_REGS \
139 popl %ebx; \ 139 popl %ebx; \
@@ -166,9 +166,9 @@ VM_MASK = 0x00020000
1662: popl %es; \ 1662: popl %es; \
167 CFI_ADJUST_CFA_OFFSET -4;\ 167 CFI_ADJUST_CFA_OFFSET -4;\
168 /*CFI_RESTORE es;*/\ 168 /*CFI_RESTORE es;*/\
1693: popl %gs; \ 1693: popl %fs; \
170 CFI_ADJUST_CFA_OFFSET -4;\ 170 CFI_ADJUST_CFA_OFFSET -4;\
171 /*CFI_RESTORE gs;*/\ 171 /*CFI_RESTORE fs;*/\
172.pushsection .fixup,"ax"; \ 172.pushsection .fixup,"ax"; \
1734: movl $0,(%esp); \ 1734: movl $0,(%esp); \
174 jmp 1b; \ 174 jmp 1b; \
@@ -227,6 +227,7 @@ ENTRY(ret_from_fork)
227 CFI_ADJUST_CFA_OFFSET -4 227 CFI_ADJUST_CFA_OFFSET -4
228 jmp syscall_exit 228 jmp syscall_exit
229 CFI_ENDPROC 229 CFI_ENDPROC
230END(ret_from_fork)
230 231
231/* 232/*
232 * Return to user mode is not as complex as all this looks, 233 * Return to user mode is not as complex as all this looks,
@@ -258,6 +259,7 @@ ENTRY(resume_userspace)
258 # int/exception return? 259 # int/exception return?
259 jne work_pending 260 jne work_pending
260 jmp restore_all 261 jmp restore_all
262END(ret_from_exception)
261 263
262#ifdef CONFIG_PREEMPT 264#ifdef CONFIG_PREEMPT
263ENTRY(resume_kernel) 265ENTRY(resume_kernel)
@@ -272,6 +274,7 @@ need_resched:
272 jz restore_all 274 jz restore_all
273 call preempt_schedule_irq 275 call preempt_schedule_irq
274 jmp need_resched 276 jmp need_resched
277END(resume_kernel)
275#endif 278#endif
276 CFI_ENDPROC 279 CFI_ENDPROC
277 280
@@ -349,16 +352,17 @@ sysenter_past_esp:
349 movl PT_OLDESP(%esp), %ecx 352 movl PT_OLDESP(%esp), %ecx
350 xorl %ebp,%ebp 353 xorl %ebp,%ebp
351 TRACE_IRQS_ON 354 TRACE_IRQS_ON
3521: mov PT_GS(%esp), %gs 3551: mov PT_FS(%esp), %fs
353 ENABLE_INTERRUPTS_SYSEXIT 356 ENABLE_INTERRUPTS_SYSEXIT
354 CFI_ENDPROC 357 CFI_ENDPROC
355.pushsection .fixup,"ax" 358.pushsection .fixup,"ax"
3562: movl $0,PT_GS(%esp) 3592: movl $0,PT_FS(%esp)
357 jmp 1b 360 jmp 1b
358.section __ex_table,"a" 361.section __ex_table,"a"
359 .align 4 362 .align 4
360 .long 1b,2b 363 .long 1b,2b
361.popsection 364.popsection
365ENDPROC(sysenter_entry)
362 366
363 # system call handler stub 367 # system call handler stub
364ENTRY(system_call) 368ENTRY(system_call)
@@ -459,6 +463,7 @@ ldt_ss:
459 CFI_ADJUST_CFA_OFFSET -8 463 CFI_ADJUST_CFA_OFFSET -8
460 jmp restore_nocheck 464 jmp restore_nocheck
461 CFI_ENDPROC 465 CFI_ENDPROC
466ENDPROC(system_call)
462 467
463 # perform work that needs to be done immediately before resumption 468 # perform work that needs to be done immediately before resumption
464 ALIGN 469 ALIGN
@@ -504,6 +509,7 @@ work_notifysig_v86:
504 xorl %edx, %edx 509 xorl %edx, %edx
505 call do_notify_resume 510 call do_notify_resume
506 jmp resume_userspace_sig 511 jmp resume_userspace_sig
512END(work_pending)
507 513
508 # perform syscall exit tracing 514 # perform syscall exit tracing
509 ALIGN 515 ALIGN
@@ -519,6 +525,7 @@ syscall_trace_entry:
519 cmpl $(nr_syscalls), %eax 525 cmpl $(nr_syscalls), %eax
520 jnae syscall_call 526 jnae syscall_call
521 jmp syscall_exit 527 jmp syscall_exit
528END(syscall_trace_entry)
522 529
523 # perform syscall exit tracing 530 # perform syscall exit tracing
524 ALIGN 531 ALIGN
@@ -532,6 +539,7 @@ syscall_exit_work:
532 movl $1, %edx 539 movl $1, %edx
533 call do_syscall_trace 540 call do_syscall_trace
534 jmp resume_userspace 541 jmp resume_userspace
542END(syscall_exit_work)
535 CFI_ENDPROC 543 CFI_ENDPROC
536 544
537 RING0_INT_FRAME # can't unwind into user space anyway 545 RING0_INT_FRAME # can't unwind into user space anyway
@@ -542,15 +550,17 @@ syscall_fault:
542 GET_THREAD_INFO(%ebp) 550 GET_THREAD_INFO(%ebp)
543 movl $-EFAULT,PT_EAX(%esp) 551 movl $-EFAULT,PT_EAX(%esp)
544 jmp resume_userspace 552 jmp resume_userspace
553END(syscall_fault)
545 554
546syscall_badsys: 555syscall_badsys:
547 movl $-ENOSYS,PT_EAX(%esp) 556 movl $-ENOSYS,PT_EAX(%esp)
548 jmp resume_userspace 557 jmp resume_userspace
558END(syscall_badsys)
549 CFI_ENDPROC 559 CFI_ENDPROC
550 560
551#define FIXUP_ESPFIX_STACK \ 561#define FIXUP_ESPFIX_STACK \
552 /* since we are on a wrong stack, we cant make it a C code :( */ \ 562 /* since we are on a wrong stack, we cant make it a C code :( */ \
553 movl %gs:PDA_cpu, %ebx; \ 563 movl %fs:PDA_cpu, %ebx; \
554 PER_CPU(cpu_gdt_descr, %ebx); \ 564 PER_CPU(cpu_gdt_descr, %ebx); \
555 movl GDS_address(%ebx), %ebx; \ 565 movl GDS_address(%ebx), %ebx; \
556 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ 566 GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
@@ -581,9 +591,9 @@ syscall_badsys:
581ENTRY(interrupt) 591ENTRY(interrupt)
582.text 592.text
583 593
584vector=0
585ENTRY(irq_entries_start) 594ENTRY(irq_entries_start)
586 RING0_INT_FRAME 595 RING0_INT_FRAME
596vector=0
587.rept NR_IRQS 597.rept NR_IRQS
588 ALIGN 598 ALIGN
589 .if vector 599 .if vector
@@ -592,11 +602,16 @@ ENTRY(irq_entries_start)
5921: pushl $~(vector) 6021: pushl $~(vector)
593 CFI_ADJUST_CFA_OFFSET 4 603 CFI_ADJUST_CFA_OFFSET 4
594 jmp common_interrupt 604 jmp common_interrupt
595.data 605 .previous
596 .long 1b 606 .long 1b
597.text 607 .text
598vector=vector+1 608vector=vector+1
599.endr 609.endr
610END(irq_entries_start)
611
612.previous
613END(interrupt)
614.previous
600 615
601/* 616/*
602 * the CPU automatically disables interrupts when executing an IRQ vector, 617 * the CPU automatically disables interrupts when executing an IRQ vector,
@@ -609,6 +624,7 @@ common_interrupt:
609 movl %esp,%eax 624 movl %esp,%eax
610 call do_IRQ 625 call do_IRQ
611 jmp ret_from_intr 626 jmp ret_from_intr
627ENDPROC(common_interrupt)
612 CFI_ENDPROC 628 CFI_ENDPROC
613 629
614#define BUILD_INTERRUPT(name, nr) \ 630#define BUILD_INTERRUPT(name, nr) \
@@ -621,18 +637,24 @@ ENTRY(name) \
621 movl %esp,%eax; \ 637 movl %esp,%eax; \
622 call smp_/**/name; \ 638 call smp_/**/name; \
623 jmp ret_from_intr; \ 639 jmp ret_from_intr; \
624 CFI_ENDPROC 640 CFI_ENDPROC; \
641ENDPROC(name)
625 642
626/* The include is where all of the SMP etc. interrupts come from */ 643/* The include is where all of the SMP etc. interrupts come from */
627#include "entry_arch.h" 644#include "entry_arch.h"
628 645
646/* This alternate entry is needed because we hijack the apic LVTT */
647#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC)
648BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR)
649#endif
650
629KPROBE_ENTRY(page_fault) 651KPROBE_ENTRY(page_fault)
630 RING0_EC_FRAME 652 RING0_EC_FRAME
631 pushl $do_page_fault 653 pushl $do_page_fault
632 CFI_ADJUST_CFA_OFFSET 4 654 CFI_ADJUST_CFA_OFFSET 4
633 ALIGN 655 ALIGN
634error_code: 656error_code:
635 /* the function address is in %gs's slot on the stack */ 657 /* the function address is in %fs's slot on the stack */
636 pushl %es 658 pushl %es
637 CFI_ADJUST_CFA_OFFSET 4 659 CFI_ADJUST_CFA_OFFSET 4
638 /*CFI_REL_OFFSET es, 0*/ 660 /*CFI_REL_OFFSET es, 0*/
@@ -661,20 +683,20 @@ error_code:
661 CFI_ADJUST_CFA_OFFSET 4 683 CFI_ADJUST_CFA_OFFSET 4
662 CFI_REL_OFFSET ebx, 0 684 CFI_REL_OFFSET ebx, 0
663 cld 685 cld
664 pushl %gs 686 pushl %fs
665 CFI_ADJUST_CFA_OFFSET 4 687 CFI_ADJUST_CFA_OFFSET 4
666 /*CFI_REL_OFFSET gs, 0*/ 688 /*CFI_REL_OFFSET fs, 0*/
667 movl $(__KERNEL_PDA), %ecx 689 movl $(__KERNEL_PDA), %ecx
668 movl %ecx, %gs 690 movl %ecx, %fs
669 UNWIND_ESPFIX_STACK 691 UNWIND_ESPFIX_STACK
670 popl %ecx 692 popl %ecx
671 CFI_ADJUST_CFA_OFFSET -4 693 CFI_ADJUST_CFA_OFFSET -4
672 /*CFI_REGISTER es, ecx*/ 694 /*CFI_REGISTER es, ecx*/
673 movl PT_GS(%esp), %edi # get the function address 695 movl PT_FS(%esp), %edi # get the function address
674 movl PT_ORIG_EAX(%esp), %edx # get the error code 696 movl PT_ORIG_EAX(%esp), %edx # get the error code
675 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart 697 movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
676 mov %ecx, PT_GS(%esp) 698 mov %ecx, PT_FS(%esp)
677 /*CFI_REL_OFFSET gs, ES*/ 699 /*CFI_REL_OFFSET fs, ES*/
678 movl $(__USER_DS), %ecx 700 movl $(__USER_DS), %ecx
679 movl %ecx, %ds 701 movl %ecx, %ds
680 movl %ecx, %es 702 movl %ecx, %es
@@ -692,6 +714,7 @@ ENTRY(coprocessor_error)
692 CFI_ADJUST_CFA_OFFSET 4 714 CFI_ADJUST_CFA_OFFSET 4
693 jmp error_code 715 jmp error_code
694 CFI_ENDPROC 716 CFI_ENDPROC
717END(coprocessor_error)
695 718
696ENTRY(simd_coprocessor_error) 719ENTRY(simd_coprocessor_error)
697 RING0_INT_FRAME 720 RING0_INT_FRAME
@@ -701,6 +724,7 @@ ENTRY(simd_coprocessor_error)
701 CFI_ADJUST_CFA_OFFSET 4 724 CFI_ADJUST_CFA_OFFSET 4
702 jmp error_code 725 jmp error_code
703 CFI_ENDPROC 726 CFI_ENDPROC
727END(simd_coprocessor_error)
704 728
705ENTRY(device_not_available) 729ENTRY(device_not_available)
706 RING0_INT_FRAME 730 RING0_INT_FRAME
@@ -721,6 +745,7 @@ device_not_available_emulate:
721 CFI_ADJUST_CFA_OFFSET -4 745 CFI_ADJUST_CFA_OFFSET -4
722 jmp ret_from_exception 746 jmp ret_from_exception
723 CFI_ENDPROC 747 CFI_ENDPROC
748END(device_not_available)
724 749
725/* 750/*
726 * Debug traps and NMI can happen at the one SYSENTER instruction 751 * Debug traps and NMI can happen at the one SYSENTER instruction
@@ -864,10 +889,12 @@ ENTRY(native_iret)
864 .align 4 889 .align 4
865 .long 1b,iret_exc 890 .long 1b,iret_exc
866.previous 891.previous
892END(native_iret)
867 893
868ENTRY(native_irq_enable_sysexit) 894ENTRY(native_irq_enable_sysexit)
869 sti 895 sti
870 sysexit 896 sysexit
897END(native_irq_enable_sysexit)
871#endif 898#endif
872 899
873KPROBE_ENTRY(int3) 900KPROBE_ENTRY(int3)
@@ -890,6 +917,7 @@ ENTRY(overflow)
890 CFI_ADJUST_CFA_OFFSET 4 917 CFI_ADJUST_CFA_OFFSET 4
891 jmp error_code 918 jmp error_code
892 CFI_ENDPROC 919 CFI_ENDPROC
920END(overflow)
893 921
894ENTRY(bounds) 922ENTRY(bounds)
895 RING0_INT_FRAME 923 RING0_INT_FRAME
@@ -899,6 +927,7 @@ ENTRY(bounds)
899 CFI_ADJUST_CFA_OFFSET 4 927 CFI_ADJUST_CFA_OFFSET 4
900 jmp error_code 928 jmp error_code
901 CFI_ENDPROC 929 CFI_ENDPROC
930END(bounds)
902 931
903ENTRY(invalid_op) 932ENTRY(invalid_op)
904 RING0_INT_FRAME 933 RING0_INT_FRAME
@@ -908,6 +937,7 @@ ENTRY(invalid_op)
908 CFI_ADJUST_CFA_OFFSET 4 937 CFI_ADJUST_CFA_OFFSET 4
909 jmp error_code 938 jmp error_code
910 CFI_ENDPROC 939 CFI_ENDPROC
940END(invalid_op)
911 941
912ENTRY(coprocessor_segment_overrun) 942ENTRY(coprocessor_segment_overrun)
913 RING0_INT_FRAME 943 RING0_INT_FRAME
@@ -917,6 +947,7 @@ ENTRY(coprocessor_segment_overrun)
917 CFI_ADJUST_CFA_OFFSET 4 947 CFI_ADJUST_CFA_OFFSET 4
918 jmp error_code 948 jmp error_code
919 CFI_ENDPROC 949 CFI_ENDPROC
950END(coprocessor_segment_overrun)
920 951
921ENTRY(invalid_TSS) 952ENTRY(invalid_TSS)
922 RING0_EC_FRAME 953 RING0_EC_FRAME
@@ -924,6 +955,7 @@ ENTRY(invalid_TSS)
924 CFI_ADJUST_CFA_OFFSET 4 955 CFI_ADJUST_CFA_OFFSET 4
925 jmp error_code 956 jmp error_code
926 CFI_ENDPROC 957 CFI_ENDPROC
958END(invalid_TSS)
927 959
928ENTRY(segment_not_present) 960ENTRY(segment_not_present)
929 RING0_EC_FRAME 961 RING0_EC_FRAME
@@ -931,6 +963,7 @@ ENTRY(segment_not_present)
931 CFI_ADJUST_CFA_OFFSET 4 963 CFI_ADJUST_CFA_OFFSET 4
932 jmp error_code 964 jmp error_code
933 CFI_ENDPROC 965 CFI_ENDPROC
966END(segment_not_present)
934 967
935ENTRY(stack_segment) 968ENTRY(stack_segment)
936 RING0_EC_FRAME 969 RING0_EC_FRAME
@@ -938,6 +971,7 @@ ENTRY(stack_segment)
938 CFI_ADJUST_CFA_OFFSET 4 971 CFI_ADJUST_CFA_OFFSET 4
939 jmp error_code 972 jmp error_code
940 CFI_ENDPROC 973 CFI_ENDPROC
974END(stack_segment)
941 975
942KPROBE_ENTRY(general_protection) 976KPROBE_ENTRY(general_protection)
943 RING0_EC_FRAME 977 RING0_EC_FRAME
@@ -953,6 +987,7 @@ ENTRY(alignment_check)
953 CFI_ADJUST_CFA_OFFSET 4 987 CFI_ADJUST_CFA_OFFSET 4
954 jmp error_code 988 jmp error_code
955 CFI_ENDPROC 989 CFI_ENDPROC
990END(alignment_check)
956 991
957ENTRY(divide_error) 992ENTRY(divide_error)
958 RING0_INT_FRAME 993 RING0_INT_FRAME
@@ -962,6 +997,7 @@ ENTRY(divide_error)
962 CFI_ADJUST_CFA_OFFSET 4 997 CFI_ADJUST_CFA_OFFSET 4
963 jmp error_code 998 jmp error_code
964 CFI_ENDPROC 999 CFI_ENDPROC
1000END(divide_error)
965 1001
966#ifdef CONFIG_X86_MCE 1002#ifdef CONFIG_X86_MCE
967ENTRY(machine_check) 1003ENTRY(machine_check)
@@ -972,6 +1008,7 @@ ENTRY(machine_check)
972 CFI_ADJUST_CFA_OFFSET 4 1008 CFI_ADJUST_CFA_OFFSET 4
973 jmp error_code 1009 jmp error_code
974 CFI_ENDPROC 1010 CFI_ENDPROC
1011END(machine_check)
975#endif 1012#endif
976 1013
977ENTRY(spurious_interrupt_bug) 1014ENTRY(spurious_interrupt_bug)
@@ -982,6 +1019,7 @@ ENTRY(spurious_interrupt_bug)
982 CFI_ADJUST_CFA_OFFSET 4 1019 CFI_ADJUST_CFA_OFFSET 4
983 jmp error_code 1020 jmp error_code
984 CFI_ENDPROC 1021 CFI_ENDPROC
1022END(spurious_interrupt_bug)
985 1023
986ENTRY(kernel_thread_helper) 1024ENTRY(kernel_thread_helper)
987 pushl $0 # fake return address for unwinder 1025 pushl $0 # fake return address for unwinder
diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index cb9abdfced9b..3fa7f9389afe 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -53,6 +53,7 @@
53 * any particular GDT layout, because we load our own as soon as we 53 * any particular GDT layout, because we load our own as soon as we
54 * can. 54 * can.
55 */ 55 */
56.section .text.head,"ax",@progbits
56ENTRY(startup_32) 57ENTRY(startup_32)
57 58
58#ifdef CONFIG_PARAVIRT 59#ifdef CONFIG_PARAVIRT
@@ -141,16 +142,25 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
141 jb 10b 142 jb 10b
142 movl %edi,(init_pg_tables_end - __PAGE_OFFSET) 143 movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
143 144
144#ifdef CONFIG_SMP
145 xorl %ebx,%ebx /* This is the boot CPU (BSP) */ 145 xorl %ebx,%ebx /* This is the boot CPU (BSP) */
146 jmp 3f 146 jmp 3f
147
148/* 147/*
149 * Non-boot CPU entry point; entered from trampoline.S 148 * Non-boot CPU entry point; entered from trampoline.S
150 * We can't lgdt here, because lgdt itself uses a data segment, but 149 * We can't lgdt here, because lgdt itself uses a data segment, but
151 * we know the trampoline has already loaded the boot_gdt_table GDT 150 * we know the trampoline has already loaded the boot_gdt_table GDT
152 * for us. 151 * for us.
152 *
153 * If cpu hotplug is not supported then this code can go in init section
154 * which will be freed later
153 */ 155 */
156
157#ifdef CONFIG_HOTPLUG_CPU
158.section .text,"ax",@progbits
159#else
160.section .init.text,"ax",@progbits
161#endif
162
163#ifdef CONFIG_SMP
154ENTRY(startup_32_smp) 164ENTRY(startup_32_smp)
155 cld 165 cld
156 movl $(__BOOT_DS),%eax 166 movl $(__BOOT_DS),%eax
@@ -208,8 +218,8 @@ ENTRY(startup_32_smp)
208 xorl %ebx,%ebx 218 xorl %ebx,%ebx
209 incl %ebx 219 incl %ebx
210 220
2113:
212#endif /* CONFIG_SMP */ 221#endif /* CONFIG_SMP */
2223:
213 223
214/* 224/*
215 * Enable paging 225 * Enable paging
@@ -309,7 +319,7 @@ is386: movl $2,%ecx # set MP
309 319
310 call check_x87 320 call check_x87
311 call setup_pda 321 call setup_pda
312 lgdt cpu_gdt_descr 322 lgdt early_gdt_descr
313 lidt idt_descr 323 lidt idt_descr
314 ljmp $(__KERNEL_CS),$1f 324 ljmp $(__KERNEL_CS),$1f
3151: movl $(__KERNEL_DS),%eax # reload all the segment registers 3251: movl $(__KERNEL_DS),%eax # reload all the segment registers
@@ -319,12 +329,12 @@ is386: movl $2,%ecx # set MP
319 movl %eax,%ds 329 movl %eax,%ds
320 movl %eax,%es 330 movl %eax,%es
321 331
322 xorl %eax,%eax # Clear FS and LDT 332 xorl %eax,%eax # Clear GS and LDT
323 movl %eax,%fs 333 movl %eax,%gs
324 lldt %ax 334 lldt %ax
325 335
326 movl $(__KERNEL_PDA),%eax 336 movl $(__KERNEL_PDA),%eax
327 mov %eax,%gs 337 mov %eax,%fs
328 338
329 cld # gcc2 wants the direction flag cleared at all times 339 cld # gcc2 wants the direction flag cleared at all times
330 pushl $0 # fake return address for unwinder 340 pushl $0 # fake return address for unwinder
@@ -360,12 +370,12 @@ check_x87:
360 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be 370 * cpu_gdt_table and boot_pda; for secondary CPUs, these will be
361 * that CPU's GDT and PDA. 371 * that CPU's GDT and PDA.
362 */ 372 */
363setup_pda: 373ENTRY(setup_pda)
364 /* get the PDA pointer */ 374 /* get the PDA pointer */
365 movl start_pda, %eax 375 movl start_pda, %eax
366 376
367 /* slot the PDA address into the GDT */ 377 /* slot the PDA address into the GDT */
368 mov cpu_gdt_descr+2, %ecx 378 mov early_gdt_descr+2, %ecx
369 mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ 379 mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */
370 shr $16, %eax 380 shr $16, %eax
371 mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ 381 mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */
@@ -492,6 +502,7 @@ ignore_int:
492#endif 502#endif
493 iret 503 iret
494 504
505.section .text
495#ifdef CONFIG_PARAVIRT 506#ifdef CONFIG_PARAVIRT
496startup_paravirt: 507startup_paravirt:
497 cld 508 cld
@@ -502,10 +513,11 @@ startup_paravirt:
502 pushl %ecx 513 pushl %ecx
503 pushl %eax 514 pushl %eax
504 515
505 /* paravirt.o is last in link, and that probe fn never returns */
506 pushl $__start_paravirtprobe 516 pushl $__start_paravirtprobe
5071: 5171:
508 movl 0(%esp), %eax 518 movl 0(%esp), %eax
519 cmpl $__stop_paravirtprobe, %eax
520 je unhandled_paravirt
509 pushl (%eax) 521 pushl (%eax)
510 movl 8(%esp), %eax 522 movl 8(%esp), %eax
511 call *(%esp) 523 call *(%esp)
@@ -517,6 +529,10 @@ startup_paravirt:
517 529
518 addl $4, (%esp) 530 addl $4, (%esp)
519 jmp 1b 531 jmp 1b
532
533unhandled_paravirt:
534 /* Nothing wanted us: we're screwed. */
535 ud2
520#endif 536#endif
521 537
522/* 538/*
@@ -581,7 +597,7 @@ idt_descr:
581 597
582# boot GDT descriptor (later on used by CPU#0): 598# boot GDT descriptor (later on used by CPU#0):
583 .word 0 # 32 bit align gdt_desc.address 599 .word 0 # 32 bit align gdt_desc.address
584ENTRY(cpu_gdt_descr) 600ENTRY(early_gdt_descr)
585 .word GDT_ENTRIES*8-1 601 .word GDT_ENTRIES*8-1
586 .long cpu_gdt_table 602 .long cpu_gdt_table
587 603
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index ba8d302a0b72..e30ccedad0b9 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -1920,7 +1920,7 @@ static void __init setup_ioapic_ids_from_mpc(void)
1920static void __init setup_ioapic_ids_from_mpc(void) { } 1920static void __init setup_ioapic_ids_from_mpc(void) { }
1921#endif 1921#endif
1922 1922
1923static int no_timer_check __initdata; 1923int no_timer_check __initdata;
1924 1924
1925static int __init notimercheck(char *s) 1925static int __init notimercheck(char *s)
1926{ 1926{
@@ -2310,7 +2310,7 @@ static inline void __init check_timer(void)
2310 2310
2311 disable_8259A_irq(0); 2311 disable_8259A_irq(0);
2312 set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, 2312 set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq,
2313 "fasteio"); 2313 "fasteoi");
2314 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ 2314 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2315 enable_8259A_irq(0); 2315 enable_8259A_irq(0);
2316 2316
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 3201d421090a..5785d84103a6 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -19,6 +19,8 @@
19#include <linux/cpu.h> 19#include <linux/cpu.h>
20#include <linux/delay.h> 20#include <linux/delay.h>
21 21
22#include <asm/idle.h>
23
22DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; 24DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp;
23EXPORT_PER_CPU_SYMBOL(irq_stat); 25EXPORT_PER_CPU_SYMBOL(irq_stat);
24 26
@@ -61,6 +63,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs)
61 union irq_ctx *curctx, *irqctx; 63 union irq_ctx *curctx, *irqctx;
62 u32 *isp; 64 u32 *isp;
63#endif 65#endif
66 exit_idle();
64 67
65 if (unlikely((unsigned)irq >= NR_IRQS)) { 68 if (unlikely((unsigned)irq >= NR_IRQS)) {
66 printk(KERN_EMERG "%s: cannot handle IRQ %d\n", 69 printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index af1d53344993..b545bc746fce 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -363,7 +363,7 @@ no_kprobe:
363 " pushf\n" 363 " pushf\n"
364 /* skip cs, eip, orig_eax */ 364 /* skip cs, eip, orig_eax */
365 " subl $12, %esp\n" 365 " subl $12, %esp\n"
366 " pushl %gs\n" 366 " pushl %fs\n"
367 " pushl %ds\n" 367 " pushl %ds\n"
368 " pushl %es\n" 368 " pushl %es\n"
369 " pushl %eax\n" 369 " pushl %eax\n"
@@ -387,7 +387,7 @@ no_kprobe:
387 " popl %edi\n" 387 " popl %edi\n"
388 " popl %ebp\n" 388 " popl %ebp\n"
389 " popl %eax\n" 389 " popl %eax\n"
390 /* skip eip, orig_eax, es, ds, gs */ 390 /* skip eip, orig_eax, es, ds, fs */
391 " addl $20, %esp\n" 391 " addl $20, %esp\n"
392 " popf\n" 392 " popf\n"
393 " ret\n"); 393 " ret\n");
@@ -408,7 +408,7 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs)
408 spin_lock_irqsave(&kretprobe_lock, flags); 408 spin_lock_irqsave(&kretprobe_lock, flags);
409 head = kretprobe_inst_table_head(current); 409 head = kretprobe_inst_table_head(current);
410 /* fixup registers */ 410 /* fixup registers */
411 regs->xcs = __KERNEL_CS; 411 regs->xcs = __KERNEL_CS | get_kernel_rpl();
412 regs->eip = trampoline_address; 412 regs->eip = trampoline_address;
413 regs->orig_eax = 0xffffffff; 413 regs->orig_eax = 0xffffffff;
414 414
diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index 381252bae3d8..b8f16633a6ec 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -384,7 +384,7 @@ static int do_microcode_update (void)
384{ 384{
385 long cursor = 0; 385 long cursor = 0;
386 int error = 0; 386 int error = 0;
387 void *new_mc; 387 void *new_mc = NULL;
388 int cpu; 388 int cpu;
389 cpumask_t old; 389 cpumask_t old;
390 390
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index 4e14264f392a..bcaa6e9b6197 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -68,7 +68,6 @@ static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx)
68#ifdef CONFIG_SMP 68#ifdef CONFIG_SMP
69 69
70struct msr_command { 70struct msr_command {
71 int cpu;
72 int err; 71 int err;
73 u32 reg; 72 u32 reg;
74 u32 data[2]; 73 u32 data[2];
@@ -78,16 +77,14 @@ static void msr_smp_wrmsr(void *cmd_block)
78{ 77{
79 struct msr_command *cmd = (struct msr_command *)cmd_block; 78 struct msr_command *cmd = (struct msr_command *)cmd_block;
80 79
81 if (cmd->cpu == smp_processor_id()) 80 cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
82 cmd->err = wrmsr_eio(cmd->reg, cmd->data[0], cmd->data[1]);
83} 81}
84 82
85static void msr_smp_rdmsr(void *cmd_block) 83static void msr_smp_rdmsr(void *cmd_block)
86{ 84{
87 struct msr_command *cmd = (struct msr_command *)cmd_block; 85 struct msr_command *cmd = (struct msr_command *)cmd_block;
88 86
89 if (cmd->cpu == smp_processor_id()) 87 cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
90 cmd->err = rdmsr_eio(cmd->reg, &cmd->data[0], &cmd->data[1]);
91} 88}
92 89
93static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx) 90static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
@@ -99,12 +96,11 @@ static inline int do_wrmsr(int cpu, u32 reg, u32 eax, u32 edx)
99 if (cpu == smp_processor_id()) { 96 if (cpu == smp_processor_id()) {
100 ret = wrmsr_eio(reg, eax, edx); 97 ret = wrmsr_eio(reg, eax, edx);
101 } else { 98 } else {
102 cmd.cpu = cpu;
103 cmd.reg = reg; 99 cmd.reg = reg;
104 cmd.data[0] = eax; 100 cmd.data[0] = eax;
105 cmd.data[1] = edx; 101 cmd.data[1] = edx;
106 102
107 smp_call_function(msr_smp_wrmsr, &cmd, 1, 1); 103 smp_call_function_single(cpu, msr_smp_wrmsr, &cmd, 1, 1);
108 ret = cmd.err; 104 ret = cmd.err;
109 } 105 }
110 preempt_enable(); 106 preempt_enable();
@@ -120,10 +116,9 @@ static inline int do_rdmsr(int cpu, u32 reg, u32 * eax, u32 * edx)
120 if (cpu == smp_processor_id()) { 116 if (cpu == smp_processor_id()) {
121 ret = rdmsr_eio(reg, eax, edx); 117 ret = rdmsr_eio(reg, eax, edx);
122 } else { 118 } else {
123 cmd.cpu = cpu;
124 cmd.reg = reg; 119 cmd.reg = reg;
125 120
126 smp_call_function(msr_smp_rdmsr, &cmd, 1, 1); 121 smp_call_function_single(cpu, msr_smp_rdmsr, &cmd, 1, 1);
127 122
128 *eax = cmd.data[0]; 123 *eax = cmd.data[0];
129 *edx = cmd.data[1]; 124 *edx = cmd.data[1];
diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 1a6f8bb8881c..5d8a07c20281 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -185,7 +185,8 @@ static __cpuinit inline int nmi_known_cpu(void)
185{ 185{
186 switch (boot_cpu_data.x86_vendor) { 186 switch (boot_cpu_data.x86_vendor) {
187 case X86_VENDOR_AMD: 187 case X86_VENDOR_AMD:
188 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)); 188 return ((boot_cpu_data.x86 == 15) || (boot_cpu_data.x86 == 6)
189 || (boot_cpu_data.x86 == 16));
189 case X86_VENDOR_INTEL: 190 case X86_VENDOR_INTEL:
190 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 191 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
191 return 1; 192 return 1;
@@ -216,6 +217,28 @@ static __init void nmi_cpu_busy(void *data)
216} 217}
217#endif 218#endif
218 219
220static unsigned int adjust_for_32bit_ctr(unsigned int hz)
221{
222 u64 counter_val;
223 unsigned int retval = hz;
224
225 /*
226 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
227 * are writable, with higher bits sign extending from bit 31.
228 * So, we can only program the counter with 31 bit values and
229 * 32nd bit should be 1, for 33.. to be 1.
230 * Find the appropriate nmi_hz
231 */
232 counter_val = (u64)cpu_khz * 1000;
233 do_div(counter_val, retval);
234 if (counter_val > 0x7fffffffULL) {
235 u64 count = (u64)cpu_khz * 1000;
236 do_div(count, 0x7fffffffUL);
237 retval = count + 1;
238 }
239 return retval;
240}
241
219static int __init check_nmi_watchdog(void) 242static int __init check_nmi_watchdog(void)
220{ 243{
221 unsigned int *prev_nmi_count; 244 unsigned int *prev_nmi_count;
@@ -281,18 +304,10 @@ static int __init check_nmi_watchdog(void)
281 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 304 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
282 305
283 nmi_hz = 1; 306 nmi_hz = 1;
284 /* 307
285 * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter 308 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
286 * are writable, with higher bits sign extending from bit 31. 309 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
287 * So, we can only program the counter with 31 bit values and 310 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
288 * 32nd bit should be 1, for 33.. to be 1.
289 * Find the appropriate nmi_hz
290 */
291 if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0 &&
292 ((u64)cpu_khz * 1000) > 0x7fffffffULL) {
293 u64 count = (u64)cpu_khz * 1000;
294 do_div(count, 0x7fffffffUL);
295 nmi_hz = count + 1;
296 } 311 }
297 } 312 }
298 313
@@ -369,6 +384,34 @@ void enable_timer_nmi_watchdog(void)
369 } 384 }
370} 385}
371 386
387static void __acpi_nmi_disable(void *__unused)
388{
389 apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
390}
391
392/*
393 * Disable timer based NMIs on all CPUs:
394 */
395void acpi_nmi_disable(void)
396{
397 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
398 on_each_cpu(__acpi_nmi_disable, NULL, 0, 1);
399}
400
401static void __acpi_nmi_enable(void *__unused)
402{
403 apic_write_around(APIC_LVT0, APIC_DM_NMI);
404}
405
406/*
407 * Enable timer based NMIs on all CPUs:
408 */
409void acpi_nmi_enable(void)
410{
411 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
412 on_each_cpu(__acpi_nmi_enable, NULL, 0, 1);
413}
414
372#ifdef CONFIG_PM 415#ifdef CONFIG_PM
373 416
374static int nmi_pm_active; /* nmi_active before suspend */ 417static int nmi_pm_active; /* nmi_active before suspend */
@@ -442,6 +485,17 @@ static void write_watchdog_counter(unsigned int perfctr_msr, const char *descr)
442 wrmsrl(perfctr_msr, 0 - count); 485 wrmsrl(perfctr_msr, 0 - count);
443} 486}
444 487
488static void write_watchdog_counter32(unsigned int perfctr_msr,
489 const char *descr)
490{
491 u64 count = (u64)cpu_khz * 1000;
492
493 do_div(count, nmi_hz);
494 if(descr)
495 Dprintk("setting %s to -0x%08Lx\n", descr, count);
496 wrmsr(perfctr_msr, (u32)(-count), 0);
497}
498
445/* Note that these events don't tick when the CPU idles. This means 499/* Note that these events don't tick when the CPU idles. This means
446 the frequency varies with CPU load. */ 500 the frequency varies with CPU load. */
447 501
@@ -531,7 +585,8 @@ static int setup_p6_watchdog(void)
531 585
532 /* setup the timer */ 586 /* setup the timer */
533 wrmsr(evntsel_msr, evntsel, 0); 587 wrmsr(evntsel_msr, evntsel, 0);
534 write_watchdog_counter(perfctr_msr, "P6_PERFCTR0"); 588 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
589 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0");
535 apic_write(APIC_LVTPC, APIC_DM_NMI); 590 apic_write(APIC_LVTPC, APIC_DM_NMI);
536 evntsel |= P6_EVNTSEL0_ENABLE; 591 evntsel |= P6_EVNTSEL0_ENABLE;
537 wrmsr(evntsel_msr, evntsel, 0); 592 wrmsr(evntsel_msr, evntsel, 0);
@@ -704,7 +759,8 @@ static int setup_intel_arch_watchdog(void)
704 759
705 /* setup the timer */ 760 /* setup the timer */
706 wrmsr(evntsel_msr, evntsel, 0); 761 wrmsr(evntsel_msr, evntsel, 0);
707 write_watchdog_counter(perfctr_msr, "INTEL_ARCH_PERFCTR0"); 762 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
763 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0");
708 apic_write(APIC_LVTPC, APIC_DM_NMI); 764 apic_write(APIC_LVTPC, APIC_DM_NMI);
709 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; 765 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
710 wrmsr(evntsel_msr, evntsel, 0); 766 wrmsr(evntsel_msr, evntsel, 0);
@@ -762,7 +818,8 @@ void setup_apic_nmi_watchdog (void *unused)
762 if (nmi_watchdog == NMI_LOCAL_APIC) { 818 if (nmi_watchdog == NMI_LOCAL_APIC) {
763 switch (boot_cpu_data.x86_vendor) { 819 switch (boot_cpu_data.x86_vendor) {
764 case X86_VENDOR_AMD: 820 case X86_VENDOR_AMD:
765 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15) 821 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
822 boot_cpu_data.x86 != 16)
766 return; 823 return;
767 if (!setup_k7_watchdog()) 824 if (!setup_k7_watchdog())
768 return; 825 return;
@@ -956,6 +1013,8 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
956 dummy &= ~P4_CCCR_OVF; 1013 dummy &= ~P4_CCCR_OVF;
957 wrmsrl(wd->cccr_msr, dummy); 1014 wrmsrl(wd->cccr_msr, dummy);
958 apic_write(APIC_LVTPC, APIC_DM_NMI); 1015 apic_write(APIC_LVTPC, APIC_DM_NMI);
1016 /* start the cycle over again */
1017 write_watchdog_counter(wd->perfctr_msr, NULL);
959 } 1018 }
960 else if (wd->perfctr_msr == MSR_P6_PERFCTR0 || 1019 else if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
961 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { 1020 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) {
@@ -964,9 +1023,12 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
964 * other P6 variant. 1023 * other P6 variant.
965 * ArchPerfom/Core Duo also needs this */ 1024 * ArchPerfom/Core Duo also needs this */
966 apic_write(APIC_LVTPC, APIC_DM_NMI); 1025 apic_write(APIC_LVTPC, APIC_DM_NMI);
1026 /* P6/ARCH_PERFMON has 32 bit counter write */
1027 write_watchdog_counter32(wd->perfctr_msr, NULL);
1028 } else {
1029 /* start the cycle over again */
1030 write_watchdog_counter(wd->perfctr_msr, NULL);
967 } 1031 }
968 /* start the cycle over again */
969 write_watchdog_counter(wd->perfctr_msr, NULL);
970 rc = 1; 1032 rc = 1;
971 } else if (nmi_watchdog == NMI_IO_APIC) { 1033 } else if (nmi_watchdog == NMI_IO_APIC) {
972 /* don't know how to accurately check for this. 1034 /* don't know how to accurately check for this.
diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index e55fd05da0f5..c156ecfa3872 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -92,7 +92,7 @@ static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
92 return insn_len; 92 return insn_len;
93} 93}
94 94
95static fastcall unsigned long native_get_debugreg(int regno) 95static unsigned long native_get_debugreg(int regno)
96{ 96{
97 unsigned long val = 0; /* Damn you, gcc! */ 97 unsigned long val = 0; /* Damn you, gcc! */
98 98
@@ -115,7 +115,7 @@ static fastcall unsigned long native_get_debugreg(int regno)
115 return val; 115 return val;
116} 116}
117 117
118static fastcall void native_set_debugreg(int regno, unsigned long value) 118static void native_set_debugreg(int regno, unsigned long value)
119{ 119{
120 switch (regno) { 120 switch (regno) {
121 case 0: 121 case 0:
@@ -146,55 +146,55 @@ void init_IRQ(void)
146 paravirt_ops.init_IRQ(); 146 paravirt_ops.init_IRQ();
147} 147}
148 148
149static fastcall void native_clts(void) 149static void native_clts(void)
150{ 150{
151 asm volatile ("clts"); 151 asm volatile ("clts");
152} 152}
153 153
154static fastcall unsigned long native_read_cr0(void) 154static unsigned long native_read_cr0(void)
155{ 155{
156 unsigned long val; 156 unsigned long val;
157 asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); 157 asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
158 return val; 158 return val;
159} 159}
160 160
161static fastcall void native_write_cr0(unsigned long val) 161static void native_write_cr0(unsigned long val)
162{ 162{
163 asm volatile("movl %0,%%cr0": :"r" (val)); 163 asm volatile("movl %0,%%cr0": :"r" (val));
164} 164}
165 165
166static fastcall unsigned long native_read_cr2(void) 166static unsigned long native_read_cr2(void)
167{ 167{
168 unsigned long val; 168 unsigned long val;
169 asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); 169 asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
170 return val; 170 return val;
171} 171}
172 172
173static fastcall void native_write_cr2(unsigned long val) 173static void native_write_cr2(unsigned long val)
174{ 174{
175 asm volatile("movl %0,%%cr2": :"r" (val)); 175 asm volatile("movl %0,%%cr2": :"r" (val));
176} 176}
177 177
178static fastcall unsigned long native_read_cr3(void) 178static unsigned long native_read_cr3(void)
179{ 179{
180 unsigned long val; 180 unsigned long val;
181 asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); 181 asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
182 return val; 182 return val;
183} 183}
184 184
185static fastcall void native_write_cr3(unsigned long val) 185static void native_write_cr3(unsigned long val)
186{ 186{
187 asm volatile("movl %0,%%cr3": :"r" (val)); 187 asm volatile("movl %0,%%cr3": :"r" (val));
188} 188}
189 189
190static fastcall unsigned long native_read_cr4(void) 190static unsigned long native_read_cr4(void)
191{ 191{
192 unsigned long val; 192 unsigned long val;
193 asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); 193 asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
194 return val; 194 return val;
195} 195}
196 196
197static fastcall unsigned long native_read_cr4_safe(void) 197static unsigned long native_read_cr4_safe(void)
198{ 198{
199 unsigned long val; 199 unsigned long val;
200 /* This could fault if %cr4 does not exist */ 200 /* This could fault if %cr4 does not exist */
@@ -207,51 +207,51 @@ static fastcall unsigned long native_read_cr4_safe(void)
207 return val; 207 return val;
208} 208}
209 209
210static fastcall void native_write_cr4(unsigned long val) 210static void native_write_cr4(unsigned long val)
211{ 211{
212 asm volatile("movl %0,%%cr4": :"r" (val)); 212 asm volatile("movl %0,%%cr4": :"r" (val));
213} 213}
214 214
215static fastcall unsigned long native_save_fl(void) 215static unsigned long native_save_fl(void)
216{ 216{
217 unsigned long f; 217 unsigned long f;
218 asm volatile("pushfl ; popl %0":"=g" (f): /* no input */); 218 asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
219 return f; 219 return f;
220} 220}
221 221
222static fastcall void native_restore_fl(unsigned long f) 222static void native_restore_fl(unsigned long f)
223{ 223{
224 asm volatile("pushl %0 ; popfl": /* no output */ 224 asm volatile("pushl %0 ; popfl": /* no output */
225 :"g" (f) 225 :"g" (f)
226 :"memory", "cc"); 226 :"memory", "cc");
227} 227}
228 228
229static fastcall void native_irq_disable(void) 229static void native_irq_disable(void)
230{ 230{
231 asm volatile("cli": : :"memory"); 231 asm volatile("cli": : :"memory");
232} 232}
233 233
234static fastcall void native_irq_enable(void) 234static void native_irq_enable(void)
235{ 235{
236 asm volatile("sti": : :"memory"); 236 asm volatile("sti": : :"memory");
237} 237}
238 238
239static fastcall void native_safe_halt(void) 239static void native_safe_halt(void)
240{ 240{
241 asm volatile("sti; hlt": : :"memory"); 241 asm volatile("sti; hlt": : :"memory");
242} 242}
243 243
244static fastcall void native_halt(void) 244static void native_halt(void)
245{ 245{
246 asm volatile("hlt": : :"memory"); 246 asm volatile("hlt": : :"memory");
247} 247}
248 248
249static fastcall void native_wbinvd(void) 249static void native_wbinvd(void)
250{ 250{
251 asm volatile("wbinvd": : :"memory"); 251 asm volatile("wbinvd": : :"memory");
252} 252}
253 253
254static fastcall unsigned long long native_read_msr(unsigned int msr, int *err) 254static unsigned long long native_read_msr(unsigned int msr, int *err)
255{ 255{
256 unsigned long long val; 256 unsigned long long val;
257 257
@@ -270,7 +270,7 @@ static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
270 return val; 270 return val;
271} 271}
272 272
273static fastcall int native_write_msr(unsigned int msr, unsigned long long val) 273static int native_write_msr(unsigned int msr, unsigned long long val)
274{ 274{
275 int err; 275 int err;
276 asm volatile("2: wrmsr ; xorl %0,%0\n" 276 asm volatile("2: wrmsr ; xorl %0,%0\n"
@@ -288,53 +288,53 @@ static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
288 return err; 288 return err;
289} 289}
290 290
291static fastcall unsigned long long native_read_tsc(void) 291static unsigned long long native_read_tsc(void)
292{ 292{
293 unsigned long long val; 293 unsigned long long val;
294 asm volatile("rdtsc" : "=A" (val)); 294 asm volatile("rdtsc" : "=A" (val));
295 return val; 295 return val;
296} 296}
297 297
298static fastcall unsigned long long native_read_pmc(void) 298static unsigned long long native_read_pmc(void)
299{ 299{
300 unsigned long long val; 300 unsigned long long val;
301 asm volatile("rdpmc" : "=A" (val)); 301 asm volatile("rdpmc" : "=A" (val));
302 return val; 302 return val;
303} 303}
304 304
305static fastcall void native_load_tr_desc(void) 305static void native_load_tr_desc(void)
306{ 306{
307 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); 307 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
308} 308}
309 309
310static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr) 310static void native_load_gdt(const struct Xgt_desc_struct *dtr)
311{ 311{
312 asm volatile("lgdt %0"::"m" (*dtr)); 312 asm volatile("lgdt %0"::"m" (*dtr));
313} 313}
314 314
315static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr) 315static void native_load_idt(const struct Xgt_desc_struct *dtr)
316{ 316{
317 asm volatile("lidt %0"::"m" (*dtr)); 317 asm volatile("lidt %0"::"m" (*dtr));
318} 318}
319 319
320static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr) 320static void native_store_gdt(struct Xgt_desc_struct *dtr)
321{ 321{
322 asm ("sgdt %0":"=m" (*dtr)); 322 asm ("sgdt %0":"=m" (*dtr));
323} 323}
324 324
325static fastcall void native_store_idt(struct Xgt_desc_struct *dtr) 325static void native_store_idt(struct Xgt_desc_struct *dtr)
326{ 326{
327 asm ("sidt %0":"=m" (*dtr)); 327 asm ("sidt %0":"=m" (*dtr));
328} 328}
329 329
330static fastcall unsigned long native_store_tr(void) 330static unsigned long native_store_tr(void)
331{ 331{
332 unsigned long tr; 332 unsigned long tr;
333 asm ("str %0":"=r" (tr)); 333 asm ("str %0":"=r" (tr));
334 return tr; 334 return tr;
335} 335}
336 336
337static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu) 337static void native_load_tls(struct thread_struct *t, unsigned int cpu)
338{ 338{
339#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] 339#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
340 C(0); C(1); C(2); 340 C(0); C(1); C(2);
@@ -348,22 +348,22 @@ static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32
348 lp[1] = entry_high; 348 lp[1] = entry_high;
349} 349}
350 350
351static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) 351static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
352{ 352{
353 native_write_dt_entry(dt, entrynum, low, high); 353 native_write_dt_entry(dt, entrynum, low, high);
354} 354}
355 355
356static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) 356static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
357{ 357{
358 native_write_dt_entry(dt, entrynum, low, high); 358 native_write_dt_entry(dt, entrynum, low, high);
359} 359}
360 360
361static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) 361static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
362{ 362{
363 native_write_dt_entry(dt, entrynum, low, high); 363 native_write_dt_entry(dt, entrynum, low, high);
364} 364}
365 365
366static fastcall void native_load_esp0(struct tss_struct *tss, 366static void native_load_esp0(struct tss_struct *tss,
367 struct thread_struct *thread) 367 struct thread_struct *thread)
368{ 368{
369 tss->esp0 = thread->esp0; 369 tss->esp0 = thread->esp0;
@@ -375,12 +375,12 @@ static fastcall void native_load_esp0(struct tss_struct *tss,
375 } 375 }
376} 376}
377 377
378static fastcall void native_io_delay(void) 378static void native_io_delay(void)
379{ 379{
380 asm volatile("outb %al,$0x80"); 380 asm volatile("outb %al,$0x80");
381} 381}
382 382
383static fastcall void native_flush_tlb(void) 383static void native_flush_tlb(void)
384{ 384{
385 __native_flush_tlb(); 385 __native_flush_tlb();
386} 386}
@@ -389,49 +389,49 @@ static fastcall void native_flush_tlb(void)
389 * Global pages have to be flushed a bit differently. Not a real 389 * Global pages have to be flushed a bit differently. Not a real
390 * performance problem because this does not happen often. 390 * performance problem because this does not happen often.
391 */ 391 */
392static fastcall void native_flush_tlb_global(void) 392static void native_flush_tlb_global(void)
393{ 393{
394 __native_flush_tlb_global(); 394 __native_flush_tlb_global();
395} 395}
396 396
397static fastcall void native_flush_tlb_single(u32 addr) 397static void native_flush_tlb_single(u32 addr)
398{ 398{
399 __native_flush_tlb_single(addr); 399 __native_flush_tlb_single(addr);
400} 400}
401 401
402#ifndef CONFIG_X86_PAE 402#ifndef CONFIG_X86_PAE
403static fastcall void native_set_pte(pte_t *ptep, pte_t pteval) 403static void native_set_pte(pte_t *ptep, pte_t pteval)
404{ 404{
405 *ptep = pteval; 405 *ptep = pteval;
406} 406}
407 407
408static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) 408static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
409{ 409{
410 *ptep = pteval; 410 *ptep = pteval;
411} 411}
412 412
413static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) 413static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
414{ 414{
415 *pmdp = pmdval; 415 *pmdp = pmdval;
416} 416}
417 417
418#else /* CONFIG_X86_PAE */ 418#else /* CONFIG_X86_PAE */
419 419
420static fastcall void native_set_pte(pte_t *ptep, pte_t pte) 420static void native_set_pte(pte_t *ptep, pte_t pte)
421{ 421{
422 ptep->pte_high = pte.pte_high; 422 ptep->pte_high = pte.pte_high;
423 smp_wmb(); 423 smp_wmb();
424 ptep->pte_low = pte.pte_low; 424 ptep->pte_low = pte.pte_low;
425} 425}
426 426
427static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) 427static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
428{ 428{
429 ptep->pte_high = pte.pte_high; 429 ptep->pte_high = pte.pte_high;
430 smp_wmb(); 430 smp_wmb();
431 ptep->pte_low = pte.pte_low; 431 ptep->pte_low = pte.pte_low;
432} 432}
433 433
434static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) 434static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
435{ 435{
436 ptep->pte_low = 0; 436 ptep->pte_low = 0;
437 smp_wmb(); 437 smp_wmb();
@@ -440,29 +440,29 @@ static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long
440 ptep->pte_low = pte.pte_low; 440 ptep->pte_low = pte.pte_low;
441} 441}
442 442
443static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval) 443static void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
444{ 444{
445 set_64bit((unsigned long long *)ptep,pte_val(pteval)); 445 set_64bit((unsigned long long *)ptep,pte_val(pteval));
446} 446}
447 447
448static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) 448static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
449{ 449{
450 set_64bit((unsigned long long *)pmdp,pmd_val(pmdval)); 450 set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
451} 451}
452 452
453static fastcall void native_set_pud(pud_t *pudp, pud_t pudval) 453static void native_set_pud(pud_t *pudp, pud_t pudval)
454{ 454{
455 *pudp = pudval; 455 *pudp = pudval;
456} 456}
457 457
458static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 458static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
459{ 459{
460 ptep->pte_low = 0; 460 ptep->pte_low = 0;
461 smp_wmb(); 461 smp_wmb();
462 ptep->pte_high = 0; 462 ptep->pte_high = 0;
463} 463}
464 464
465static fastcall void native_pmd_clear(pmd_t *pmd) 465static void native_pmd_clear(pmd_t *pmd)
466{ 466{
467 u32 *tmp = (u32 *)pmd; 467 u32 *tmp = (u32 *)pmd;
468 *tmp = 0; 468 *tmp = 0;
@@ -472,8 +472,8 @@ static fastcall void native_pmd_clear(pmd_t *pmd)
472#endif /* CONFIG_X86_PAE */ 472#endif /* CONFIG_X86_PAE */
473 473
474/* These are in entry.S */ 474/* These are in entry.S */
475extern fastcall void native_iret(void); 475extern void native_iret(void);
476extern fastcall void native_irq_enable_sysexit(void); 476extern void native_irq_enable_sysexit(void);
477 477
478static int __init print_banner(void) 478static int __init print_banner(void)
479{ 479{
@@ -482,9 +482,6 @@ static int __init print_banner(void)
482} 482}
483core_initcall(print_banner); 483core_initcall(print_banner);
484 484
485/* We simply declare start_kernel to be the paravirt probe of last resort. */
486paravirt_probe(start_kernel);
487
488struct paravirt_ops paravirt_ops = { 485struct paravirt_ops paravirt_ops = {
489 .name = "bare hardware", 486 .name = "bare hardware",
490 .paravirt_enabled = 0, 487 .paravirt_enabled = 0,
@@ -544,12 +541,21 @@ struct paravirt_ops paravirt_ops = {
544 .apic_write = native_apic_write, 541 .apic_write = native_apic_write,
545 .apic_write_atomic = native_apic_write_atomic, 542 .apic_write_atomic = native_apic_write_atomic,
546 .apic_read = native_apic_read, 543 .apic_read = native_apic_read,
544 .setup_boot_clock = setup_boot_APIC_clock,
545 .setup_secondary_clock = setup_secondary_APIC_clock,
547#endif 546#endif
547 .set_lazy_mode = (void *)native_nop,
548 548
549 .flush_tlb_user = native_flush_tlb, 549 .flush_tlb_user = native_flush_tlb,
550 .flush_tlb_kernel = native_flush_tlb_global, 550 .flush_tlb_kernel = native_flush_tlb_global,
551 .flush_tlb_single = native_flush_tlb_single, 551 .flush_tlb_single = native_flush_tlb_single,
552 552
553 .alloc_pt = (void *)native_nop,
554 .alloc_pd = (void *)native_nop,
555 .alloc_pd_clone = (void *)native_nop,
556 .release_pt = (void *)native_nop,
557 .release_pd = (void *)native_nop,
558
553 .set_pte = native_set_pte, 559 .set_pte = native_set_pte,
554 .set_pte_at = native_set_pte_at, 560 .set_pte_at = native_set_pte_at,
555 .set_pmd = native_set_pmd, 561 .set_pmd = native_set_pmd,
@@ -565,6 +571,8 @@ struct paravirt_ops paravirt_ops = {
565 571
566 .irq_enable_sysexit = native_irq_enable_sysexit, 572 .irq_enable_sysexit = native_irq_enable_sysexit,
567 .iret = native_iret, 573 .iret = native_iret,
574
575 .startup_ipi_hook = (void *)native_nop,
568}; 576};
569 577
570/* 578/*
diff --git a/arch/i386/kernel/pcspeaker.c b/arch/i386/kernel/pcspeaker.c
new file mode 100644
index 000000000000..bc1f2d3ea277
--- /dev/null
+++ b/arch/i386/kernel/pcspeaker.c
@@ -0,0 +1,20 @@
1#include <linux/platform_device.h>
2#include <linux/errno.h>
3#include <linux/init.h>
4
5static __init int add_pcspkr(void)
6{
7 struct platform_device *pd;
8 int ret;
9
10 pd = platform_device_alloc("pcspkr", -1);
11 if (!pd)
12 return -ENOMEM;
13
14 ret = platform_device_add(pd);
15 if (ret)
16 platform_device_put(pd);
17
18 return ret;
19}
20device_initcall(add_pcspkr);
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index c641056233a6..7845d480c293 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -48,6 +48,7 @@
48#include <asm/i387.h> 48#include <asm/i387.h>
49#include <asm/desc.h> 49#include <asm/desc.h>
50#include <asm/vm86.h> 50#include <asm/vm86.h>
51#include <asm/idle.h>
51#ifdef CONFIG_MATH_EMULATION 52#ifdef CONFIG_MATH_EMULATION
52#include <asm/math_emu.h> 53#include <asm/math_emu.h>
53#endif 54#endif
@@ -80,6 +81,42 @@ void (*pm_idle)(void);
80EXPORT_SYMBOL(pm_idle); 81EXPORT_SYMBOL(pm_idle);
81static DEFINE_PER_CPU(unsigned int, cpu_idle_state); 82static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
82 83
84static ATOMIC_NOTIFIER_HEAD(idle_notifier);
85
86void idle_notifier_register(struct notifier_block *n)
87{
88 atomic_notifier_chain_register(&idle_notifier, n);
89}
90
91void idle_notifier_unregister(struct notifier_block *n)
92{
93 atomic_notifier_chain_unregister(&idle_notifier, n);
94}
95
96static DEFINE_PER_CPU(volatile unsigned long, idle_state);
97
98void enter_idle(void)
99{
100 /* needs to be atomic w.r.t. interrupts, not against other CPUs */
101 __set_bit(0, &__get_cpu_var(idle_state));
102 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
103}
104
105static void __exit_idle(void)
106{
107 /* needs to be atomic w.r.t. interrupts, not against other CPUs */
108 if (__test_and_clear_bit(0, &__get_cpu_var(idle_state)) == 0)
109 return;
110 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
111}
112
113void exit_idle(void)
114{
115 if (current->pid)
116 return;
117 __exit_idle();
118}
119
83void disable_hlt(void) 120void disable_hlt(void)
84{ 121{
85 hlt_counter++; 122 hlt_counter++;
@@ -130,6 +167,7 @@ EXPORT_SYMBOL(default_idle);
130 */ 167 */
131static void poll_idle (void) 168static void poll_idle (void)
132{ 169{
170 local_irq_enable();
133 cpu_relax(); 171 cpu_relax();
134} 172}
135 173
@@ -189,7 +227,16 @@ void cpu_idle(void)
189 play_dead(); 227 play_dead();
190 228
191 __get_cpu_var(irq_stat).idle_timestamp = jiffies; 229 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
230
231 /*
232 * Idle routines should keep interrupts disabled
233 * from here on, until they go to idle.
234 * Otherwise, idle callbacks can misfire.
235 */
236 local_irq_disable();
237 enter_idle();
192 idle(); 238 idle();
239 __exit_idle();
193 } 240 }
194 preempt_enable_no_resched(); 241 preempt_enable_no_resched();
195 schedule(); 242 schedule();
@@ -243,7 +290,11 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
243 __monitor((void *)&current_thread_info()->flags, 0, 0); 290 __monitor((void *)&current_thread_info()->flags, 0, 0);
244 smp_mb(); 291 smp_mb();
245 if (!need_resched()) 292 if (!need_resched())
246 __mwait(eax, ecx); 293 __sti_mwait(eax, ecx);
294 else
295 local_irq_enable();
296 } else {
297 local_irq_enable();
247 } 298 }
248} 299}
249 300
@@ -308,8 +359,8 @@ void show_regs(struct pt_regs * regs)
308 regs->eax,regs->ebx,regs->ecx,regs->edx); 359 regs->eax,regs->ebx,regs->ecx,regs->edx);
309 printk("ESI: %08lx EDI: %08lx EBP: %08lx", 360 printk("ESI: %08lx EDI: %08lx EBP: %08lx",
310 regs->esi, regs->edi, regs->ebp); 361 regs->esi, regs->edi, regs->ebp);
311 printk(" DS: %04x ES: %04x GS: %04x\n", 362 printk(" DS: %04x ES: %04x FS: %04x\n",
312 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xgs); 363 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs);
313 364
314 cr0 = read_cr0(); 365 cr0 = read_cr0();
315 cr2 = read_cr2(); 366 cr2 = read_cr2();
@@ -340,7 +391,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
340 391
341 regs.xds = __USER_DS; 392 regs.xds = __USER_DS;
342 regs.xes = __USER_DS; 393 regs.xes = __USER_DS;
343 regs.xgs = __KERNEL_PDA; 394 regs.xfs = __KERNEL_PDA;
344 regs.orig_eax = -1; 395 regs.orig_eax = -1;
345 regs.eip = (unsigned long) kernel_thread_helper; 396 regs.eip = (unsigned long) kernel_thread_helper;
346 regs.xcs = __KERNEL_CS | get_kernel_rpl(); 397 regs.xcs = __KERNEL_CS | get_kernel_rpl();
@@ -425,7 +476,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
425 476
426 p->thread.eip = (unsigned long) ret_from_fork; 477 p->thread.eip = (unsigned long) ret_from_fork;
427 478
428 savesegment(fs,p->thread.fs); 479 savesegment(gs,p->thread.gs);
429 480
430 tsk = current; 481 tsk = current;
431 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 482 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
@@ -501,8 +552,8 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
501 dump->regs.eax = regs->eax; 552 dump->regs.eax = regs->eax;
502 dump->regs.ds = regs->xds; 553 dump->regs.ds = regs->xds;
503 dump->regs.es = regs->xes; 554 dump->regs.es = regs->xes;
504 savesegment(fs,dump->regs.fs); 555 dump->regs.fs = regs->xfs;
505 dump->regs.gs = regs->xgs; 556 savesegment(gs,dump->regs.gs);
506 dump->regs.orig_eax = regs->orig_eax; 557 dump->regs.orig_eax = regs->orig_eax;
507 dump->regs.eip = regs->eip; 558 dump->regs.eip = regs->eip;
508 dump->regs.cs = regs->xcs; 559 dump->regs.cs = regs->xcs;
@@ -653,7 +704,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
653 load_esp0(tss, next); 704 load_esp0(tss, next);
654 705
655 /* 706 /*
656 * Save away %fs. No need to save %gs, as it was saved on the 707 * Save away %gs. No need to save %fs, as it was saved on the
657 * stack on entry. No need to save %es and %ds, as those are 708 * stack on entry. No need to save %es and %ds, as those are
658 * always kernel segments while inside the kernel. Doing this 709 * always kernel segments while inside the kernel. Doing this
659 * before setting the new TLS descriptors avoids the situation 710 * before setting the new TLS descriptors avoids the situation
@@ -662,7 +713,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
662 * used %fs or %gs (it does not today), or if the kernel is 713 * used %fs or %gs (it does not today), or if the kernel is
663 * running inside of a hypervisor layer. 714 * running inside of a hypervisor layer.
664 */ 715 */
665 savesegment(fs, prev->fs); 716 savesegment(gs, prev->gs);
666 717
667 /* 718 /*
668 * Load the per-thread Thread-Local Storage descriptor. 719 * Load the per-thread Thread-Local Storage descriptor.
@@ -670,14 +721,13 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
670 load_TLS(next, cpu); 721 load_TLS(next, cpu);
671 722
672 /* 723 /*
673 * Restore %fs if needed. 724 * Restore IOPL if needed. In normal use, the flags restore
674 * 725 * in the switch assembly will handle this. But if the kernel
675 * Glibc normally makes %fs be zero. 726 * is running virtualized at a non-zero CPL, the popf will
727 * not restore flags, so it must be done in a separate step.
676 */ 728 */
677 if (unlikely(prev->fs | next->fs)) 729 if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
678 loadsegment(fs, next->fs); 730 set_iopl_mask(next->iopl);
679
680 write_pda(pcurrent, next_p);
681 731
682 /* 732 /*
683 * Now maybe handle debug registers and/or IO bitmaps 733 * Now maybe handle debug registers and/or IO bitmaps
@@ -688,6 +738,15 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
688 738
689 disable_tsc(prev_p, next_p); 739 disable_tsc(prev_p, next_p);
690 740
741 /*
742 * Leave lazy mode, flushing any hypercalls made here.
743 * This must be done before restoring TLS segments so
744 * the GDT and LDT are properly updated, and must be
745 * done before math_state_restore, so the TS bit is up
746 * to date.
747 */
748 arch_leave_lazy_cpu_mode();
749
691 /* If the task has used fpu the last 5 timeslices, just do a full 750 /* If the task has used fpu the last 5 timeslices, just do a full
692 * restore of the math state immediately to avoid the trap; the 751 * restore of the math state immediately to avoid the trap; the
693 * chances of needing FPU soon are obviously high now 752 * chances of needing FPU soon are obviously high now
@@ -695,6 +754,14 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
695 if (next_p->fpu_counter > 5) 754 if (next_p->fpu_counter > 5)
696 math_state_restore(); 755 math_state_restore();
697 756
757 /*
758 * Restore %gs if needed (which is common)
759 */
760 if (prev->gs | next->gs)
761 loadsegment(gs, next->gs);
762
763 write_pda(pcurrent, next_p);
764
698 return prev_p; 765 return prev_p;
699} 766}
700 767
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index af8aabe85800..4a8f8a259723 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -89,14 +89,14 @@ static int putreg(struct task_struct *child,
89 unsigned long regno, unsigned long value) 89 unsigned long regno, unsigned long value)
90{ 90{
91 switch (regno >> 2) { 91 switch (regno >> 2) {
92 case FS: 92 case GS:
93 if (value && (value & 3) != 3) 93 if (value && (value & 3) != 3)
94 return -EIO; 94 return -EIO;
95 child->thread.fs = value; 95 child->thread.gs = value;
96 return 0; 96 return 0;
97 case DS: 97 case DS:
98 case ES: 98 case ES:
99 case GS: 99 case FS:
100 if (value && (value & 3) != 3) 100 if (value && (value & 3) != 3)
101 return -EIO; 101 return -EIO;
102 value &= 0xffff; 102 value &= 0xffff;
@@ -112,7 +112,7 @@ static int putreg(struct task_struct *child,
112 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; 112 value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK;
113 break; 113 break;
114 } 114 }
115 if (regno > ES*4) 115 if (regno > FS*4)
116 regno -= 1*4; 116 regno -= 1*4;
117 put_stack_long(child, regno, value); 117 put_stack_long(child, regno, value);
118 return 0; 118 return 0;
@@ -124,18 +124,18 @@ static unsigned long getreg(struct task_struct *child,
124 unsigned long retval = ~0UL; 124 unsigned long retval = ~0UL;
125 125
126 switch (regno >> 2) { 126 switch (regno >> 2) {
127 case FS: 127 case GS:
128 retval = child->thread.fs; 128 retval = child->thread.gs;
129 break; 129 break;
130 case DS: 130 case DS:
131 case ES: 131 case ES:
132 case GS: 132 case FS:
133 case SS: 133 case SS:
134 case CS: 134 case CS:
135 retval = 0xffff; 135 retval = 0xffff;
136 /* fall through */ 136 /* fall through */
137 default: 137 default:
138 if (regno > ES*4) 138 if (regno > FS*4)
139 regno -= 1*4; 139 regno -= 1*4;
140 retval &= get_stack_long(child, regno); 140 retval &= get_stack_long(child, regno);
141 } 141 }
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 4694ac980cd2..122623dcc6e1 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -33,7 +33,6 @@
33#include <linux/initrd.h> 33#include <linux/initrd.h>
34#include <linux/bootmem.h> 34#include <linux/bootmem.h>
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/platform_device.h>
37#include <linux/console.h> 36#include <linux/console.h>
38#include <linux/mca.h> 37#include <linux/mca.h>
39#include <linux/root_dev.h> 38#include <linux/root_dev.h>
@@ -60,6 +59,7 @@
60#include <asm/io_apic.h> 59#include <asm/io_apic.h>
61#include <asm/ist.h> 60#include <asm/ist.h>
62#include <asm/io.h> 61#include <asm/io.h>
62#include <asm/vmi.h>
63#include <setup_arch.h> 63#include <setup_arch.h>
64#include <bios_ebda.h> 64#include <bios_ebda.h>
65 65
@@ -581,6 +581,14 @@ void __init setup_arch(char **cmdline_p)
581 581
582 max_low_pfn = setup_memory(); 582 max_low_pfn = setup_memory();
583 583
584#ifdef CONFIG_VMI
585 /*
586 * Must be after max_low_pfn is determined, and before kernel
587 * pagetables are setup.
588 */
589 vmi_init();
590#endif
591
584 /* 592 /*
585 * NOTE: before this point _nobody_ is allowed to allocate 593 * NOTE: before this point _nobody_ is allowed to allocate
586 * any memory using the bootmem allocator. Although the 594 * any memory using the bootmem allocator. Although the
@@ -651,28 +659,3 @@ void __init setup_arch(char **cmdline_p)
651#endif 659#endif
652 tsc_init(); 660 tsc_init();
653} 661}
654
655static __init int add_pcspkr(void)
656{
657 struct platform_device *pd;
658 int ret;
659
660 pd = platform_device_alloc("pcspkr", -1);
661 if (!pd)
662 return -ENOMEM;
663
664 ret = platform_device_add(pd);
665 if (ret)
666 platform_device_put(pd);
667
668 return ret;
669}
670device_initcall(add_pcspkr);
671
672/*
673 * Local Variables:
674 * mode:c
675 * c-file-style:"k&r"
676 * c-basic-offset:8
677 * End:
678 */
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 65d7620eaa09..4f99e870c986 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -21,6 +21,7 @@
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <linux/ptrace.h> 22#include <linux/ptrace.h>
23#include <linux/elf.h> 23#include <linux/elf.h>
24#include <linux/binfmts.h>
24#include <asm/processor.h> 25#include <asm/processor.h>
25#include <asm/ucontext.h> 26#include <asm/ucontext.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
@@ -128,8 +129,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
128 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ 129 X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
129 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) 130 X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
130 131
131 COPY_SEG(gs); 132 GET_SEG(gs);
132 GET_SEG(fs); 133 COPY_SEG(fs);
133 COPY_SEG(es); 134 COPY_SEG(es);
134 COPY_SEG(ds); 135 COPY_SEG(ds);
135 COPY(edi); 136 COPY(edi);
@@ -244,9 +245,9 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
244{ 245{
245 int tmp, err = 0; 246 int tmp, err = 0;
246 247
247 err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs); 248 err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs);
248 savesegment(fs, tmp); 249 savesegment(gs, tmp);
249 err |= __put_user(tmp, (unsigned int __user *)&sc->fs); 250 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
250 251
251 err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); 252 err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
252 err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); 253 err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
@@ -349,7 +350,10 @@ static int setup_frame(int sig, struct k_sigaction *ka,
349 goto give_sigsegv; 350 goto give_sigsegv;
350 } 351 }
351 352
352 restorer = (void *)VDSO_SYM(&__kernel_sigreturn); 353 if (current->binfmt->hasvdso)
354 restorer = (void *)VDSO_SYM(&__kernel_sigreturn);
355 else
356 restorer = (void *)&frame->retcode;
353 if (ka->sa.sa_flags & SA_RESTORER) 357 if (ka->sa.sa_flags & SA_RESTORER)
354 restorer = ka->sa.sa_restorer; 358 restorer = ka->sa.sa_restorer;
355 359
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index 5285aff8367f..9bd9637ae692 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -23,6 +23,7 @@
23 23
24#include <asm/mtrr.h> 24#include <asm/mtrr.h>
25#include <asm/tlbflush.h> 25#include <asm/tlbflush.h>
26#include <asm/idle.h>
26#include <mach_apic.h> 27#include <mach_apic.h>
27 28
28/* 29/*
@@ -374,8 +375,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
374 /* 375 /*
375 * i'm not happy about this global shared spinlock in the 376 * i'm not happy about this global shared spinlock in the
376 * MM hot path, but we'll see how contended it is. 377 * MM hot path, but we'll see how contended it is.
377 * Temporarily this turns IRQs off, so that lockups are 378 * AK: x86-64 has a faster method that could be ported.
378 * detected by the NMI watchdog.
379 */ 379 */
380 spin_lock(&tlbstate_lock); 380 spin_lock(&tlbstate_lock);
381 381
@@ -400,7 +400,7 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
400 400
401 while (!cpus_empty(flush_cpumask)) 401 while (!cpus_empty(flush_cpumask))
402 /* nothing. lockup detection does not belong here */ 402 /* nothing. lockup detection does not belong here */
403 mb(); 403 cpu_relax();
404 404
405 flush_mm = NULL; 405 flush_mm = NULL;
406 flush_va = 0; 406 flush_va = 0;
@@ -624,6 +624,7 @@ fastcall void smp_call_function_interrupt(struct pt_regs *regs)
624 /* 624 /*
625 * At this point the info structure may be out of scope unless wait==1 625 * At this point the info structure may be out of scope unless wait==1
626 */ 626 */
627 exit_idle();
627 irq_enter(); 628 irq_enter();
628 (*func)(info); 629 (*func)(info);
629 irq_exit(); 630 irq_exit();
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 8c6c8c52b95c..f46a4d095e6c 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -63,6 +63,7 @@
63#include <mach_apic.h> 63#include <mach_apic.h>
64#include <mach_wakecpu.h> 64#include <mach_wakecpu.h>
65#include <smpboot_hooks.h> 65#include <smpboot_hooks.h>
66#include <asm/vmi.h>
66 67
67/* Set if we find a B stepping CPU */ 68/* Set if we find a B stepping CPU */
68static int __devinitdata smp_b_stepping; 69static int __devinitdata smp_b_stepping;
@@ -545,12 +546,15 @@ static void __cpuinit start_secondary(void *unused)
545 * booting is too fragile that we want to limit the 546 * booting is too fragile that we want to limit the
546 * things done here to the most necessary things. 547 * things done here to the most necessary things.
547 */ 548 */
549#ifdef CONFIG_VMI
550 vmi_bringup();
551#endif
548 secondary_cpu_init(); 552 secondary_cpu_init();
549 preempt_disable(); 553 preempt_disable();
550 smp_callin(); 554 smp_callin();
551 while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) 555 while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
552 rep_nop(); 556 rep_nop();
553 setup_secondary_APIC_clock(); 557 setup_secondary_clock();
554 if (nmi_watchdog == NMI_IO_APIC) { 558 if (nmi_watchdog == NMI_IO_APIC) {
555 disable_8259A_irq(0); 559 disable_8259A_irq(0);
556 enable_NMI_through_LVT0(NULL); 560 enable_NMI_through_LVT0(NULL);
@@ -619,7 +623,6 @@ extern struct {
619 unsigned short ss; 623 unsigned short ss;
620} stack_start; 624} stack_start;
621extern struct i386_pda *start_pda; 625extern struct i386_pda *start_pda;
622extern struct Xgt_desc_struct cpu_gdt_descr;
623 626
624#ifdef CONFIG_NUMA 627#ifdef CONFIG_NUMA
625 628
@@ -835,6 +838,13 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
835 num_starts = 0; 838 num_starts = 0;
836 839
837 /* 840 /*
841 * Paravirt / VMI wants a startup IPI hook here to set up the
842 * target processor state.
843 */
844 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
845 (unsigned long) stack_start.esp);
846
847 /*
838 * Run STARTUP IPI loop. 848 * Run STARTUP IPI loop.
839 */ 849 */
840 Dprintk("#startup loops: %d.\n", num_starts); 850 Dprintk("#startup loops: %d.\n", num_starts);
@@ -1320,7 +1330,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
1320 1330
1321 smpboot_setup_io_apic(); 1331 smpboot_setup_io_apic();
1322 1332
1323 setup_boot_APIC_clock(); 1333 setup_boot_clock();
1324 1334
1325 /* 1335 /*
1326 * Synchronize the TSC with the AP 1336 * Synchronize the TSC with the AP
diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c
index bc882a2b1db6..13ca54a85a1c 100644
--- a/arch/i386/kernel/sysenter.c
+++ b/arch/i386/kernel/sysenter.c
@@ -78,7 +78,7 @@ int __init sysenter_setup(void)
78 syscall_pages[0] = virt_to_page(syscall_page); 78 syscall_pages[0] = virt_to_page(syscall_page);
79 79
80#ifdef CONFIG_COMPAT_VDSO 80#ifdef CONFIG_COMPAT_VDSO
81 __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY); 81 __set_fixmap(FIX_VDSO, __pa(syscall_page), PAGE_READONLY_EXEC);
82 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); 82 printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO));
83#endif 83#endif
84 84
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index c505b16c0990..a4f67a6e6821 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -131,15 +131,13 @@ unsigned long profile_pc(struct pt_regs *regs)
131 unsigned long pc = instruction_pointer(regs); 131 unsigned long pc = instruction_pointer(regs);
132 132
133#ifdef CONFIG_SMP 133#ifdef CONFIG_SMP
134 if (!user_mode_vm(regs) && in_lock_functions(pc)) { 134 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) &&
135 in_lock_functions(pc)) {
135#ifdef CONFIG_FRAME_POINTER 136#ifdef CONFIG_FRAME_POINTER
136 return *(unsigned long *)(regs->ebp + 4); 137 return *(unsigned long *)(regs->ebp + 4);
137#else 138#else
138 unsigned long *sp; 139 unsigned long *sp = (unsigned long *)&regs->esp;
139 if ((regs->xcs & 3) == 0) 140
140 sp = (unsigned long *)&regs->esp;
141 else
142 sp = (unsigned long *)regs->esp;
143 /* Return address is either directly at stack pointer 141 /* Return address is either directly at stack pointer
144 or above a saved eflags. Eflags has bits 22-31 zero, 142 or above a saved eflags. Eflags has bits 22-31 zero,
145 kernel addresses don't. */ 143 kernel addresses don't. */
@@ -232,6 +230,7 @@ EXPORT_SYMBOL(get_cmos_time);
232static void sync_cmos_clock(unsigned long dummy); 230static void sync_cmos_clock(unsigned long dummy);
233 231
234static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); 232static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
233int no_sync_cmos_clock;
235 234
236static void sync_cmos_clock(unsigned long dummy) 235static void sync_cmos_clock(unsigned long dummy)
237{ 236{
@@ -275,7 +274,8 @@ static void sync_cmos_clock(unsigned long dummy)
275 274
276void notify_arch_cmos_timer(void) 275void notify_arch_cmos_timer(void)
277{ 276{
278 mod_timer(&sync_cmos_timer, jiffies + 1); 277 if (!no_sync_cmos_clock)
278 mod_timer(&sync_cmos_timer, jiffies + 1);
279} 279}
280 280
281static long clock_cmos_diff; 281static long clock_cmos_diff;
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 0efad8aeb41a..af0d3f70a817 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -94,6 +94,7 @@ asmlinkage void spurious_interrupt_bug(void);
94asmlinkage void machine_check(void); 94asmlinkage void machine_check(void);
95 95
96int kstack_depth_to_print = 24; 96int kstack_depth_to_print = 24;
97static unsigned int code_bytes = 64;
97ATOMIC_NOTIFIER_HEAD(i386die_chain); 98ATOMIC_NOTIFIER_HEAD(i386die_chain);
98 99
99int register_die_notifier(struct notifier_block *nb) 100int register_die_notifier(struct notifier_block *nb)
@@ -291,10 +292,11 @@ void show_registers(struct pt_regs *regs)
291 int i; 292 int i;
292 int in_kernel = 1; 293 int in_kernel = 1;
293 unsigned long esp; 294 unsigned long esp;
294 unsigned short ss; 295 unsigned short ss, gs;
295 296
296 esp = (unsigned long) (&regs->esp); 297 esp = (unsigned long) (&regs->esp);
297 savesegment(ss, ss); 298 savesegment(ss, ss);
299 savesegment(gs, gs);
298 if (user_mode_vm(regs)) { 300 if (user_mode_vm(regs)) {
299 in_kernel = 0; 301 in_kernel = 0;
300 esp = regs->esp; 302 esp = regs->esp;
@@ -313,8 +315,8 @@ void show_registers(struct pt_regs *regs)
313 regs->eax, regs->ebx, regs->ecx, regs->edx); 315 regs->eax, regs->ebx, regs->ecx, regs->edx);
314 printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", 316 printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
315 regs->esi, regs->edi, regs->ebp, esp); 317 regs->esi, regs->edi, regs->ebp, esp);
316 printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n", 318 printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n",
317 regs->xds & 0xffff, regs->xes & 0xffff, ss); 319 regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss);
318 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", 320 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
319 TASK_COMM_LEN, current->comm, current->pid, 321 TASK_COMM_LEN, current->comm, current->pid,
320 current_thread_info(), current, current->thread_info); 322 current_thread_info(), current, current->thread_info);
@@ -324,7 +326,8 @@ void show_registers(struct pt_regs *regs)
324 */ 326 */
325 if (in_kernel) { 327 if (in_kernel) {
326 u8 *eip; 328 u8 *eip;
327 int code_bytes = 64; 329 unsigned int code_prologue = code_bytes * 43 / 64;
330 unsigned int code_len = code_bytes;
328 unsigned char c; 331 unsigned char c;
329 332
330 printk("\n" KERN_EMERG "Stack: "); 333 printk("\n" KERN_EMERG "Stack: ");
@@ -332,14 +335,14 @@ void show_registers(struct pt_regs *regs)
332 335
333 printk(KERN_EMERG "Code: "); 336 printk(KERN_EMERG "Code: ");
334 337
335 eip = (u8 *)regs->eip - 43; 338 eip = (u8 *)regs->eip - code_prologue;
336 if (eip < (u8 *)PAGE_OFFSET || 339 if (eip < (u8 *)PAGE_OFFSET ||
337 probe_kernel_address(eip, c)) { 340 probe_kernel_address(eip, c)) {
338 /* try starting at EIP */ 341 /* try starting at EIP */
339 eip = (u8 *)regs->eip; 342 eip = (u8 *)regs->eip;
340 code_bytes = 32; 343 code_len = code_len - code_prologue + 1;
341 } 344 }
342 for (i = 0; i < code_bytes; i++, eip++) { 345 for (i = 0; i < code_len; i++, eip++) {
343 if (eip < (u8 *)PAGE_OFFSET || 346 if (eip < (u8 *)PAGE_OFFSET ||
344 probe_kernel_address(eip, c)) { 347 probe_kernel_address(eip, c)) {
345 printk(" Bad EIP value."); 348 printk(" Bad EIP value.");
@@ -1191,3 +1194,13 @@ static int __init kstack_setup(char *s)
1191 return 1; 1194 return 1;
1192} 1195}
1193__setup("kstack=", kstack_setup); 1196__setup("kstack=", kstack_setup);
1197
1198static int __init code_bytes_setup(char *s)
1199{
1200 code_bytes = simple_strtoul(s, NULL, 0);
1201 if (code_bytes > 8192)
1202 code_bytes = 8192;
1203
1204 return 1;
1205}
1206__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c
index 2cfc7b09b925..46f752a8bbf3 100644
--- a/arch/i386/kernel/tsc.c
+++ b/arch/i386/kernel/tsc.c
@@ -23,6 +23,7 @@
23 * an extra value to store the TSC freq 23 * an extra value to store the TSC freq
24 */ 24 */
25unsigned int tsc_khz; 25unsigned int tsc_khz;
26unsigned long long (*custom_sched_clock)(void);
26 27
27int tsc_disable; 28int tsc_disable;
28 29
@@ -107,14 +108,14 @@ unsigned long long sched_clock(void)
107{ 108{
108 unsigned long long this_offset; 109 unsigned long long this_offset;
109 110
111 if (unlikely(custom_sched_clock))
112 return (*custom_sched_clock)();
113
110 /* 114 /*
111 * in the NUMA case we dont use the TSC as they are not 115 * Fall back to jiffies if there's no TSC available:
112 * synchronized across all CPUs.
113 */ 116 */
114#ifndef CONFIG_NUMA 117 if (unlikely(tsc_disable))
115 if (!cpu_khz || check_tsc_unstable()) 118 /* No locking but a rare wrong value is not a big deal: */
116#endif
117 /* no locking but a rare wrong value is not a big deal */
118 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); 119 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
119 120
120 /* read the Time Stamp Counter: */ 121 /* read the Time Stamp Counter: */
@@ -194,13 +195,13 @@ EXPORT_SYMBOL(recalibrate_cpu_khz);
194void __init tsc_init(void) 195void __init tsc_init(void)
195{ 196{
196 if (!cpu_has_tsc || tsc_disable) 197 if (!cpu_has_tsc || tsc_disable)
197 return; 198 goto out_no_tsc;
198 199
199 cpu_khz = calculate_cpu_khz(); 200 cpu_khz = calculate_cpu_khz();
200 tsc_khz = cpu_khz; 201 tsc_khz = cpu_khz;
201 202
202 if (!cpu_khz) 203 if (!cpu_khz)
203 return; 204 goto out_no_tsc;
204 205
205 printk("Detected %lu.%03lu MHz processor.\n", 206 printk("Detected %lu.%03lu MHz processor.\n",
206 (unsigned long)cpu_khz / 1000, 207 (unsigned long)cpu_khz / 1000,
@@ -208,6 +209,15 @@ void __init tsc_init(void)
208 209
209 set_cyc2ns_scale(cpu_khz); 210 set_cyc2ns_scale(cpu_khz);
210 use_tsc_delay(); 211 use_tsc_delay();
212 return;
213
214out_no_tsc:
215 /*
216 * Set the tsc_disable flag if there's no TSC support, this
217 * makes it a fast flag for the kernel to see whether it
218 * should be using the TSC.
219 */
220 tsc_disable = 1;
211} 221}
212 222
213#ifdef CONFIG_CPU_FREQ 223#ifdef CONFIG_CPU_FREQ
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index be2f96e67f78..d1b8f2b7aea6 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -96,12 +96,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
96{ 96{
97 int ret = 0; 97 int ret = 0;
98 98
99 /* kernel_vm86_regs is missing xfs, so copy everything up to 99 /* kernel_vm86_regs is missing xgs, so copy everything up to
100 (but not including) xgs, and then rest after xgs. */ 100 (but not including) orig_eax, and then rest including orig_eax. */
101 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.xgs)); 101 ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax));
102 ret += copy_to_user(&user->__null_gs, &regs->pt.xgs, 102 ret += copy_to_user(&user->orig_eax, &regs->pt.orig_eax,
103 sizeof(struct kernel_vm86_regs) - 103 sizeof(struct kernel_vm86_regs) -
104 offsetof(struct kernel_vm86_regs, pt.xgs)); 104 offsetof(struct kernel_vm86_regs, pt.orig_eax));
105 105
106 return ret; 106 return ret;
107} 107}
@@ -113,12 +113,13 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
113{ 113{
114 int ret = 0; 114 int ret = 0;
115 115
116 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.xgs)); 116 /* copy eax-xfs inclusive */
117 ret += copy_from_user(&regs->pt.xgs, &user->__null_gs, 117 ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax));
118 /* copy orig_eax-__gsh+extra */
119 ret += copy_from_user(&regs->pt.orig_eax, &user->orig_eax,
118 sizeof(struct kernel_vm86_regs) - 120 sizeof(struct kernel_vm86_regs) -
119 offsetof(struct kernel_vm86_regs, pt.xgs) + 121 offsetof(struct kernel_vm86_regs, pt.orig_eax) +
120 extra); 122 extra);
121
122 return ret; 123 return ret;
123} 124}
124 125
@@ -157,8 +158,8 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs)
157 158
158 ret = KVM86->regs32; 159 ret = KVM86->regs32;
159 160
160 loadsegment(fs, current->thread.saved_fs); 161 ret->xfs = current->thread.saved_fs;
161 ret->xgs = current->thread.saved_gs; 162 loadsegment(gs, current->thread.saved_gs);
162 163
163 return ret; 164 return ret;
164} 165}
@@ -285,9 +286,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
285 */ 286 */
286 info->regs.pt.xds = 0; 287 info->regs.pt.xds = 0;
287 info->regs.pt.xes = 0; 288 info->regs.pt.xes = 0;
288 info->regs.pt.xgs = 0; 289 info->regs.pt.xfs = 0;
289 290
290/* we are clearing fs later just before "jmp resume_userspace", 291/* we are clearing gs later just before "jmp resume_userspace",
291 * because it is not saved/restored. 292 * because it is not saved/restored.
292 */ 293 */
293 294
@@ -321,8 +322,8 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
321 */ 322 */
322 info->regs32->eax = 0; 323 info->regs32->eax = 0;
323 tsk->thread.saved_esp0 = tsk->thread.esp0; 324 tsk->thread.saved_esp0 = tsk->thread.esp0;
324 savesegment(fs, tsk->thread.saved_fs); 325 tsk->thread.saved_fs = info->regs32->xfs;
325 tsk->thread.saved_gs = info->regs32->xgs; 326 savesegment(gs, tsk->thread.saved_gs);
326 327
327 tss = &per_cpu(init_tss, get_cpu()); 328 tss = &per_cpu(init_tss, get_cpu());
328 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; 329 tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
@@ -342,7 +343,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
342 __asm__ __volatile__( 343 __asm__ __volatile__(
343 "movl %0,%%esp\n\t" 344 "movl %0,%%esp\n\t"
344 "movl %1,%%ebp\n\t" 345 "movl %1,%%ebp\n\t"
345 "mov %2, %%fs\n\t" 346 "mov %2, %%gs\n\t"
346 "jmp resume_userspace" 347 "jmp resume_userspace"
347 : /* no outputs */ 348 : /* no outputs */
348 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); 349 :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
diff --git a/arch/i386/kernel/vmi.c b/arch/i386/kernel/vmi.c
new file mode 100644
index 000000000000..bb5a7abf949c
--- /dev/null
+++ b/arch/i386/kernel/vmi.c
@@ -0,0 +1,949 @@
1/*
2 * VMI specific paravirt-ops implementation
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to zach@vmware.com
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/license.h>
27#include <linux/cpu.h>
28#include <linux/bootmem.h>
29#include <linux/mm.h>
30#include <asm/vmi.h>
31#include <asm/io.h>
32#include <asm/fixmap.h>
33#include <asm/apicdef.h>
34#include <asm/apic.h>
35#include <asm/processor.h>
36#include <asm/timer.h>
37#include <asm/vmi_time.h>
38
39/* Convenient for calling VMI functions indirectly in the ROM */
40typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
41typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
42
43#define call_vrom_func(rom,func) \
44 (((VROMFUNC *)(rom->func))())
45
46#define call_vrom_long_func(rom,func,arg) \
47 (((VROMLONGFUNC *)(rom->func)) (arg))
48
/* ROM header used as the base for call_vrom_func()/call_vrom_long_func(). */
static struct vrom_header *vmi_rom;

/* NOTE(review): not referenced in the visible part of this file — confirm use. */
static int license_gplok;
static int disable_nodelay;

/*
 * When non-zero, vmi_cpuid() masks the corresponding feature out of the
 * EDX value returned for CPUID leaf 1.
 */
static int disable_pge;
static int disable_pse;
static int disable_sep;
static int disable_tsc;
static int disable_mtrr;
57
58/* Cached VMI operations */
59struct {
60 void (*cpuid)(void /* non-c */);
61 void (*_set_ldt)(u32 selector);
62 void (*set_tr)(u32 selector);
63 void (*set_kernel_stack)(u32 selector, u32 esp0);
64 void (*allocate_page)(u32, u32, u32, u32, u32);
65 void (*release_page)(u32, u32);
66 void (*set_pte)(pte_t, pte_t *, unsigned);
67 void (*update_pte)(pte_t *, unsigned);
68 void (*set_linear_mapping)(int, u32, u32, u32);
69 void (*flush_tlb)(int);
70 void (*set_initial_ap_state)(int, int);
71 void (*halt)(void);
72} vmi_ops;
73
74/* XXX move this to alternative.h */
75extern struct paravirt_patch __start_parainstructions[],
76 __stop_parainstructions[];
77
/*
 * VMI patching routines.
 *
 * Opcode bytes written by the patching code below.
 */
#define MNEM_CALL 0xe8
#define MNEM_JMP 0xe9
#define MNEM_RET 0xc3

/*
 * Out-of-line helper used by vmi_patch() when a save_flags_irq_disable
 * site is too short to hold two 5-byte calls: two call instructions
 * (displacements initially zero) followed by a ret.
 */
static char irq_save_disable_callout[] = {
	MNEM_CALL, 0, 0, 0, 0,
	MNEM_CALL, 0, 0, 0, 0,
	MNEM_RET
};
/* Byte offsets of the first and second call within the callout above. */
#define IRQ_PATCH_INT_MASK 0
#define IRQ_PATCH_DISABLE 5
92
/*
 * Fill in the relative displacement of a 5-byte call/jmp located at
 * 'eip' so that it targets 'dest'.  The displacement is stored in the
 * bytes immediately after the opcode byte and is measured from the end
 * of the instruction (eip + 5).
 */
static inline void patch_offset(unsigned char *eip, unsigned char *dest)
{
	unsigned long *operand = (unsigned long *)(eip + 1);

	*operand = dest - eip - 5;
}
97
98static unsigned patch_internal(int call, unsigned len, void *insns)
99{
100 u64 reloc;
101 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
102 reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
103 switch(rel->type) {
104 case VMI_RELOCATION_CALL_REL:
105 BUG_ON(len < 5);
106 *(char *)insns = MNEM_CALL;
107 patch_offset(insns, rel->eip);
108 return 5;
109
110 case VMI_RELOCATION_JUMP_REL:
111 BUG_ON(len < 5);
112 *(char *)insns = MNEM_JMP;
113 patch_offset(insns, rel->eip);
114 return 5;
115
116 case VMI_RELOCATION_NOP:
117 /* obliterate the whole thing */
118 return 0;
119
120 case VMI_RELOCATION_NONE:
121 /* leave native code in place */
122 break;
123
124 default:
125 BUG();
126 }
127 return len;
128}
129
130/*
131 * Apply patch if appropriate, return length of new instruction
132 * sequence. The callee does nop padding for us.
133 */
134static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, unsigned len)
135{
136 switch (type) {
137 case PARAVIRT_IRQ_DISABLE:
138 return patch_internal(VMI_CALL_DisableInterrupts, len, insns);
139 case PARAVIRT_IRQ_ENABLE:
140 return patch_internal(VMI_CALL_EnableInterrupts, len, insns);
141 case PARAVIRT_RESTORE_FLAGS:
142 return patch_internal(VMI_CALL_SetInterruptMask, len, insns);
143 case PARAVIRT_SAVE_FLAGS:
144 return patch_internal(VMI_CALL_GetInterruptMask, len, insns);
145 case PARAVIRT_SAVE_FLAGS_IRQ_DISABLE:
146 if (len >= 10) {
147 patch_internal(VMI_CALL_GetInterruptMask, len, insns);
148 patch_internal(VMI_CALL_DisableInterrupts, len-5, insns+5);
149 return 10;
150 } else {
151 /*
152 * You bastards didn't leave enough room to
153 * patch save_flags_irq_disable inline. Patch
154 * to a helper
155 */
156 BUG_ON(len < 5);
157 *(char *)insns = MNEM_CALL;
158 patch_offset(insns, irq_save_disable_callout);
159 return 5;
160 }
161 case PARAVIRT_INTERRUPT_RETURN:
162 return patch_internal(VMI_CALL_IRET, len, insns);
163 case PARAVIRT_STI_SYSEXIT:
164 return patch_internal(VMI_CALL_SYSEXIT, len, insns);
165 default:
166 break;
167 }
168 return len;
169}
170
171/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
172static void vmi_cpuid(unsigned int *eax, unsigned int *ebx,
173 unsigned int *ecx, unsigned int *edx)
174{
175 int override = 0;
176 if (*eax == 1)
177 override = 1;
178 asm volatile ("call *%6"
179 : "=a" (*eax),
180 "=b" (*ebx),
181 "=c" (*ecx),
182 "=d" (*edx)
183 : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid));
184 if (override) {
185 if (disable_pse)
186 *edx &= ~X86_FEATURE_PSE;
187 if (disable_pge)
188 *edx &= ~X86_FEATURE_PGE;
189 if (disable_sep)
190 *edx &= ~X86_FEATURE_SEP;
191 if (disable_tsc)
192 *edx &= ~X86_FEATURE_TSC;
193 if (disable_mtrr)
194 *edx &= ~X86_FEATURE_MTRR;
195 }
196}
197
198static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
199{
200 if (gdt[nr].a != new->a || gdt[nr].b != new->b)
201 write_gdt_entry(gdt, nr, new->a, new->b);
202}
203
204static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
205{
206 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
207 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
208 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
209 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
210}
211
212static void vmi_set_ldt(const void *addr, unsigned entries)
213{
214 unsigned cpu = smp_processor_id();
215 u32 low, high;
216
217 pack_descriptor(&low, &high, (unsigned long)addr,
218 entries * sizeof(struct desc_struct) - 1,
219 DESCTYPE_LDT, 0);
220 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
221 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
222}
223
224static void vmi_set_tr(void)
225{
226 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
227}
228
229static void vmi_load_esp0(struct tss_struct *tss,
230 struct thread_struct *thread)
231{
232 tss->esp0 = thread->esp0;
233
234 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
235 if (unlikely(tss->ss1 != thread->sysenter_cs)) {
236 tss->ss1 = thread->sysenter_cs;
237 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
238 }
239 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->esp0);
240}
241
242static void vmi_flush_tlb_user(void)
243{
244 vmi_ops.flush_tlb(VMI_FLUSH_TLB);
245}
246
247static void vmi_flush_tlb_kernel(void)
248{
249 vmi_ops.flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
250}
251
/*
 * Deliberately empty stub; used for delays and for calls that have no
 * implementation.
 */
static void vmi_nop(void)
{
	/* nothing to do */
}
256
/* For NO_IDLE_HZ, we stop the clock when halting the kernel */
#ifdef CONFIG_NO_IDLE_HZ
/*
 * Halt via the VMI layer.  If vmi_stop_hz_timer() reported that the
 * timer was stopped, account the elapsed time and restart it, with
 * interrupts disabled, once the halt returns.
 */
static fastcall void vmi_safe_halt(void)
{
	int timer_stopped = vmi_stop_hz_timer();

	vmi_ops.halt();

	if (!timer_stopped)
		return;

	local_irq_disable();
	vmi_account_time_restart_hz_timer();
	local_irq_enable();
}
#endif
270
271#ifdef CONFIG_DEBUG_PAGE_TYPE
272
273#ifdef CONFIG_X86_PAE
274#define MAX_BOOT_PTS (2048+4+1)
275#else
276#define MAX_BOOT_PTS (1024+1)
277#endif
278
279/*
280 * During boot, mem_map is not yet available in paging_init, so stash
281 * all the boot page allocations here.
282 */
283static struct {
284 u32 pfn;
285 int type;
286} boot_page_allocations[MAX_BOOT_PTS];
287static int num_boot_page_allocations;
288static int boot_allocations_applied;
289
290void vmi_apply_boot_page_allocations(void)
291{
292 int i;
293 BUG_ON(!mem_map);
294 for (i = 0; i < num_boot_page_allocations; i++) {
295 struct page *page = pfn_to_page(boot_page_allocations[i].pfn);
296 page->type = boot_page_allocations[i].type;
297 page->type = boot_page_allocations[i].type &
298 ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
299 }
300 boot_allocations_applied = 1;
301}
302
303static void record_page_type(u32 pfn, int type)
304{
305 BUG_ON(num_boot_page_allocations >= MAX_BOOT_PTS);
306 boot_page_allocations[num_boot_page_allocations].pfn = pfn;
307 boot_page_allocations[num_boot_page_allocations].type = type;
308 num_boot_page_allocations++;
309}
310
311static void check_zeroed_page(u32 pfn, int type, struct page *page)
312{
313 u32 *ptr;
314 int i;
315 int limit = PAGE_SIZE / sizeof(int);
316
317 if (page_address(page))
318 ptr = (u32 *)page_address(page);
319 else
320 ptr = (u32 *)__va(pfn << PAGE_SHIFT);
321 /*
322 * When cloning the root in non-PAE mode, only the userspace
323 * pdes need to be zeroed.
324 */
325 if (type & VMI_PAGE_CLONE)
326 limit = USER_PTRS_PER_PGD;
327 for (i = 0; i < limit; i++)
328 BUG_ON(ptr[i]);
329}
330
331/*
332 * We stash the page type into struct page so we can verify the page
333 * types are used properly.
334 */
335static void vmi_set_page_type(u32 pfn, int type)
336{
337 /* PAE can have multiple roots per page - don't track */
338 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
339 return;
340
341 if (boot_allocations_applied) {
342 struct page *page = pfn_to_page(pfn);
343 if (type != VMI_PAGE_NORMAL)
344 BUG_ON(page->type);
345 else
346 BUG_ON(page->type == VMI_PAGE_NORMAL);
347 page->type = type & ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
348 if (type & VMI_PAGE_ZEROED)
349 check_zeroed_page(pfn, type, page);
350 } else {
351 record_page_type(pfn, type);
352 }
353}
354
355static void vmi_check_page_type(u32 pfn, int type)
356{
357 /* PAE can have multiple roots per page - skip checks */
358 if (PTRS_PER_PMD > 1 && (type & VMI_PAGE_PDP))
359 return;
360
361 type &= ~(VMI_PAGE_ZEROED | VMI_PAGE_CLONE);
362 if (boot_allocations_applied) {
363 struct page *page = pfn_to_page(pfn);
364 BUG_ON((page->type ^ type) & VMI_PAGE_PAE);
365 BUG_ON(type == VMI_PAGE_NORMAL && page->type);
366 BUG_ON((type & page->type) == 0);
367 }
368}
369#else
/* Page type tracking compiled out: both hooks expand to empty statements. */
#define vmi_set_page_type(pfn,type) do { } while (0)
#define vmi_check_page_type(pfn,type) do { } while (0)
372#endif
373
374static void vmi_allocate_pt(u32 pfn)
375{
376 vmi_set_page_type(pfn, VMI_PAGE_L1);
377 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
378}
379
380static void vmi_allocate_pd(u32 pfn)
381{
382 /*
383 * This call comes in very early, before mem_map is setup.
384 * It is called only for swapper_pg_dir, which already has
385 * data on it.
386 */
387 vmi_set_page_type(pfn, VMI_PAGE_L2);
388 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
389}
390
391static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
392{
393 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
394 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
395 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
396}
397
398static void vmi_release_pt(u32 pfn)
399{
400 vmi_ops.release_page(pfn, VMI_PAGE_L1);
401 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
402}
403
404static void vmi_release_pd(u32 pfn)
405{
406 vmi_ops.release_page(pfn, VMI_PAGE_L2);
407 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
408}
409
/*
 * Helper macros for MMU update flags.  We can defer updates until a flush
 * or page invalidation only if the update is to the current address space
 * (otherwise, there is no flush).  We must check against init_mm, since
 * this could be a kernel update, which usually passes init_mm, although
 * sometimes this check can be skipped if we know the particular function
 * is only called on user mode PTEs.  We could change the kernel to pass
 * current->active_mm here, but in particular, I was unsure if changing
 * mm/highmem.c to do this would still be correct on other architectures.
 *
 * is_current_as(mm, mustbeuser):
 *	true if 'mm' is the active address space, or - unless 'mustbeuser'
 *	is set - if it is init_mm.
 * vmi_flags_addr(mm, addr, level, user):
 *	'level' plus, for the current address space, VMI_PAGE_CURRENT_AS
 *	and the VMI_PAGE_VA_MASK bits of 'addr'.
 * vmi_flags_addr_defer(...):
 *	as above, additionally setting VMI_PAGE_DEFER for the current
 *	address space.
 *
 * Every macro argument is parenthesized so that arbitrary expressions
 * can be passed without precedence surprises.
 */
#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm ||    \
                                       (!(mustbeuser) && (mm) == &init_mm))
#define vmi_flags_addr(mm, addr, level, user)                           \
        ((level) | (is_current_as((mm), (user)) ?                       \
                (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
#define vmi_flags_addr_defer(mm, addr, level, user)                     \
        ((level) | (is_current_as((mm), (user)) ?                       \
                (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
428
429static void vmi_update_pte(struct mm_struct *mm, u32 addr, pte_t *ptep)
430{
431 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
432 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
433}
434
435static void vmi_update_pte_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
436{
437 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
438 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
439}
440
441static void vmi_set_pte(pte_t *ptep, pte_t pte)
442{
443 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
444 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE | VMI_PAGE_PD);
445 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
446}
447
448static void vmi_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
449{
450 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
451 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
452}
453
454static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
455{
456#ifdef CONFIG_X86_PAE
457 const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 };
458 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD);
459#else
460 const pte_t pte = { pmdval.pud.pgd.pgd };
461 vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PGD);
462#endif
463 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
464}
465
466#ifdef CONFIG_X86_PAE
467
468static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
469{
470 /*
471 * XXX This is called from set_pmd_pte, but at both PT
472 * and PD layers so the VMI_PAGE_PT flag is wrong. But
473 * it is only called for large page mapping changes,
474 * the Xen backend, doesn't support large pages, and the
475 * ESX backend doesn't depend on the flag.
476 */
477 set_64bit((unsigned long long *)ptep,pte_val(pteval));
478 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
479}
480
481static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
482{
483 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
484 vmi_ops.set_pte(pte, ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 1));
485}
486
487static void vmi_set_pud(pud_t *pudp, pud_t pudval)
488{
489 /* Um, eww */
490 const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 };
491 vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD);
492 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
493}
494
495static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
496{
497 const pte_t pte = { 0 };
498 vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE);
499 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
500}
501
502void vmi_pmd_clear(pmd_t *pmd)
503{
504 const pte_t pte = { 0 };
505 vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD);
506 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
507}
508#endif
509
510#ifdef CONFIG_SMP
511struct vmi_ap_state ap;
512extern void setup_pda(void);
513
514static void __init /* XXX cpu hotplug */
515vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
516 unsigned long start_esp)
517{
518 /* Default everything to zero. This is fine for most GPRs. */
519 memset(&ap, 0, sizeof(struct vmi_ap_state));
520
521 ap.gdtr_limit = GDT_SIZE - 1;
522 ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
523
524 ap.idtr_limit = IDT_ENTRIES * 8 - 1;
525 ap.idtr_base = (unsigned long) idt_table;
526
527 ap.ldtr = 0;
528
529 ap.cs = __KERNEL_CS;
530 ap.eip = (unsigned long) start_eip;
531 ap.ss = __KERNEL_DS;
532 ap.esp = (unsigned long) start_esp;
533
534 ap.ds = __USER_DS;
535 ap.es = __USER_DS;
536 ap.fs = __KERNEL_PDA;
537 ap.gs = 0;
538
539 ap.eflags = 0;
540
541 setup_pda();
542
543#ifdef CONFIG_X86_PAE
544 /* efer should match BSP efer. */
545 if (cpu_has_nx) {
546 unsigned l, h;
547 rdmsr(MSR_EFER, l, h);
548 ap.efer = (unsigned long long) h << 32 | l;
549 }
550#endif
551
552 ap.cr3 = __pa(swapper_pg_dir);
553 /* Protected mode, paging, AM, WP, NE, MP. */
554 ap.cr0 = 0x80050023;
555 ap.cr4 = mmu_cr4_features;
556 vmi_ops.set_initial_ap_state(__pa(&ap), phys_apicid);
557}
558#endif
559
560static inline int __init check_vmi_rom(struct vrom_header *rom)
561{
562 struct pci_header *pci;
563 struct pnp_header *pnp;
564 const char *manufacturer = "UNKNOWN";
565 const char *product = "UNKNOWN";
566 const char *license = "unspecified";
567
568 if (rom->rom_signature != 0xaa55)
569 return 0;
570 if (rom->vrom_signature != VMI_SIGNATURE)
571 return 0;
572 if (rom->api_version_maj != VMI_API_REV_MAJOR ||
573 rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
574 printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
575 rom->api_version_maj,
576 rom->api_version_min);
577 return 0;
578 }
579
580 /*
581 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
582 * the PCI header and device type to make sure this is really a
583 * VMI device.
584 */
585 if (!rom->pci_header_offs) {
586 printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
587 return 0;
588 }
589
590 pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
591 if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
592 pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
593 /* Allow it to run... anyways, but warn */
594 printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
595 }
596
597 if (rom->pnp_header_offs) {
598 pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
599 if (pnp->manufacturer_offset)
600 manufacturer = (const char *)rom+pnp->manufacturer_offset;
601 if (pnp->product_offset)
602 product = (const char *)rom+pnp->product_offset;
603 }
604
605 if (rom->license_offs)
606 license = (char *)rom+rom->license_offs;
607
608 printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
609 manufacturer, product,
610 rom->api_version_maj, rom->api_version_min,
611 pci->rom_version_maj, pci->rom_version_min);
612
613 license_gplok = license_is_gpl_compatible(license);
614 if (!license_gplok) {
615 printk(KERN_WARNING "VMI: ROM license '%s' taints kernel... "
616 "inlining disabled\n",
617 license);
618 add_taint(TAINT_PROPRIETARY_MODULE);
619 }
620 return 1;
621}
622
623/*
624 * Probe for the VMI option ROM
625 */
626static inline int __init probe_vmi_rom(void)
627{
628 unsigned long base;
629
630 /* VMI ROM is in option ROM area, check signature */
631 for (base = 0xC0000; base < 0xE0000; base += 2048) {
632 struct vrom_header *romstart;
633 romstart = (struct vrom_header *)isa_bus_to_virt(base);
634 if (check_vmi_rom(romstart)) {
635 vmi_rom = romstart;
636 return 1;
637 }
638 }
639 return 0;
640}
641
642/*
643 * VMI setup common to all processors
644 */
645void vmi_bringup(void)
646{
647 /* We must establish the lowmem mapping for MMU ops to work */
648 if (vmi_rom)
649 vmi_ops.set_linear_mapping(0, __PAGE_OFFSET, max_low_pfn, 0);
650}
651
652/*
653 * Return a pointer to the VMI function or a NOP stub
654 */
655static void *vmi_get_function(int vmicall)
656{
657 u64 reloc;
658 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
659 reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
660 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
661 if (rel->type == VMI_RELOCATION_CALL_REL)
662 return (void *)rel->eip;
663 else
664 return (void *)vmi_nop;
665}
666
667/*
668 * Helper macro for making the VMI paravirt-ops fill code readable.
669 * For unimplemented operations, fall back to default.
670 */
671#define para_fill(opname, vmicall) \
672do { \
673 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
674 VMI_CALL_##vmicall); \
675 if (rel->type != VMI_RELOCATION_NONE) { \
676 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL); \
677 paravirt_ops.opname = (void *)rel->eip; \
678 } \
679} while (0)
680
681/*
682 * Activate the VMI interface and switch into paravirtualized mode
683 */
684static inline int __init activate_vmi(void)
685{
686 short kernel_cs;
687 u64 reloc;
688 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
689
690 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
691 printk(KERN_ERR "VMI ROM failed to initialize!");
692 return 0;
693 }
694 savesegment(cs, kernel_cs);
695
696 paravirt_ops.paravirt_enabled = 1;
697 paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
698
699 paravirt_ops.patch = vmi_patch;
700 paravirt_ops.name = "vmi";
701
702 /*
703 * Many of these operations are ABI compatible with VMI.
704 * This means we can fill in the paravirt-ops with direct
705 * pointers into the VMI ROM. If the calling convention for
706 * these operations changes, this code needs to be updated.
707 *
708 * Exceptions
709 * CPUID paravirt-op uses pointers, not the native ISA
710 * halt has no VMI equivalent; all VMI halts are "safe"
711 * no MSR support yet - just trap and emulate. VMI uses the
712 * same ABI as the native ISA, but Linux wants exceptions
713 * from bogus MSR read / write handled
714 * rdpmc is not yet used in Linux
715 */
716
717 /* CPUID is special, so very special */
718 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_CPUID);
719 if (rel->type != VMI_RELOCATION_NONE) {
720 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
721 vmi_ops.cpuid = (void *)rel->eip;
722 paravirt_ops.cpuid = vmi_cpuid;
723 }
724
725 para_fill(clts, CLTS);
726 para_fill(get_debugreg, GetDR);
727 para_fill(set_debugreg, SetDR);
728 para_fill(read_cr0, GetCR0);
729 para_fill(read_cr2, GetCR2);
730 para_fill(read_cr3, GetCR3);
731 para_fill(read_cr4, GetCR4);
732 para_fill(write_cr0, SetCR0);
733 para_fill(write_cr2, SetCR2);
734 para_fill(write_cr3, SetCR3);
735 para_fill(write_cr4, SetCR4);
736 para_fill(save_fl, GetInterruptMask);
737 para_fill(restore_fl, SetInterruptMask);
738 para_fill(irq_disable, DisableInterrupts);
739 para_fill(irq_enable, EnableInterrupts);
740 /* irq_save_disable !!! sheer pain */
741 patch_offset(&irq_save_disable_callout[IRQ_PATCH_INT_MASK],
742 (char *)paravirt_ops.save_fl);
743 patch_offset(&irq_save_disable_callout[IRQ_PATCH_DISABLE],
744 (char *)paravirt_ops.irq_disable);
745#ifndef CONFIG_NO_IDLE_HZ
746 para_fill(safe_halt, Halt);
747#else
748 vmi_ops.halt = vmi_get_function(VMI_CALL_Halt);
749 paravirt_ops.safe_halt = vmi_safe_halt;
750#endif
751 para_fill(wbinvd, WBINVD);
752 /* paravirt_ops.read_msr = vmi_rdmsr */
753 /* paravirt_ops.write_msr = vmi_wrmsr */
754 para_fill(read_tsc, RDTSC);
755 /* paravirt_ops.rdpmc = vmi_rdpmc */
756
757 /* TR interface doesn't pass TR value */
758 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetTR);
759 if (rel->type != VMI_RELOCATION_NONE) {
760 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
761 vmi_ops.set_tr = (void *)rel->eip;
762 paravirt_ops.load_tr_desc = vmi_set_tr;
763 }
764
765 /* LDT is special, too */
766 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_SetLDT);
767 if (rel->type != VMI_RELOCATION_NONE) {
768 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
769 vmi_ops._set_ldt = (void *)rel->eip;
770 paravirt_ops.set_ldt = vmi_set_ldt;
771 }
772
773 para_fill(load_gdt, SetGDT);
774 para_fill(load_idt, SetIDT);
775 para_fill(store_gdt, GetGDT);
776 para_fill(store_idt, GetIDT);
777 para_fill(store_tr, GetTR);
778 paravirt_ops.load_tls = vmi_load_tls;
779 para_fill(write_ldt_entry, WriteLDTEntry);
780 para_fill(write_gdt_entry, WriteGDTEntry);
781 para_fill(write_idt_entry, WriteIDTEntry);
782 reloc = call_vrom_long_func(vmi_rom, get_reloc,
783 VMI_CALL_UpdateKernelStack);
784 if (rel->type != VMI_RELOCATION_NONE) {
785 BUG_ON(rel->type != VMI_RELOCATION_CALL_REL);
786 vmi_ops.set_kernel_stack = (void *)rel->eip;
787 paravirt_ops.load_esp0 = vmi_load_esp0;
788 }
789
790 para_fill(set_iopl_mask, SetIOPLMask);
791 paravirt_ops.io_delay = (void *)vmi_nop;
792 if (!disable_nodelay) {
793 paravirt_ops.const_udelay = (void *)vmi_nop;
794 }
795
796 para_fill(set_lazy_mode, SetLazyMode);
797
798 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_FlushTLB);
799 if (rel->type != VMI_RELOCATION_NONE) {
800 vmi_ops.flush_tlb = (void *)rel->eip;
801 paravirt_ops.flush_tlb_user = vmi_flush_tlb_user;
802 paravirt_ops.flush_tlb_kernel = vmi_flush_tlb_kernel;
803 }
804 para_fill(flush_tlb_single, InvalPage);
805
806 /*
807 * Until a standard flag format can be agreed on, we need to
808 * implement these as wrappers in Linux. Get the VMI ROM
809 * function pointers for the two backend calls.
810 */
811#ifdef CONFIG_X86_PAE
812 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
813 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
814#else
815 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
816 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
817#endif
818 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
819 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
820 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
821
822 paravirt_ops.alloc_pt = vmi_allocate_pt;
823 paravirt_ops.alloc_pd = vmi_allocate_pd;
824 paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone;
825 paravirt_ops.release_pt = vmi_release_pt;
826 paravirt_ops.release_pd = vmi_release_pd;
827 paravirt_ops.set_pte = vmi_set_pte;
828 paravirt_ops.set_pte_at = vmi_set_pte_at;
829 paravirt_ops.set_pmd = vmi_set_pmd;
830 paravirt_ops.pte_update = vmi_update_pte;
831 paravirt_ops.pte_update_defer = vmi_update_pte_defer;
832#ifdef CONFIG_X86_PAE
833 paravirt_ops.set_pte_atomic = vmi_set_pte_atomic;
834 paravirt_ops.set_pte_present = vmi_set_pte_present;
835 paravirt_ops.set_pud = vmi_set_pud;
836 paravirt_ops.pte_clear = vmi_pte_clear;
837 paravirt_ops.pmd_clear = vmi_pmd_clear;
838#endif
839 /*
840 * These MUST always be patched. Don't support indirect jumps
841 * through these operations, as the VMI interface may use either
842 * a jump or a call to get to these operations, depending on
843 * the backend. They are performance critical anyway, so requiring
844 * a patch is not a big problem.
845 */
846 paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0;
847 paravirt_ops.iret = (void *)0xbadbab0;
848
849#ifdef CONFIG_SMP
850 paravirt_ops.startup_ipi_hook = vmi_startup_ipi_hook;
851 vmi_ops.set_initial_ap_state = vmi_get_function(VMI_CALL_SetInitialAPState);
852#endif
853
854#ifdef CONFIG_X86_LOCAL_APIC
855 paravirt_ops.apic_read = vmi_get_function(VMI_CALL_APICRead);
856 paravirt_ops.apic_write = vmi_get_function(VMI_CALL_APICWrite);
857 paravirt_ops.apic_write_atomic = vmi_get_function(VMI_CALL_APICWrite);
858#endif
859
860 /*
861 * Check for VMI timer functionality by probing for a cycle frequency method
862 */
863 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
864 if (rel->type != VMI_RELOCATION_NONE) {
865 vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
866 vmi_timer_ops.get_cycle_counter =
867 vmi_get_function(VMI_CALL_GetCycleCounter);
868 vmi_timer_ops.get_wallclock =
869 vmi_get_function(VMI_CALL_GetWallclockTime);
870 vmi_timer_ops.wallclock_updated =
871 vmi_get_function(VMI_CALL_WallclockUpdated);
872 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
873 vmi_timer_ops.cancel_alarm =
874 vmi_get_function(VMI_CALL_CancelAlarm);
875 paravirt_ops.time_init = vmi_time_init;
876 paravirt_ops.get_wallclock = vmi_get_wallclock;
877 paravirt_ops.set_wallclock = vmi_set_wallclock;
878#ifdef CONFIG_X86_LOCAL_APIC
879 paravirt_ops.setup_boot_clock = vmi_timer_setup_boot_alarm;
880 paravirt_ops.setup_secondary_clock = vmi_timer_setup_secondary_alarm;
881#endif
882 custom_sched_clock = vmi_sched_clock;
883 }
884
885 /*
886 * Alternative instruction rewriting doesn't happen soon enough
887 * to convert VMI_IRET to a call instead of a jump; so we have
888 * to do this before IRQs get reenabled. Fortunately, it is
889 * idempotent.
890 */
891 apply_paravirt(__start_parainstructions, __stop_parainstructions);
892
893 vmi_bringup();
894
895 return 1;
896}
897
898#undef para_fill
899
900void __init vmi_init(void)
901{
902 unsigned long flags;
903
904 if (!vmi_rom)
905 probe_vmi_rom();
906 else
907 check_vmi_rom(vmi_rom);
908
909 /* In case probing for or validating the ROM failed, basil */
910 if (!vmi_rom)
911 return;
912
913 reserve_top_address(-vmi_rom->virtual_top);
914
915 local_irq_save(flags);
916 activate_vmi();
917#ifdef CONFIG_SMP
918 no_timer_check = 1;
919#endif
920 local_irq_restore(flags & X86_EFLAGS_IF);
921}
922
923static int __init parse_vmi(char *arg)
924{
925 if (!arg)
926 return -EINVAL;
927
928 if (!strcmp(arg, "disable_nodelay"))
929 disable_nodelay = 1;
930 else if (!strcmp(arg, "disable_pge")) {
931 clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
932 disable_pge = 1;
933 } else if (!strcmp(arg, "disable_pse")) {
934 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
935 disable_pse = 1;
936 } else if (!strcmp(arg, "disable_sep")) {
937 clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability);
938 disable_sep = 1;
939 } else if (!strcmp(arg, "disable_tsc")) {
940 clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
941 disable_tsc = 1;
942 } else if (!strcmp(arg, "disable_mtrr")) {
943 clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability);
944 disable_mtrr = 1;
945 }
946 return 0;
947}
948
949early_param("vmi", parse_vmi);
diff --git a/arch/i386/kernel/vmitime.c b/arch/i386/kernel/vmitime.c
new file mode 100644
index 000000000000..2e2d8dbcbd68
--- /dev/null
+++ b/arch/i386/kernel/vmitime.c
@@ -0,0 +1,499 @@
1/*
2 * VMI paravirtual timer support routines.
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to dhecht@vmware.com
22 *
23 */
24
25/*
26 * Portions of this code from arch/i386/kernel/timers/timer_tsc.c.
27 * Portions of the CONFIG_NO_IDLE_HZ code from arch/s390/kernel/time.c.
28 * See comments there for proper credits.
29 */
30
31#include <linux/spinlock.h>
32#include <linux/init.h>
33#include <linux/errno.h>
34#include <linux/jiffies.h>
35#include <linux/interrupt.h>
36#include <linux/kernel_stat.h>
37#include <linux/rcupdate.h>
38#include <linux/clocksource.h>
39
40#include <asm/timer.h>
41#include <asm/io.h>
42#include <asm/apic.h>
43#include <asm/div64.h>
44#include <asm/timer.h>
45#include <asm/desc.h>
46
47#include <asm/vmi.h>
48#include <asm/vmi_time.h>
49
50#include <mach_timer.h>
51#include <io_ports.h>
52
/*
 * Alarm wiring flag passed to set_alarm() by the CONFIG_NO_IDLE_HZ code:
 * the LVTT wiring when a local APIC is configured, the IRQ0 wiring
 * otherwise.
 */
53#ifdef CONFIG_X86_LOCAL_APIC
54#define VMI_ALARM_WIRING VMI_ALARM_WIRED_LVTT
55#else
56#define VMI_ALARM_WIRING VMI_ALARM_WIRED_IRQ0
57#endif
58
59/* Cached VMI timer operations (ROM entry points used by this file). */
60struct vmi_timer_ops vmi_timer_ops;
61
62#ifdef CONFIG_NO_IDLE_HZ
63
64/* /proc/sys/kernel/hz_timer state. Non-zero keeps the periodic timer running in idle. */
65int sysctl_hz_timer;
66
67/* Some stats: per-cpu count of no-hz idle periods, jiffies spent in them, and the jiffies value when the current one began. */
68static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_irqs);
69static DEFINE_PER_CPU(unsigned long, vmi_idle_no_hz_jiffies);
70static DEFINE_PER_CPU(unsigned long, idle_start_jiffies);
71
72#endif /* CONFIG_NO_IDLE_HZ */
73
74/* Number of alarms per second. By default this is CONFIG_VMI_ALARM_HZ; the "vmi_timer_alarm_hz=" boot option overrides it. */
75static int alarm_hz = CONFIG_VMI_ALARM_HZ;
76
77/* Cache of the value get_cycle_frequency / HZ. Set in vmi_time_init(). */
78static signed long long cycles_per_jiffy;
79
80/* Cache of the value get_cycle_frequency / alarm_hz. Set in vmi_time_init(). */
81static signed long long cycles_per_alarm;
82
83/* The number of cycles accounted for by the 'jiffies'/'xtime' count.
84 * Protected by xtime_lock. */
85static unsigned long long real_cycles_accounted_system;
86
87/* The number of cycles accounted for by update_process_times(), per cpu. */
88static DEFINE_PER_CPU(unsigned long long, process_times_cycles_accounted_cpu);
89
90/* The number of stolen cycles accounted, per cpu. */
91static DEFINE_PER_CPU(unsigned long long, stolen_cycles_accounted_cpu);
92
93/* Clock source. */
94static cycle_t read_real_cycles(void)
95{
96 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
97}
98
99static cycle_t read_available_cycles(void)
100{
101 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE);
102}
103
104#if 0
105static cycle_t read_stolen_cycles(void)
106{
107 return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN);
108}
109#endif /* 0 */
110
111static struct clocksource clocksource_vmi = {
112 .name = "vmi-timer",
113 .rating = 450,
114 .read = read_real_cycles,
115 .mask = CLOCKSOURCE_MASK(64),
116 .mult = 0, /* to be set */
117 .shift = 22,
118 .is_continuous = 1,
119};
120
121
122/* Timer interrupt handler. */
123static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id);
124
125static struct irqaction vmi_timer_irq = {
126 vmi_timer_interrupt,
127 SA_INTERRUPT,
128 CPU_MASK_NONE,
129 "VMI-alarm",
130 NULL,
131 NULL
132};
133
134/* Alarm rate */
135static int __init vmi_timer_alarm_rate_setup(char* str)
136{
137 int alarm_rate;
138 if (get_option(&str, &alarm_rate) == 1 && alarm_rate > 0) {
139 alarm_hz = alarm_rate;
140 printk(KERN_WARNING "VMI timer alarm HZ set to %d\n", alarm_hz);
141 }
142 return 1;
143}
144__setup("vmi_timer_alarm_hz=", vmi_timer_alarm_rate_setup);
145
146
147/* Initialization */
148static void vmi_get_wallclock_ts(struct timespec *ts)
149{
150 unsigned long long wallclock;
151 wallclock = vmi_timer_ops.get_wallclock(); // nsec units
152 ts->tv_nsec = do_div(wallclock, 1000000000);
153 ts->tv_sec = wallclock;
154}
155
156static void update_xtime_from_wallclock(void)
157{
158 struct timespec ts;
159 vmi_get_wallclock_ts(&ts);
160 do_settimeofday(&ts);
161}
162
163unsigned long vmi_get_wallclock(void)
164{
165 struct timespec ts;
166 vmi_get_wallclock_ts(&ts);
167 return ts.tv_sec;
168}
169
/*
 * Setting the wallclock is not implemented here: @now is ignored and
 * -1 is always returned.
 */
int vmi_set_wallclock(unsigned long now)
{
	const int not_supported = -1;

	return not_supported;
}
174
/*
 * Scheduler clock hook: returns the VMI_CYCLES_AVAILABLE counter.
 * NOTE(review): the value is a raw cycle count, not converted to time
 * units -- confirm this is what the scheduler clock consumer expects.
 */
unsigned long long vmi_sched_clock(void)
{
	const unsigned long long available = read_available_cycles();

	return available;
}
179
180void __init vmi_time_init(void)
181{
182 unsigned long long cycles_per_sec, cycles_per_msec;
183 unsigned long flags;
184
185 local_irq_save(flags);
186 setup_irq(0, &vmi_timer_irq);
187#ifdef CONFIG_X86_LOCAL_APIC
188 set_intr_gate(LOCAL_TIMER_VECTOR, apic_vmi_timer_interrupt);
189#endif
190
191 no_sync_cmos_clock = 1;
192
193 vmi_get_wallclock_ts(&xtime);
194 set_normalized_timespec(&wall_to_monotonic,
195 -xtime.tv_sec, -xtime.tv_nsec);
196
197 real_cycles_accounted_system = read_real_cycles();
198 update_xtime_from_wallclock();
199 per_cpu(process_times_cycles_accounted_cpu, 0) = read_available_cycles();
200
201 cycles_per_sec = vmi_timer_ops.get_cycle_frequency();
202
203 cycles_per_jiffy = cycles_per_sec;
204 (void)do_div(cycles_per_jiffy, HZ);
205 cycles_per_alarm = cycles_per_sec;
206 (void)do_div(cycles_per_alarm, alarm_hz);
207 cycles_per_msec = cycles_per_sec;
208 (void)do_div(cycles_per_msec, 1000);
209 cpu_khz = cycles_per_msec;
210
211 printk(KERN_WARNING "VMI timer cycles/sec = %llu ; cycles/jiffy = %llu ;"
212 "cycles/alarm = %llu\n", cycles_per_sec, cycles_per_jiffy,
213 cycles_per_alarm);
214
215 clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
216 clocksource_vmi.shift);
217 if (clocksource_register(&clocksource_vmi))
218 printk(KERN_WARNING "Error registering VMITIME clocksource.");
219
220 /* Disable PIT. */
221 outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
222
223 /* schedule the alarm. do this in phase with process_times_cycles_accounted_cpu
224 * reduce the latency calling update_process_times. */
225 vmi_timer_ops.set_alarm(
226 VMI_ALARM_WIRED_IRQ0 | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
227 per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
228 cycles_per_alarm);
229
230 local_irq_restore(flags);
231}
232
233#ifdef CONFIG_X86_LOCAL_APIC
234
235void __init vmi_timer_setup_boot_alarm(void)
236{
237 local_irq_disable();
238
239 /* Route the interrupt to the correct vector. */
240 apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
241
242 /* Cancel the IRQ0 wired alarm, and setup the LVTT alarm. */
243 vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
244 vmi_timer_ops.set_alarm(
245 VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
246 per_cpu(process_times_cycles_accounted_cpu, 0) + cycles_per_alarm,
247 cycles_per_alarm);
248 local_irq_enable();
249}
250
251/* Initialize the time accounting variables for an AP on an SMP system.
252 * Also, set the local alarm for the AP. */
253void __init vmi_timer_setup_secondary_alarm(void)
254{
255 int cpu = smp_processor_id();
256
257 /* Route the interrupt to the correct vector. */
258 apic_write_around(APIC_LVTT, LOCAL_TIMER_VECTOR);
259
260 per_cpu(process_times_cycles_accounted_cpu, cpu) = read_available_cycles();
261
262 vmi_timer_ops.set_alarm(
263 VMI_ALARM_WIRED_LVTT | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
264 per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
265 cycles_per_alarm);
266}
267
268#endif
269
270/* Update system wide (real) time accounting (e.g. jiffies, xtime). */
271static void vmi_account_real_cycles(unsigned long long cur_real_cycles)
272{
273 long long cycles_not_accounted;
274
275 write_seqlock(&xtime_lock);
276
277 cycles_not_accounted = cur_real_cycles - real_cycles_accounted_system;
278 while (cycles_not_accounted >= cycles_per_jiffy) {
279 /* systems wide jiffies and wallclock. */
280 do_timer(1);
281
282 cycles_not_accounted -= cycles_per_jiffy;
283 real_cycles_accounted_system += cycles_per_jiffy;
284 }
285
286 if (vmi_timer_ops.wallclock_updated())
287 update_xtime_from_wallclock();
288
289 write_sequnlock(&xtime_lock);
290}
291
292/* Update per-cpu process times. */
293static void vmi_account_process_times_cycles(struct pt_regs *regs, int cpu,
294 unsigned long long cur_process_times_cycles)
295{
296 long long cycles_not_accounted;
297 cycles_not_accounted = cur_process_times_cycles -
298 per_cpu(process_times_cycles_accounted_cpu, cpu);
299
300 while (cycles_not_accounted >= cycles_per_jiffy) {
301 /* Account time to the current process. This includes
302 * calling into the scheduler to decrement the timeslice
303 * and possibly reschedule.*/
304 update_process_times(user_mode(regs));
305 /* XXX handle /proc/profile multiplier. */
306 profile_tick(CPU_PROFILING);
307
308 cycles_not_accounted -= cycles_per_jiffy;
309 per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
310 }
311}
312
313#ifdef CONFIG_NO_IDLE_HZ
314/* Update per-cpu idle times. Used when a no-hz halt is ended. */
315static void vmi_account_no_hz_idle_cycles(int cpu,
316 unsigned long long cur_process_times_cycles)
317{
318 long long cycles_not_accounted;
319 unsigned long no_idle_hz_jiffies = 0;
320
321 cycles_not_accounted = cur_process_times_cycles -
322 per_cpu(process_times_cycles_accounted_cpu, cpu);
323
324 while (cycles_not_accounted >= cycles_per_jiffy) {
325 no_idle_hz_jiffies++;
326 cycles_not_accounted -= cycles_per_jiffy;
327 per_cpu(process_times_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
328 }
329 /* Account time to the idle process. */
330 account_steal_time(idle_task(cpu), jiffies_to_cputime(no_idle_hz_jiffies));
331}
332#endif
333
334/* Update per-cpu stolen time. */
335static void vmi_account_stolen_cycles(int cpu,
336 unsigned long long cur_real_cycles,
337 unsigned long long cur_avail_cycles)
338{
339 long long stolen_cycles_not_accounted;
340 unsigned long stolen_jiffies = 0;
341
342 if (cur_real_cycles < cur_avail_cycles)
343 return;
344
345 stolen_cycles_not_accounted = cur_real_cycles - cur_avail_cycles -
346 per_cpu(stolen_cycles_accounted_cpu, cpu);
347
348 while (stolen_cycles_not_accounted >= cycles_per_jiffy) {
349 stolen_jiffies++;
350 stolen_cycles_not_accounted -= cycles_per_jiffy;
351 per_cpu(stolen_cycles_accounted_cpu, cpu) += cycles_per_jiffy;
352 }
353 /* HACK: pass NULL to force time onto cpustat->steal. */
354 account_steal_time(NULL, jiffies_to_cputime(stolen_jiffies));
355}
356
/*
 * Common body of the IRQ0 handler (UP without local APIC) and the
 * local-APIC LVTT handler (UP with local APIC, or SMP).  Samples both
 * cycle counters once and feeds them to the three accounting steps.
 */
static void vmi_local_timer_interrupt(int cpu)
{
	const unsigned long long real_now = read_real_cycles();
	const unsigned long long avail_now = read_available_cycles();

	/* System wide (real) time state: xtime, jiffies. */
	vmi_account_real_cycles(real_now);
	/* Per-cpu process times. */
	vmi_account_process_times_cycles(get_irq_regs(), cpu, avail_now);
	/* Time stolen from this cpu by the hypervisor. */
	vmi_account_stolen_cycles(cpu, real_now, avail_now);
}
372
373#ifdef CONFIG_NO_IDLE_HZ
374
375/* Must be called only from idle loop, with interrupts disabled. */
376int vmi_stop_hz_timer(void)
377{
378 /* Note that cpu_set, cpu_clear are (SMP safe) atomic on x86. */
379
380 unsigned long seq, next;
381 unsigned long long real_cycles_expiry;
382 int cpu = smp_processor_id();
383 int idle;
384
385 BUG_ON(!irqs_disabled());
386 if (sysctl_hz_timer != 0)
387 return 0;
388
389 cpu_set(cpu, nohz_cpu_mask);
390 smp_mb();
391 if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
392 (next = next_timer_interrupt(), time_before_eq(next, jiffies))) {
393 cpu_clear(cpu, nohz_cpu_mask);
394 next = jiffies;
395 idle = 0;
396 } else
397 idle = 1;
398
399 /* Convert jiffies to the real cycle counter. */
400 do {
401 seq = read_seqbegin(&xtime_lock);
402 real_cycles_expiry = real_cycles_accounted_system +
403 (long)(next - jiffies) * cycles_per_jiffy;
404 } while (read_seqretry(&xtime_lock, seq));
405
406 /* This cpu is going idle. Disable the periodic alarm. */
407 if (idle) {
408 vmi_timer_ops.cancel_alarm(VMI_CYCLES_AVAILABLE);
409 per_cpu(idle_start_jiffies, cpu) = jiffies;
410 }
411
412 /* Set the real time alarm to expire at the next event. */
413 vmi_timer_ops.set_alarm(
414 VMI_ALARM_WIRING | VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL,
415 real_cycles_expiry, 0);
416
417 return idle;
418}
419
420static void vmi_reenable_hz_timer(int cpu)
421{
422 /* For /proc/vmi/info idle_hz stat. */
423 per_cpu(vmi_idle_no_hz_jiffies, cpu) += jiffies - per_cpu(idle_start_jiffies, cpu);
424 per_cpu(vmi_idle_no_hz_irqs, cpu)++;
425
426 /* Don't bother explicitly cancelling the one-shot alarm -- at
427 * worse we will receive a spurious timer interrupt. */
428 vmi_timer_ops.set_alarm(
429 VMI_ALARM_WIRING | VMI_ALARM_IS_PERIODIC | VMI_CYCLES_AVAILABLE,
430 per_cpu(process_times_cycles_accounted_cpu, cpu) + cycles_per_alarm,
431 cycles_per_alarm);
432 /* Indicate this cpu is no longer nohz idle. */
433 cpu_clear(cpu, nohz_cpu_mask);
434}
435
/*
 * Called from interrupt handlers when the (local) HZ timer is disabled.
 * Accounts for the time spent with the tick off, then turns the periodic
 * tick back on.  Interrupts must be disabled.
 */
void vmi_account_time_restart_hz_timer(void)
{
	unsigned long long real_now, avail_now;
	const int cpu = smp_processor_id();

	BUG_ON(!irqs_disabled());

	/* Sample both counters once for all accounting below. */
	real_now = read_real_cycles();
	avail_now = read_available_cycles();

	/* System wide (real) time state: xtime, jiffies. */
	vmi_account_real_cycles(real_now);
	/* Per-cpu idle times. */
	vmi_account_no_hz_idle_cycles(cpu, avail_now);
	/* Time stolen from this cpu by the hypervisor. */
	vmi_account_stolen_cycles(cpu, real_now, avail_now);

	/* Reenable the hz timer. */
	vmi_reenable_hz_timer(cpu);
}
455
456#endif /* CONFIG_NO_IDLE_HZ */
457
458/* UP (and no local-APIC) VMI-timer alarm interrupt handler.
459 * Handler for IRQ0. Not used when SMP or X86_LOCAL_APIC after
460 * APIC setup and setup_boot_vmi_alarm() is called. */
461static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
462{
463 vmi_local_timer_interrupt(smp_processor_id());
464 return IRQ_HANDLED;
465}
466
467#ifdef CONFIG_X86_LOCAL_APIC
468
469/* SMP VMI-timer alarm interrupt handler. Handler for LVTT vector.
470 * Also used in UP when CONFIG_X86_LOCAL_APIC.
471 * The wrapper code is from arch/i386/kernel/apic.c#smp_apic_timer_interrupt. */
472void smp_apic_vmi_timer_interrupt(struct pt_regs *regs)
473{
474 struct pt_regs *old_regs = set_irq_regs(regs);
475 int cpu = smp_processor_id();
476
477 /*
478 * the NMI deadlock-detector uses this.
479 */
480 per_cpu(irq_stat,cpu).apic_timer_irqs++;
481
482 /*
483 * NOTE! We'd better ACK the irq immediately,
484 * because timer handling can be slow.
485 */
486 ack_APIC_irq();
487
488 /*
489 * update_process_times() expects us to have done irq_enter().
490 * Besides, if we don't timer interrupts ignore the global
491 * interrupt lock, which is the WrongThing (tm) to do.
492 */
493 irq_enter();
494 vmi_local_timer_interrupt(cpu);
495 irq_exit();
496 set_irq_regs(old_regs);
497}
498
499#endif /* CONFIG_X86_LOCAL_APIC */
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 5038a73d554e..ca51610955df 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -37,9 +37,14 @@ SECTIONS
37{ 37{
38 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; 38 . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
39 phys_startup_32 = startup_32 - LOAD_OFFSET; 39 phys_startup_32 = startup_32 - LOAD_OFFSET;
40
41 .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
42 _text = .; /* Text and read-only data */
43 *(.text.head)
44 } :text = 0x9090
45
40 /* read-only */ 46 /* read-only */
41 .text : AT(ADDR(.text) - LOAD_OFFSET) { 47 .text : AT(ADDR(.text) - LOAD_OFFSET) {
42 _text = .; /* Text and read-only data */
43 *(.text) 48 *(.text)
44 SCHED_TEXT 49 SCHED_TEXT
45 LOCK_TEXT 50 LOCK_TEXT