Diffstat (limited to 'arch')
-rw-r--r--  arch/mips/au1000/Kconfig          |   1
-rw-r--r--  arch/mips/au1000/common/irq.c     | 251
-rw-r--r--  arch/mips/au1000/common/power.c   |   7
-rw-r--r--  arch/mips/au1000/pb1200/irqmap.c  |   2
-rw-r--r--  arch/mips/configs/mtx1_defconfig  |   2
-rw-r--r--  arch/mips/kernel/head.S           |   4
-rw-r--r--  arch/mips/kernel/time.c           |  47
-rw-r--r--  arch/mips/kernel/traps.c          | 164
-rw-r--r--  arch/mips/sgi-ip22/ip22-time.c    |   9
-rw-r--r--  arch/mips/sibyte/bcm1480/time.c   |   2
-rw-r--r--  arch/mips/sibyte/sb1250/time.c    |   8
-rw-r--r--  arch/x86/kernel/alternative.c     |   4
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c  |  14
-rw-r--r--  arch/x86/kernel/entry_32.S        |   2
-rw-r--r--  arch/x86/kernel/paravirt_32.c     | 224
-rw-r--r--  arch/x86/kernel/vmi_32.c          | 201
-rw-r--r--  arch/x86/mm/init_32.c             |  22
-rw-r--r--  arch/x86/xen/enlighten.c          | 232
-rw-r--r--  arch/x86/xen/mmu.c                | 144
-rw-r--r--  arch/x86/xen/multicalls.c         |  52
-rw-r--r--  arch/x86/xen/multicalls.h         |   5
-rw-r--r--  arch/x86/xen/smp.c                |  14
-rw-r--r--  arch/x86/xen/time.c               |   6
-rw-r--r--  arch/x86/xen/xen-ops.h            |  10
24 files changed, 876 insertions, 551 deletions
diff --git a/arch/mips/au1000/Kconfig b/arch/mips/au1000/Kconfig
index 29c95d97217d..a23d4154da01 100644
--- a/arch/mips/au1000/Kconfig
+++ b/arch/mips/au1000/Kconfig
@@ -137,6 +137,7 @@ config SOC_AU1200
137config SOC_AU1X00 137config SOC_AU1X00
138 bool 138 bool
139 select 64BIT_PHYS_ADDR 139 select 64BIT_PHYS_ADDR
140 select IRQ_CPU
140 select SYS_HAS_CPU_MIPS32_R1 141 select SYS_HAS_CPU_MIPS32_R1
141 select SYS_SUPPORTS_32BIT_KERNEL 142 select SYS_SUPPORTS_32BIT_KERNEL
142 select SYS_SUPPORTS_APM_EMULATION 143 select SYS_SUPPORTS_APM_EMULATION
diff --git a/arch/mips/au1000/common/irq.c b/arch/mips/au1000/common/irq.c
index c00f308fd505..59e932a928d2 100644
--- a/arch/mips/au1000/common/irq.c
+++ b/arch/mips/au1000/common/irq.c
@@ -1,11 +1,10 @@
1/* 1/*
2 * BRIEF MODULE DESCRIPTION
3 * Au1000 interrupt routines.
4 *
5 * Copyright 2001 MontaVista Software Inc. 2 * Copyright 2001 MontaVista Software Inc.
6 * Author: MontaVista Software, Inc. 3 * Author: MontaVista Software, Inc.
7 * ppopov@mvista.com or source@mvista.com 4 * ppopov@mvista.com or source@mvista.com
8 * 5 *
6 * Copyright (C) 2007 Ralf Baechle (ralf@linux-mips.org)
7 *
9 * This program is free software; you can redistribute it and/or modify it 8 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU General Public License as published by the 9 * under the terms of the GNU General Public License as published by the
11 * Free Software Foundation; either version 2 of the License, or (at your 10 * Free Software Foundation; either version 2 of the License, or (at your
@@ -32,6 +31,7 @@
32#include <linux/interrupt.h> 31#include <linux/interrupt.h>
33#include <linux/irq.h> 32#include <linux/irq.h>
34 33
34#include <asm/irq_cpu.h>
35#include <asm/mipsregs.h> 35#include <asm/mipsregs.h>
36#include <asm/mach-au1x00/au1000.h> 36#include <asm/mach-au1x00/au1000.h>
37#ifdef CONFIG_MIPS_PB1000 37#ifdef CONFIG_MIPS_PB1000
@@ -44,7 +44,7 @@
44#define EXT_INTC1_REQ1 5 /* IP 5 */ 44#define EXT_INTC1_REQ1 5 /* IP 5 */
45#define MIPS_TIMER_IP 7 /* IP 7 */ 45#define MIPS_TIMER_IP 7 /* IP 7 */
46 46
47void (*board_init_irq)(void); 47void (*board_init_irq)(void) __initdata = NULL;
48 48
49static DEFINE_SPINLOCK(irq_lock); 49static DEFINE_SPINLOCK(irq_lock);
50 50
@@ -134,12 +134,14 @@ void restore_au1xxx_intctl(void)
134 134
135inline void local_enable_irq(unsigned int irq_nr) 135inline void local_enable_irq(unsigned int irq_nr)
136{ 136{
137 if (irq_nr > AU1000_LAST_INTC0_INT) { 137 unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE;
138 au_writel(1 << (irq_nr - 32), IC1_MASKSET); 138
139 au_writel(1 << (irq_nr - 32), IC1_WAKESET); 139 if (bit >= 32) {
140 au_writel(1 << (bit - 32), IC1_MASKSET);
141 au_writel(1 << (bit - 32), IC1_WAKESET);
140 } else { 142 } else {
141 au_writel(1 << irq_nr, IC0_MASKSET); 143 au_writel(1 << bit, IC0_MASKSET);
142 au_writel(1 << irq_nr, IC0_WAKESET); 144 au_writel(1 << bit, IC0_WAKESET);
143 } 145 }
144 au_sync(); 146 au_sync();
145} 147}
@@ -147,12 +149,14 @@ inline void local_enable_irq(unsigned int irq_nr)
147 149
148inline void local_disable_irq(unsigned int irq_nr) 150inline void local_disable_irq(unsigned int irq_nr)
149{ 151{
150 if (irq_nr > AU1000_LAST_INTC0_INT) { 152 unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE;
151 au_writel(1 << (irq_nr - 32), IC1_MASKCLR); 153
152 au_writel(1 << (irq_nr - 32), IC1_WAKECLR); 154 if (bit >= 32) {
155 au_writel(1 << (bit - 32), IC1_MASKCLR);
156 au_writel(1 << (bit - 32), IC1_WAKECLR);
153 } else { 157 } else {
154 au_writel(1 << irq_nr, IC0_MASKCLR); 158 au_writel(1 << bit, IC0_MASKCLR);
155 au_writel(1 << irq_nr, IC0_WAKECLR); 159 au_writel(1 << bit, IC0_WAKECLR);
156 } 160 }
157 au_sync(); 161 au_sync();
158} 162}
@@ -160,12 +164,14 @@ inline void local_disable_irq(unsigned int irq_nr)
160 164
161static inline void mask_and_ack_rise_edge_irq(unsigned int irq_nr) 165static inline void mask_and_ack_rise_edge_irq(unsigned int irq_nr)
162{ 166{
163 if (irq_nr > AU1000_LAST_INTC0_INT) { 167 unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE;
164 au_writel(1 << (irq_nr - 32), IC1_RISINGCLR); 168
165 au_writel(1 << (irq_nr - 32), IC1_MASKCLR); 169 if (bit >= 32) {
170 au_writel(1 << (bit - 32), IC1_RISINGCLR);
171 au_writel(1 << (bit - 32), IC1_MASKCLR);
166 } else { 172 } else {
167 au_writel(1 << irq_nr, IC0_RISINGCLR); 173 au_writel(1 << bit, IC0_RISINGCLR);
168 au_writel(1 << irq_nr, IC0_MASKCLR); 174 au_writel(1 << bit, IC0_MASKCLR);
169 } 175 }
170 au_sync(); 176 au_sync();
171} 177}
@@ -173,12 +179,14 @@ static inline void mask_and_ack_rise_edge_irq(unsigned int irq_nr)
173 179
174static inline void mask_and_ack_fall_edge_irq(unsigned int irq_nr) 180static inline void mask_and_ack_fall_edge_irq(unsigned int irq_nr)
175{ 181{
176 if (irq_nr > AU1000_LAST_INTC0_INT) { 182 unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE;
177 au_writel(1 << (irq_nr - 32), IC1_FALLINGCLR); 183
178 au_writel(1 << (irq_nr - 32), IC1_MASKCLR); 184 if (bit >= 32) {
185 au_writel(1 << (bit - 32), IC1_FALLINGCLR);
186 au_writel(1 << (bit - 32), IC1_MASKCLR);
179 } else { 187 } else {
180 au_writel(1 << irq_nr, IC0_FALLINGCLR); 188 au_writel(1 << bit, IC0_FALLINGCLR);
181 au_writel(1 << irq_nr, IC0_MASKCLR); 189 au_writel(1 << bit, IC0_MASKCLR);
182 } 190 }
183 au_sync(); 191 au_sync();
184} 192}
@@ -186,17 +194,20 @@ static inline void mask_and_ack_fall_edge_irq(unsigned int irq_nr)
186 194
187static inline void mask_and_ack_either_edge_irq(unsigned int irq_nr) 195static inline void mask_and_ack_either_edge_irq(unsigned int irq_nr)
188{ 196{
189 /* This may assume that we don't get interrupts from 197 unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE;
198
199 /*
200 * This may assume that we don't get interrupts from
190 * both edges at once, or if we do, that we don't care. 201 * both edges at once, or if we do, that we don't care.
191 */ 202 */
192 if (irq_nr > AU1000_LAST_INTC0_INT) { 203 if (bit >= 32) {
193 au_writel(1 << (irq_nr - 32), IC1_FALLINGCLR); 204 au_writel(1 << (bit - 32), IC1_FALLINGCLR);
194 au_writel(1 << (irq_nr - 32), IC1_RISINGCLR); 205 au_writel(1 << (bit - 32), IC1_RISINGCLR);
195 au_writel(1 << (irq_nr - 32), IC1_MASKCLR); 206 au_writel(1 << (bit - 32), IC1_MASKCLR);
196 } else { 207 } else {
197 au_writel(1 << irq_nr, IC0_FALLINGCLR); 208 au_writel(1 << bit, IC0_FALLINGCLR);
198 au_writel(1 << irq_nr, IC0_RISINGCLR); 209 au_writel(1 << bit, IC0_RISINGCLR);
199 au_writel(1 << irq_nr, IC0_MASKCLR); 210 au_writel(1 << bit, IC0_MASKCLR);
200 } 211 }
201 au_sync(); 212 au_sync();
202} 213}
@@ -213,10 +224,8 @@ static inline void mask_and_ack_level_irq(unsigned int irq_nr)
213 au_sync(); 224 au_sync();
214 } 225 }
215#endif 226#endif
216 return;
217} 227}
218 228
219
220static void end_irq(unsigned int irq_nr) 229static void end_irq(unsigned int irq_nr)
221{ 230{
222 if (!(irq_desc[irq_nr].status & (IRQ_DISABLED | IRQ_INPROGRESS))) 231 if (!(irq_desc[irq_nr].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
@@ -341,114 +350,118 @@ void startup_match20_interrupt(irq_handler_t handler)
341} 350}
342#endif 351#endif
343 352
344static void setup_local_irq(unsigned int irq_nr, int type, int int_req) 353static void __init setup_local_irq(unsigned int irq_nr, int type, int int_req)
345{ 354{
346 if (irq_nr > AU1000_MAX_INTR) return; 355 unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE;
356
357 if (irq_nr > AU1000_MAX_INTR)
358 return;
359
347 /* Config2[n], Config1[n], Config0[n] */ 360 /* Config2[n], Config1[n], Config0[n] */
348 if (irq_nr > AU1000_LAST_INTC0_INT) { 361 if (bit >= 32) {
349 switch (type) { 362 switch (type) {
350 case INTC_INT_RISE_EDGE: /* 0:0:1 */ 363 case INTC_INT_RISE_EDGE: /* 0:0:1 */
351 au_writel(1 << (irq_nr - 32), IC1_CFG2CLR); 364 au_writel(1 << (bit - 32), IC1_CFG2CLR);
352 au_writel(1 << (irq_nr - 32), IC1_CFG1CLR); 365 au_writel(1 << (bit - 32), IC1_CFG1CLR);
353 au_writel(1 << (irq_nr - 32), IC1_CFG0SET); 366 au_writel(1 << (bit - 32), IC1_CFG0SET);
354 set_irq_chip(irq_nr, &rise_edge_irq_type); 367 set_irq_chip(irq_nr, &rise_edge_irq_type);
355 break; 368 break;
356 case INTC_INT_FALL_EDGE: /* 0:1:0 */ 369 case INTC_INT_FALL_EDGE: /* 0:1:0 */
357 au_writel(1 << (irq_nr - 32), IC1_CFG2CLR); 370 au_writel(1 << (bit - 32), IC1_CFG2CLR);
358 au_writel(1 << (irq_nr - 32), IC1_CFG1SET); 371 au_writel(1 << (bit - 32), IC1_CFG1SET);
359 au_writel(1 << (irq_nr - 32), IC1_CFG0CLR); 372 au_writel(1 << (bit - 32), IC1_CFG0CLR);
360 set_irq_chip(irq_nr, &fall_edge_irq_type); 373 set_irq_chip(irq_nr, &fall_edge_irq_type);
361 break; 374 break;
362 case INTC_INT_RISE_AND_FALL_EDGE: /* 0:1:1 */ 375 case INTC_INT_RISE_AND_FALL_EDGE: /* 0:1:1 */
363 au_writel(1 << (irq_nr - 32), IC1_CFG2CLR); 376 au_writel(1 << (bit - 32), IC1_CFG2CLR);
364 au_writel(1 << (irq_nr - 32), IC1_CFG1SET); 377 au_writel(1 << (bit - 32), IC1_CFG1SET);
365 au_writel(1 << (irq_nr - 32), IC1_CFG0SET); 378 au_writel(1 << (bit - 32), IC1_CFG0SET);
366 set_irq_chip(irq_nr, &either_edge_irq_type); 379 set_irq_chip(irq_nr, &either_edge_irq_type);
367 break; 380 break;
368 case INTC_INT_HIGH_LEVEL: /* 1:0:1 */ 381 case INTC_INT_HIGH_LEVEL: /* 1:0:1 */
369 au_writel(1 << (irq_nr - 32), IC1_CFG2SET); 382 au_writel(1 << (bit - 32), IC1_CFG2SET);
370 au_writel(1 << (irq_nr - 32), IC1_CFG1CLR); 383 au_writel(1 << (bit - 32), IC1_CFG1CLR);
371 au_writel(1 << (irq_nr - 32), IC1_CFG0SET); 384 au_writel(1 << (bit - 32), IC1_CFG0SET);
372 set_irq_chip(irq_nr, &level_irq_type); 385 set_irq_chip(irq_nr, &level_irq_type);
373 break; 386 break;
374 case INTC_INT_LOW_LEVEL: /* 1:1:0 */ 387 case INTC_INT_LOW_LEVEL: /* 1:1:0 */
375 au_writel(1 << (irq_nr - 32), IC1_CFG2SET); 388 au_writel(1 << (bit - 32), IC1_CFG2SET);
376 au_writel(1 << (irq_nr - 32), IC1_CFG1SET); 389 au_writel(1 << (bit - 32), IC1_CFG1SET);
377 au_writel(1 << (irq_nr - 32), IC1_CFG0CLR); 390 au_writel(1 << (bit - 32), IC1_CFG0CLR);
378 set_irq_chip(irq_nr, &level_irq_type); 391 set_irq_chip(irq_nr, &level_irq_type);
379 break; 392 break;
380 case INTC_INT_DISABLED: /* 0:0:0 */ 393 case INTC_INT_DISABLED: /* 0:0:0 */
381 au_writel(1 << (irq_nr - 32), IC1_CFG0CLR); 394 au_writel(1 << (bit - 32), IC1_CFG0CLR);
382 au_writel(1 << (irq_nr - 32), IC1_CFG1CLR); 395 au_writel(1 << (bit - 32), IC1_CFG1CLR);
383 au_writel(1 << (irq_nr - 32), IC1_CFG2CLR); 396 au_writel(1 << (bit - 32), IC1_CFG2CLR);
384 break; 397 break;
385 default: /* disable the interrupt */ 398 default: /* disable the interrupt */
386 printk(KERN_WARNING "unexpected int type %d (irq %d)\n", 399 printk(KERN_WARNING "unexpected int type %d (irq %d)\n",
387 type, irq_nr); 400 type, irq_nr);
388 au_writel(1 << (irq_nr - 32), IC1_CFG0CLR); 401 au_writel(1 << (bit - 32), IC1_CFG0CLR);
389 au_writel(1 << (irq_nr - 32), IC1_CFG1CLR); 402 au_writel(1 << (bit - 32), IC1_CFG1CLR);
390 au_writel(1 << (irq_nr - 32), IC1_CFG2CLR); 403 au_writel(1 << (bit - 32), IC1_CFG2CLR);
391 return; 404 return;
392 } 405 }
393 if (int_req) /* assign to interrupt request 1 */ 406 if (int_req) /* assign to interrupt request 1 */
394 au_writel(1 << (irq_nr - 32), IC1_ASSIGNCLR); 407 au_writel(1 << (bit - 32), IC1_ASSIGNCLR);
395 else /* assign to interrupt request 0 */ 408 else /* assign to interrupt request 0 */
396 au_writel(1 << (irq_nr - 32), IC1_ASSIGNSET); 409 au_writel(1 << (bit - 32), IC1_ASSIGNSET);
397 au_writel(1 << (irq_nr - 32), IC1_SRCSET); 410 au_writel(1 << (bit - 32), IC1_SRCSET);
398 au_writel(1 << (irq_nr - 32), IC1_MASKCLR); 411 au_writel(1 << (bit - 32), IC1_MASKCLR);
399 au_writel(1 << (irq_nr - 32), IC1_WAKECLR); 412 au_writel(1 << (bit - 32), IC1_WAKECLR);
400 } else { 413 } else {
401 switch (type) { 414 switch (type) {
402 case INTC_INT_RISE_EDGE: /* 0:0:1 */ 415 case INTC_INT_RISE_EDGE: /* 0:0:1 */
403 au_writel(1 << irq_nr, IC0_CFG2CLR); 416 au_writel(1 << bit, IC0_CFG2CLR);
404 au_writel(1 << irq_nr, IC0_CFG1CLR); 417 au_writel(1 << bit, IC0_CFG1CLR);
405 au_writel(1 << irq_nr, IC0_CFG0SET); 418 au_writel(1 << bit, IC0_CFG0SET);
406 set_irq_chip(irq_nr, &rise_edge_irq_type); 419 set_irq_chip(irq_nr, &rise_edge_irq_type);
407 break; 420 break;
408 case INTC_INT_FALL_EDGE: /* 0:1:0 */ 421 case INTC_INT_FALL_EDGE: /* 0:1:0 */
409 au_writel(1 << irq_nr, IC0_CFG2CLR); 422 au_writel(1 << bit, IC0_CFG2CLR);
410 au_writel(1 << irq_nr, IC0_CFG1SET); 423 au_writel(1 << bit, IC0_CFG1SET);
411 au_writel(1 << irq_nr, IC0_CFG0CLR); 424 au_writel(1 << bit, IC0_CFG0CLR);
412 set_irq_chip(irq_nr, &fall_edge_irq_type); 425 set_irq_chip(irq_nr, &fall_edge_irq_type);
413 break; 426 break;
414 case INTC_INT_RISE_AND_FALL_EDGE: /* 0:1:1 */ 427 case INTC_INT_RISE_AND_FALL_EDGE: /* 0:1:1 */
415 au_writel(1 << irq_nr, IC0_CFG2CLR); 428 au_writel(1 << bit, IC0_CFG2CLR);
416 au_writel(1 << irq_nr, IC0_CFG1SET); 429 au_writel(1 << bit, IC0_CFG1SET);
417 au_writel(1 << irq_nr, IC0_CFG0SET); 430 au_writel(1 << bit, IC0_CFG0SET);
418 set_irq_chip(irq_nr, &either_edge_irq_type); 431 set_irq_chip(irq_nr, &either_edge_irq_type);
419 break; 432 break;
420 case INTC_INT_HIGH_LEVEL: /* 1:0:1 */ 433 case INTC_INT_HIGH_LEVEL: /* 1:0:1 */
421 au_writel(1 << irq_nr, IC0_CFG2SET); 434 au_writel(1 << bit, IC0_CFG2SET);
422 au_writel(1 << irq_nr, IC0_CFG1CLR); 435 au_writel(1 << bit, IC0_CFG1CLR);
423 au_writel(1 << irq_nr, IC0_CFG0SET); 436 au_writel(1 << bit, IC0_CFG0SET);
424 set_irq_chip(irq_nr, &level_irq_type); 437 set_irq_chip(irq_nr, &level_irq_type);
425 break; 438 break;
426 case INTC_INT_LOW_LEVEL: /* 1:1:0 */ 439 case INTC_INT_LOW_LEVEL: /* 1:1:0 */
427 au_writel(1 << irq_nr, IC0_CFG2SET); 440 au_writel(1 << bit, IC0_CFG2SET);
428 au_writel(1 << irq_nr, IC0_CFG1SET); 441 au_writel(1 << bit, IC0_CFG1SET);
429 au_writel(1 << irq_nr, IC0_CFG0CLR); 442 au_writel(1 << bit, IC0_CFG0CLR);
430 set_irq_chip(irq_nr, &level_irq_type); 443 set_irq_chip(irq_nr, &level_irq_type);
431 break; 444 break;
432 case INTC_INT_DISABLED: /* 0:0:0 */ 445 case INTC_INT_DISABLED: /* 0:0:0 */
433 au_writel(1 << irq_nr, IC0_CFG0CLR); 446 au_writel(1 << bit, IC0_CFG0CLR);
434 au_writel(1 << irq_nr, IC0_CFG1CLR); 447 au_writel(1 << bit, IC0_CFG1CLR);
435 au_writel(1 << irq_nr, IC0_CFG2CLR); 448 au_writel(1 << bit, IC0_CFG2CLR);
436 break; 449 break;
437 default: /* disable the interrupt */ 450 default: /* disable the interrupt */
438 printk(KERN_WARNING "unexpected int type %d (irq %d)\n", 451 printk(KERN_WARNING "unexpected int type %d (irq %d)\n",
439 type, irq_nr); 452 type, irq_nr);
440 au_writel(1 << irq_nr, IC0_CFG0CLR); 453 au_writel(1 << bit, IC0_CFG0CLR);
441 au_writel(1 << irq_nr, IC0_CFG1CLR); 454 au_writel(1 << bit, IC0_CFG1CLR);
442 au_writel(1 << irq_nr, IC0_CFG2CLR); 455 au_writel(1 << bit, IC0_CFG2CLR);
443 return; 456 return;
444 } 457 }
445 if (int_req) /* assign to interrupt request 1 */ 458 if (int_req) /* assign to interrupt request 1 */
446 au_writel(1 << irq_nr, IC0_ASSIGNCLR); 459 au_writel(1 << bit, IC0_ASSIGNCLR);
447 else /* assign to interrupt request 0 */ 460 else /* assign to interrupt request 0 */
448 au_writel(1 << irq_nr, IC0_ASSIGNSET); 461 au_writel(1 << bit, IC0_ASSIGNSET);
449 au_writel(1 << irq_nr, IC0_SRCSET); 462 au_writel(1 << bit, IC0_SRCSET);
450 au_writel(1 << irq_nr, IC0_MASKCLR); 463 au_writel(1 << bit, IC0_MASKCLR);
451 au_writel(1 << irq_nr, IC0_WAKECLR); 464 au_writel(1 << bit, IC0_WAKECLR);
452 } 465 }
453 au_sync(); 466 au_sync();
454} 467}
@@ -461,8 +474,8 @@ static void setup_local_irq(unsigned int irq_nr, int type, int int_req)
461 474
462static void intc0_req0_irqdispatch(void) 475static void intc0_req0_irqdispatch(void)
463{ 476{
464 int irq = 0;
465 static unsigned long intc0_req0; 477 static unsigned long intc0_req0;
478 unsigned int bit;
466 479
467 intc0_req0 |= au_readl(IC0_REQ0INT); 480 intc0_req0 |= au_readl(IC0_REQ0INT);
468 481
@@ -481,25 +494,25 @@ static void intc0_req0_irqdispatch(void)
481 return; 494 return;
482 } 495 }
483#endif 496#endif
484 irq = ffs(intc0_req0); 497 bit = ffs(intc0_req0);
485 intc0_req0 &= ~(1 << irq); 498 intc0_req0 &= ~(1 << bit);
486 do_IRQ(irq); 499 do_IRQ(MIPS_CPU_IRQ_BASE + bit);
487} 500}
488 501
489 502
490static void intc0_req1_irqdispatch(void) 503static void intc0_req1_irqdispatch(void)
491{ 504{
492 int irq = 0;
493 static unsigned long intc0_req1; 505 static unsigned long intc0_req1;
506 unsigned int bit;
494 507
495 intc0_req1 |= au_readl(IC0_REQ1INT); 508 intc0_req1 |= au_readl(IC0_REQ1INT);
496 509
497 if (!intc0_req1) 510 if (!intc0_req1)
498 return; 511 return;
499 512
500 irq = ffs(intc0_req1); 513 bit = ffs(intc0_req1);
501 intc0_req1 &= ~(1 << irq); 514 intc0_req1 &= ~(1 << bit);
502 do_IRQ(irq); 515 do_IRQ(bit);
503} 516}
504 517
505 518
@@ -509,43 +522,41 @@ static void intc0_req1_irqdispatch(void)
509 */ 522 */
510static void intc1_req0_irqdispatch(void) 523static void intc1_req0_irqdispatch(void)
511{ 524{
512 int irq = 0;
513 static unsigned long intc1_req0; 525 static unsigned long intc1_req0;
526 unsigned int bit;
514 527
515 intc1_req0 |= au_readl(IC1_REQ0INT); 528 intc1_req0 |= au_readl(IC1_REQ0INT);
516 529
517 if (!intc1_req0) 530 if (!intc1_req0)
518 return; 531 return;
519 532
520 irq = ffs(intc1_req0); 533 bit = ffs(intc1_req0);
521 intc1_req0 &= ~(1 << irq); 534 intc1_req0 &= ~(1 << bit);
522 irq += 32; 535 do_IRQ(MIPS_CPU_IRQ_BASE + 32 + bit);
523 do_IRQ(irq);
524} 536}
525 537
526 538
527static void intc1_req1_irqdispatch(void) 539static void intc1_req1_irqdispatch(void)
528{ 540{
529 int irq = 0;
530 static unsigned long intc1_req1; 541 static unsigned long intc1_req1;
542 unsigned int bit;
531 543
532 intc1_req1 |= au_readl(IC1_REQ1INT); 544 intc1_req1 |= au_readl(IC1_REQ1INT);
533 545
534 if (!intc1_req1) 546 if (!intc1_req1)
535 return; 547 return;
536 548
537 irq = ffs(intc1_req1); 549 bit = ffs(intc1_req1);
538 intc1_req1 &= ~(1 << irq); 550 intc1_req1 &= ~(1 << bit);
539 irq += 32; 551 do_IRQ(MIPS_CPU_IRQ_BASE + 32 + bit);
540 do_IRQ(irq);
541} 552}
542 553
543asmlinkage void plat_irq_dispatch(void) 554asmlinkage void plat_irq_dispatch(void)
544{ 555{
545 unsigned int pending = read_c0_status() & read_c0_cause() & ST0_IM; 556 unsigned int pending = read_c0_status() & read_c0_cause();
546 557
547 if (pending & CAUSEF_IP7) 558 if (pending & CAUSEF_IP7)
548 do_IRQ(63); 559 do_IRQ(MIPS_CPU_IRQ_BASE + 7);
549 else if (pending & CAUSEF_IP2) 560 else if (pending & CAUSEF_IP2)
550 intc0_req0_irqdispatch(); 561 intc0_req0_irqdispatch();
551 else if (pending & CAUSEF_IP3) 562 else if (pending & CAUSEF_IP3)
@@ -561,17 +572,15 @@ asmlinkage void plat_irq_dispatch(void)
561void __init arch_init_irq(void) 572void __init arch_init_irq(void)
562{ 573{
563 int i; 574 int i;
564 unsigned long cp0_status;
565 struct au1xxx_irqmap *imp; 575 struct au1xxx_irqmap *imp;
566 extern struct au1xxx_irqmap au1xxx_irq_map[]; 576 extern struct au1xxx_irqmap au1xxx_irq_map[];
567 extern struct au1xxx_irqmap au1xxx_ic0_map[]; 577 extern struct au1xxx_irqmap au1xxx_ic0_map[];
568 extern int au1xxx_nr_irqs; 578 extern int au1xxx_nr_irqs;
569 extern int au1xxx_ic0_nr_irqs; 579 extern int au1xxx_ic0_nr_irqs;
570 580
571 cp0_status = read_c0_status(); 581 /*
572 582 * Initialize interrupt controllers to a safe state.
573 /* Initialize interrupt controllers to a safe state. 583 */
574 */
575 au_writel(0xffffffff, IC0_CFG0CLR); 584 au_writel(0xffffffff, IC0_CFG0CLR);
576 au_writel(0xffffffff, IC0_CFG1CLR); 585 au_writel(0xffffffff, IC0_CFG1CLR);
577 au_writel(0xffffffff, IC0_CFG2CLR); 586 au_writel(0xffffffff, IC0_CFG2CLR);
@@ -594,16 +603,20 @@ void __init arch_init_irq(void)
594 au_writel(0xffffffff, IC1_RISINGCLR); 603 au_writel(0xffffffff, IC1_RISINGCLR);
595 au_writel(0x00000000, IC1_TESTBIT); 604 au_writel(0x00000000, IC1_TESTBIT);
596 605
597 /* Initialize IC0, which is fixed per processor. 606 mips_cpu_irq_init();
598 */ 607
608 /*
609 * Initialize IC0, which is fixed per processor.
610 */
599 imp = au1xxx_ic0_map; 611 imp = au1xxx_ic0_map;
600 for (i = 0; i < au1xxx_ic0_nr_irqs; i++) { 612 for (i = 0; i < au1xxx_ic0_nr_irqs; i++) {
601 setup_local_irq(imp->im_irq, imp->im_type, imp->im_request); 613 setup_local_irq(imp->im_irq, imp->im_type, imp->im_request);
602 imp++; 614 imp++;
603 } 615 }
604 616
605 /* Now set up the irq mapping for the board. 617 /*
606 */ 618 * Now set up the irq mapping for the board.
619 */
607 imp = au1xxx_irq_map; 620 imp = au1xxx_irq_map;
608 for (i = 0; i < au1xxx_nr_irqs; i++) { 621 for (i = 0; i < au1xxx_nr_irqs; i++) {
609 setup_local_irq(imp->im_irq, imp->im_type, imp->im_request); 622 setup_local_irq(imp->im_irq, imp->im_type, imp->im_request);
@@ -615,5 +628,5 @@ void __init arch_init_irq(void)
615 /* Board specific IRQ initialization. 628 /* Board specific IRQ initialization.
616 */ 629 */
617 if (board_init_irq) 630 if (board_init_irq)
618 (*board_init_irq)(); 631 board_init_irq();
619} 632}
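
Note on the irq.c change above: the recurring pattern is translating a Linux IRQ number into a bit position on one of the two Au1000 interrupt controllers, with bits 0-31 on IC0 and bits 32-63 on IC1. A minimal standalone sketch of that mapping follows; AU1000_INTC0_INT_BASE is assumed to be 0 here purely for illustration, and the struct/function names are stand-ins, not the kernel's.

    #include <stdio.h>

    #define AU1000_INTC0_INT_BASE  0   /* assumed 0 for this sketch only */

    struct ic_target {
        int ic;            /* 0 = IC0, 1 = IC1 */
        unsigned int mask; /* value written to the SET/CLR registers */
    };

    static struct ic_target irq_to_bit(unsigned int irq_nr)
    {
        unsigned int bit = irq_nr - AU1000_INTC0_INT_BASE;
        struct ic_target t;

        t.ic   = bit >= 32;        /* IC1 covers bits 32..63 */
        t.mask = 1u << (bit & 31); /* equals 1 << (bit - 32) on IC1 */
        return t;
    }

    int main(void)
    {
        struct ic_target t = irq_to_bit(35);

        printf("IC%d mask 0x%08x\n", t.ic, t.mask); /* IC1 mask 0x00000008 */
        return 0;
    }

Every enable/disable/ack helper in the patched file is essentially this mapping followed by write-one-to-set or write-one-to-clear register accesses and an au_sync().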
diff --git a/arch/mips/au1000/common/power.c b/arch/mips/au1000/common/power.c
index 6f57f72a7d57..54047d69b820 100644
--- a/arch/mips/au1000/common/power.c
+++ b/arch/mips/au1000/common/power.c
@@ -403,9 +403,9 @@ static int pm_do_freq(ctl_table * ctl, int write, struct file *file,
403 } 403 }
404 404
405 405
406 /* We don't want _any_ interrupts other than 406 /*
407 * match20. Otherwise our au1000_calibrate_delay() 407 * We don't want _any_ interrupts other than match20. Otherwise our
408 * calculation will be off, potentially a lot. 408 * au1000_calibrate_delay() calculation will be off, potentially a lot.
409 */ 409 */
410 intc0_mask = save_local_and_disable(0); 410 intc0_mask = save_local_and_disable(0);
411 intc1_mask = save_local_and_disable(1); 411 intc1_mask = save_local_and_disable(1);
@@ -414,6 +414,7 @@ static int pm_do_freq(ctl_table * ctl, int write, struct file *file,
414 au1000_calibrate_delay(); 414 au1000_calibrate_delay();
415 restore_local_and_enable(0, intc0_mask); 415 restore_local_and_enable(0, intc0_mask);
416 restore_local_and_enable(1, intc1_mask); 416 restore_local_and_enable(1, intc1_mask);
417
417 return retval; 418 return retval;
418} 419}
419 420
diff --git a/arch/mips/au1000/pb1200/irqmap.c b/arch/mips/au1000/pb1200/irqmap.c
index 3bee274445f5..5f48b0603796 100644
--- a/arch/mips/au1000/pb1200/irqmap.c
+++ b/arch/mips/au1000/pb1200/irqmap.c
@@ -74,7 +74,7 @@ irqreturn_t pb1200_cascade_handler( int irq, void *dev_id)
74 bcsr->int_status = bisr; 74 bcsr->int_status = bisr;
75 for( ; bisr; bisr &= (bisr-1) ) 75 for( ; bisr; bisr &= (bisr-1) )
76 { 76 {
77 extirq_nr = PB1200_INT_BEGIN + au_ffs(bisr); 77 extirq_nr = PB1200_INT_BEGIN + ffs(bisr);
78 /* Ack and dispatch IRQ */ 78 /* Ack and dispatch IRQ */
79 do_IRQ(extirq_nr); 79 do_IRQ(extirq_nr);
80 } 80 }
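
The cascade loop above uses the standard set-bit iteration idiom: ffs() returns the 1-based position of the lowest set bit, and bisr &= (bisr - 1) clears that bit. A small userspace illustration of the idiom (generic C, not Au1000-specific):

    #include <stdio.h>
    #include <strings.h>   /* ffs() in userspace; the kernel has its own */

    int main(void)
    {
        unsigned int pending = 0x29;   /* bits 0, 3 and 5 set */

        while (pending) {
            int bit = ffs(pending);    /* 1-based index of lowest set bit */
            printf("dispatching bit %d\n", bit);
            pending &= pending - 1;    /* clear the lowest set bit */
        }
        return 0;
    }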
diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig
index 0280ef389d8d..b536d7c63790 100644
--- a/arch/mips/configs/mtx1_defconfig
+++ b/arch/mips/configs/mtx1_defconfig
@@ -3021,7 +3021,7 @@ CONFIG_MAGIC_SYSRQ=y
3021# CONFIG_DEBUG_FS is not set 3021# CONFIG_DEBUG_FS is not set
3022# CONFIG_HEADERS_CHECK is not set 3022# CONFIG_HEADERS_CHECK is not set
3023# CONFIG_DEBUG_KERNEL is not set 3023# CONFIG_DEBUG_KERNEL is not set
3024# CONFIG_CROSSCOMPILE is not set 3024CONFIG_CROSSCOMPILE=y
3025CONFIG_CMDLINE="" 3025CONFIG_CMDLINE=""
3026CONFIG_SYS_SUPPORTS_KGDB=y 3026CONFIG_SYS_SUPPORTS_KGDB=y
3027 3027
diff --git a/arch/mips/kernel/head.S b/arch/mips/kernel/head.S
index e46782b0ebc8..bf164a562acb 100644
--- a/arch/mips/kernel/head.S
+++ b/arch/mips/kernel/head.S
@@ -140,7 +140,7 @@
140 140
141EXPORT(_stext) 141EXPORT(_stext)
142 142
143#ifndef CONFIG_BOOT_RAW 143#ifdef CONFIG_BOOT_RAW
144 /* 144 /*
145 * Give us a fighting chance of running if execution beings at the 145 * Give us a fighting chance of running if execution beings at the
146 * kernel load address. This is needed because this platform does 146 * kernel load address. This is needed because this platform does
@@ -149,6 +149,8 @@ EXPORT(_stext)
149 __INIT 149 __INIT
150#endif 150#endif
151 151
152 __INIT_REFOK
153
152NESTED(kernel_entry, 16, sp) # kernel entry point 154NESTED(kernel_entry, 16, sp) # kernel entry point
153 155
154 kernel_entry_setup # cpu specific setup 156 kernel_entry_setup # cpu specific setup
diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c
index 05b365167a09..e4b5e647b142 100644
--- a/arch/mips/kernel/time.c
+++ b/arch/mips/kernel/time.c
@@ -391,6 +391,50 @@ static void mips_event_handler(struct clock_event_device *dev)
391{ 391{
392} 392}
393 393
394/*
395 * FIXME: This doesn't hold for the relocated E9000 compare interrupt.
396 */
397static int c0_compare_int_pending(void)
398{
399 return (read_c0_cause() >> cp0_compare_irq) & 0x100;
400}
401
402static int c0_compare_int_usable(void)
403{
404 const unsigned int delta = 0x300000;
405 unsigned int cnt;
406
407 /*
408 * IP7 already pending? Try to clear it by acking the timer.
409 */
410 if (c0_compare_int_pending()) {
411 write_c0_compare(read_c0_compare());
412 irq_disable_hazard();
413 if (c0_compare_int_pending())
414 return 0;
415 }
416
417 cnt = read_c0_count();
418 cnt += delta;
419 write_c0_compare(cnt);
420
421 while ((long)(read_c0_count() - cnt) <= 0)
422 ; /* Wait for expiry */
423
424 if (!c0_compare_int_pending())
425 return 0;
426
427 write_c0_compare(read_c0_compare());
428 irq_disable_hazard();
429 if (c0_compare_int_pending())
430 return 0;
431
432 /*
433 * Feels like a real count / compare timer.
434 */
435 return 1;
436}
437
394void __cpuinit mips_clockevent_init(void) 438void __cpuinit mips_clockevent_init(void)
395{ 439{
396 uint64_t mips_freq = mips_hpt_frequency; 440 uint64_t mips_freq = mips_hpt_frequency;
@@ -412,6 +456,9 @@ void __cpuinit mips_clockevent_init(void)
412 return; 456 return;
413#endif 457#endif
414 458
459 if (!c0_compare_int_usable())
460 return;
461
415 cd = &per_cpu(mips_clockevent_device, cpu); 462 cd = &per_cpu(mips_clockevent_device, cpu);
416 463
417 cd->name = "MIPS"; 464 cd->name = "MIPS";
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 9c0c478d71ac..bbf01b81a4ff 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -9,9 +9,10 @@
9 * Copyright (C) 1999 Silicon Graphics, Inc. 9 * Copyright (C) 1999 Silicon Graphics, Inc.
10 * Kevin D. Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com 10 * Kevin D. Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com
11 * Copyright (C) 2000, 01 MIPS Technologies, Inc. 11 * Copyright (C) 2000, 01 MIPS Technologies, Inc.
12 * Copyright (C) 2002, 2003, 2004, 2005 Maciej W. Rozycki 12 * Copyright (C) 2002, 2003, 2004, 2005, 2007 Maciej W. Rozycki
13 */ 13 */
14#include <linux/bug.h> 14#include <linux/bug.h>
15#include <linux/compiler.h>
15#include <linux/init.h> 16#include <linux/init.h>
16#include <linux/mm.h> 17#include <linux/mm.h>
17#include <linux/module.h> 18#include <linux/module.h>
@@ -410,7 +411,7 @@ asmlinkage void do_be(struct pt_regs *regs)
410} 411}
411 412
412/* 413/*
413 * ll/sc emulation 414 * ll/sc, rdhwr, sync emulation
414 */ 415 */
415 416
416#define OPCODE 0xfc000000 417#define OPCODE 0xfc000000
@@ -419,9 +420,11 @@ asmlinkage void do_be(struct pt_regs *regs)
419#define OFFSET 0x0000ffff 420#define OFFSET 0x0000ffff
420#define LL 0xc0000000 421#define LL 0xc0000000
421#define SC 0xe0000000 422#define SC 0xe0000000
423#define SPEC0 0x00000000
422#define SPEC3 0x7c000000 424#define SPEC3 0x7c000000
423#define RD 0x0000f800 425#define RD 0x0000f800
424#define FUNC 0x0000003f 426#define FUNC 0x0000003f
427#define SYNC 0x0000000f
425#define RDHWR 0x0000003b 428#define RDHWR 0x0000003b
426 429
427/* 430/*
@@ -432,11 +435,10 @@ unsigned long ll_bit;
432 435
433static struct task_struct *ll_task = NULL; 436static struct task_struct *ll_task = NULL;
434 437
435static inline void simulate_ll(struct pt_regs *regs, unsigned int opcode) 438static inline int simulate_ll(struct pt_regs *regs, unsigned int opcode)
436{ 439{
437 unsigned long value, __user *vaddr; 440 unsigned long value, __user *vaddr;
438 long offset; 441 long offset;
439 int signal = 0;
440 442
441 /* 443 /*
442 * analyse the ll instruction that just caused a ri exception 444 * analyse the ll instruction that just caused a ri exception
@@ -451,14 +453,10 @@ static inline void simulate_ll(struct pt_regs *regs, unsigned int opcode)
451 vaddr = (unsigned long __user *) 453 vaddr = (unsigned long __user *)
452 ((unsigned long)(regs->regs[(opcode & BASE) >> 21]) + offset); 454 ((unsigned long)(regs->regs[(opcode & BASE) >> 21]) + offset);
453 455
454 if ((unsigned long)vaddr & 3) { 456 if ((unsigned long)vaddr & 3)
455 signal = SIGBUS; 457 return SIGBUS;
456 goto sig; 458 if (get_user(value, vaddr))
457 } 459 return SIGSEGV;
458 if (get_user(value, vaddr)) {
459 signal = SIGSEGV;
460 goto sig;
461 }
462 460
463 preempt_disable(); 461 preempt_disable();
464 462
@@ -471,22 +469,16 @@ static inline void simulate_ll(struct pt_regs *regs, unsigned int opcode)
471 469
472 preempt_enable(); 470 preempt_enable();
473 471
474 compute_return_epc(regs);
475
476 regs->regs[(opcode & RT) >> 16] = value; 472 regs->regs[(opcode & RT) >> 16] = value;
477 473
478 return; 474 return 0;
479
480sig:
481 force_sig(signal, current);
482} 475}
483 476
484static inline void simulate_sc(struct pt_regs *regs, unsigned int opcode) 477static inline int simulate_sc(struct pt_regs *regs, unsigned int opcode)
485{ 478{
486 unsigned long __user *vaddr; 479 unsigned long __user *vaddr;
487 unsigned long reg; 480 unsigned long reg;
488 long offset; 481 long offset;
489 int signal = 0;
490 482
491 /* 483 /*
492 * analyse the sc instruction that just caused a ri exception 484 * analyse the sc instruction that just caused a ri exception
@@ -502,34 +494,25 @@ static inline void simulate_sc(struct pt_regs *regs, unsigned int opcode)
502 ((unsigned long)(regs->regs[(opcode & BASE) >> 21]) + offset); 494 ((unsigned long)(regs->regs[(opcode & BASE) >> 21]) + offset);
503 reg = (opcode & RT) >> 16; 495 reg = (opcode & RT) >> 16;
504 496
505 if ((unsigned long)vaddr & 3) { 497 if ((unsigned long)vaddr & 3)
506 signal = SIGBUS; 498 return SIGBUS;
507 goto sig;
508 }
509 499
510 preempt_disable(); 500 preempt_disable();
511 501
512 if (ll_bit == 0 || ll_task != current) { 502 if (ll_bit == 0 || ll_task != current) {
513 compute_return_epc(regs);
514 regs->regs[reg] = 0; 503 regs->regs[reg] = 0;
515 preempt_enable(); 504 preempt_enable();
516 return; 505 return 0;
517 } 506 }
518 507
519 preempt_enable(); 508 preempt_enable();
520 509
521 if (put_user(regs->regs[reg], vaddr)) { 510 if (put_user(regs->regs[reg], vaddr))
522 signal = SIGSEGV; 511 return SIGSEGV;
523 goto sig;
524 }
525 512
526 compute_return_epc(regs);
527 regs->regs[reg] = 1; 513 regs->regs[reg] = 1;
528 514
529 return; 515 return 0;
530
531sig:
532 force_sig(signal, current);
533} 516}
534 517
535/* 518/*
@@ -539,27 +522,14 @@ sig:
539 * few processors such as NEC's VR4100 throw reserved instruction exceptions 522 * few processors such as NEC's VR4100 throw reserved instruction exceptions
540 * instead, so we're doing the emulation thing in both exception handlers. 523 * instead, so we're doing the emulation thing in both exception handlers.
541 */ 524 */
542static inline int simulate_llsc(struct pt_regs *regs) 525static int simulate_llsc(struct pt_regs *regs, unsigned int opcode)
543{ 526{
544 unsigned int opcode; 527 if ((opcode & OPCODE) == LL)
545 528 return simulate_ll(regs, opcode);
546 if (get_user(opcode, (unsigned int __user *) exception_epc(regs))) 529 if ((opcode & OPCODE) == SC)
547 goto out_sigsegv; 530 return simulate_sc(regs, opcode);
548
549 if ((opcode & OPCODE) == LL) {
550 simulate_ll(regs, opcode);
551 return 0;
552 }
553 if ((opcode & OPCODE) == SC) {
554 simulate_sc(regs, opcode);
555 return 0;
556 }
557
558 return -EFAULT; /* Strange things going on ... */
559 531
560out_sigsegv: 532 return -1; /* Must be something else ... */
561 force_sig(SIGSEGV, current);
562 return -EFAULT;
563} 533}
564 534
565/* 535/*
@@ -567,16 +537,9 @@ out_sigsegv:
567 * registers not implemented in hardware. The only current use of this 537 * registers not implemented in hardware. The only current use of this
568 * is the thread area pointer. 538 * is the thread area pointer.
569 */ 539 */
570static inline int simulate_rdhwr(struct pt_regs *regs) 540static int simulate_rdhwr(struct pt_regs *regs, unsigned int opcode)
571{ 541{
572 struct thread_info *ti = task_thread_info(current); 542 struct thread_info *ti = task_thread_info(current);
573 unsigned int opcode;
574
575 if (get_user(opcode, (unsigned int __user *) exception_epc(regs)))
576 goto out_sigsegv;
577
578 if (unlikely(compute_return_epc(regs)))
579 return -EFAULT;
580 543
581 if ((opcode & OPCODE) == SPEC3 && (opcode & FUNC) == RDHWR) { 544 if ((opcode & OPCODE) == SPEC3 && (opcode & FUNC) == RDHWR) {
582 int rd = (opcode & RD) >> 11; 545 int rd = (opcode & RD) >> 11;
@@ -586,16 +549,20 @@ static inline int simulate_rdhwr(struct pt_regs *regs)
586 regs->regs[rt] = ti->tp_value; 549 regs->regs[rt] = ti->tp_value;
587 return 0; 550 return 0;
588 default: 551 default:
589 return -EFAULT; 552 return -1;
590 } 553 }
591 } 554 }
592 555
593 /* Not ours. */ 556 /* Not ours. */
594 return -EFAULT; 557 return -1;
558}
595 559
596out_sigsegv: 560static int simulate_sync(struct pt_regs *regs, unsigned int opcode)
597 force_sig(SIGSEGV, current); 561{
598 return -EFAULT; 562 if ((opcode & OPCODE) == SPEC0 && (opcode & FUNC) == SYNC)
563 return 0;
564
565 return -1; /* Must be something else ... */
599} 566}
600 567
601asmlinkage void do_ov(struct pt_regs *regs) 568asmlinkage void do_ov(struct pt_regs *regs)
@@ -767,16 +734,35 @@ out_sigsegv:
767 734
768asmlinkage void do_ri(struct pt_regs *regs) 735asmlinkage void do_ri(struct pt_regs *regs)
769{ 736{
770 die_if_kernel("Reserved instruction in kernel code", regs); 737 unsigned int __user *epc = (unsigned int __user *)exception_epc(regs);
738 unsigned long old_epc = regs->cp0_epc;
739 unsigned int opcode = 0;
740 int status = -1;
771 741
772 if (!cpu_has_llsc) 742 die_if_kernel("Reserved instruction in kernel code", regs);
773 if (!simulate_llsc(regs))
774 return;
775 743
776 if (!simulate_rdhwr(regs)) 744 if (unlikely(compute_return_epc(regs) < 0))
777 return; 745 return;
778 746
779 force_sig(SIGILL, current); 747 if (unlikely(get_user(opcode, epc) < 0))
748 status = SIGSEGV;
749
750 if (!cpu_has_llsc && status < 0)
751 status = simulate_llsc(regs, opcode);
752
753 if (status < 0)
754 status = simulate_rdhwr(regs, opcode);
755
756 if (status < 0)
757 status = simulate_sync(regs, opcode);
758
759 if (status < 0)
760 status = SIGILL;
761
762 if (unlikely(status > 0)) {
763 regs->cp0_epc = old_epc; /* Undo skip-over. */
764 force_sig(status, current);
765 }
780} 766}
781 767
782/* 768/*
@@ -808,7 +794,11 @@ static void mt_ase_fp_affinity(void)
808 794
809asmlinkage void do_cpu(struct pt_regs *regs) 795asmlinkage void do_cpu(struct pt_regs *regs)
810{ 796{
797 unsigned int __user *epc;
798 unsigned long old_epc;
799 unsigned int opcode;
811 unsigned int cpid; 800 unsigned int cpid;
801 int status;
812 802
813 die_if_kernel("do_cpu invoked from kernel context!", regs); 803 die_if_kernel("do_cpu invoked from kernel context!", regs);
814 804
@@ -816,14 +806,32 @@ asmlinkage void do_cpu(struct pt_regs *regs)
816 806
817 switch (cpid) { 807 switch (cpid) {
818 case 0: 808 case 0:
819 if (!cpu_has_llsc) 809 epc = (unsigned int __user *)exception_epc(regs);
820 if (!simulate_llsc(regs)) 810 old_epc = regs->cp0_epc;
821 return; 811 opcode = 0;
812 status = -1;
822 813
823 if (!simulate_rdhwr(regs)) 814 if (unlikely(compute_return_epc(regs) < 0))
824 return; 815 return;
825 816
826 break; 817 if (unlikely(get_user(opcode, epc) < 0))
818 status = SIGSEGV;
819
820 if (!cpu_has_llsc && status < 0)
821 status = simulate_llsc(regs, opcode);
822
823 if (status < 0)
824 status = simulate_rdhwr(regs, opcode);
825
826 if (status < 0)
827 status = SIGILL;
828
829 if (unlikely(status > 0)) {
830 regs->cp0_epc = old_epc; /* Undo skip-over. */
831 force_sig(status, current);
832 }
833
834 return;
827 835
828 case 1: 836 case 1:
829 if (used_math()) /* Using the FPU again. */ 837 if (used_math()) /* Using the FPU again. */
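
The reworked do_ri()/do_cpu() paths above treat each simulate_*() helper as a filter: a negative return means "not my instruction, try the next emulator", zero means the instruction was emulated, and a positive value is a signal to deliver after restoring the saved EPC. A condensed, standalone sketch of that control flow; the emulator names and signal constants here are placeholders, not the kernel's:

    #include <stdio.h>

    #define SIG_ILL  4    /* placeholder signal numbers for the sketch */
    #define SIG_SEGV 11

    typedef int (*emulator_fn)(unsigned int opcode);

    static int emulate_a(unsigned int op) { return (op & 0xf0) == 0x10 ? 0 : -1; }
    static int emulate_b(unsigned int op) { return (op & 0xf0) == 0x20 ? 0 : -1; }

    static void handle_reserved_insn(unsigned int opcode, int fetch_failed)
    {
        emulator_fn emulators[] = { emulate_a, emulate_b };
        int status = fetch_failed ? SIG_SEGV : -1;
        unsigned int i;

        for (i = 0; status < 0 && i < sizeof(emulators) / sizeof(emulators[0]); i++)
            status = emulators[i](opcode);

        if (status < 0)
            status = SIG_ILL;   /* nobody claimed the instruction */

        if (status > 0)
            printf("deliver signal %d (and undo the EPC skip-over)\n", status);
    }

    int main(void)
    {
        handle_reserved_insn(0x13, 0);  /* emulated by emulate_a: no signal */
        handle_reserved_insn(0x99, 0);  /* unknown: SIG_ILL */
        handle_reserved_insn(0x13, 1);  /* fetch fault: SIG_SEGV */
        return 0;
    }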
diff --git a/arch/mips/sgi-ip22/ip22-time.c b/arch/mips/sgi-ip22/ip22-time.c
index 9b9bffd2e8fb..10e505491655 100644
--- a/arch/mips/sgi-ip22/ip22-time.c
+++ b/arch/mips/sgi-ip22/ip22-time.c
@@ -192,12 +192,3 @@ void indy_8254timer_irq(void)
192 ArcEnterInteractiveMode(); 192 ArcEnterInteractiveMode();
193 irq_exit(); 193 irq_exit();
194} 194}
195
196void __init plat_timer_setup(struct irqaction *irq)
197{
198 /* over-write the handler, we use our own way */
199 irq->handler = no_action;
200
201 /* setup irqaction */
202 setup_irq(SGI_TIMER_IRQ, irq);
203}
diff --git a/arch/mips/sibyte/bcm1480/time.c b/arch/mips/sibyte/bcm1480/time.c
index 40d7126cd5bf..5b4bfbbb5a24 100644
--- a/arch/mips/sibyte/bcm1480/time.c
+++ b/arch/mips/sibyte/bcm1480/time.c
@@ -84,7 +84,7 @@ static void sibyte_set_mode(enum clock_event_mode mode,
84 void __iomem *timer_cfg, *timer_init; 84 void __iomem *timer_cfg, *timer_init;
85 85
86 timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); 86 timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG));
87 timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); 87 timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_INIT));
88 88
89 switch (mode) { 89 switch (mode) {
90 case CLOCK_EVT_MODE_PERIODIC: 90 case CLOCK_EVT_MODE_PERIODIC:
diff --git a/arch/mips/sibyte/sb1250/time.c b/arch/mips/sibyte/sb1250/time.c
index 38199ad8fc54..fe11fed8e0d7 100644
--- a/arch/mips/sibyte/sb1250/time.c
+++ b/arch/mips/sibyte/sb1250/time.c
@@ -83,7 +83,7 @@ static void sibyte_set_mode(enum clock_event_mode mode,
83 void __iomem *timer_cfg, *timer_init; 83 void __iomem *timer_cfg, *timer_init;
84 84
85 timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); 85 timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG));
86 timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); 86 timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_INIT));
87 87
88 switch(mode) { 88 switch(mode) {
89 case CLOCK_EVT_MODE_PERIODIC: 89 case CLOCK_EVT_MODE_PERIODIC:
@@ -111,7 +111,7 @@ sibyte_next_event(unsigned long delta, struct clock_event_device *evt)
111 void __iomem *timer_cfg, *timer_init; 111 void __iomem *timer_cfg, *timer_init;
112 112
113 timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); 113 timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG));
114 timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); 114 timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_INIT));
115 115
116 __raw_writeq(0, timer_cfg); 116 __raw_writeq(0, timer_cfg);
117 __raw_writeq(delta, timer_init); 117 __raw_writeq(delta, timer_init);
@@ -155,7 +155,7 @@ static void sibyte_set_mode(enum clock_event_mode mode,
155 void __iomem *timer_cfg, *timer_init; 155 void __iomem *timer_cfg, *timer_init;
156 156
157 timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); 157 timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG));
158 timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); 158 timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_INIT));
159 159
160 switch (mode) { 160 switch (mode) {
161 case CLOCK_EVT_MODE_PERIODIC: 161 case CLOCK_EVT_MODE_PERIODIC:
@@ -183,7 +183,7 @@ sibyte_next_event(unsigned long delta, struct clock_event_device *evt)
183 void __iomem *timer_cfg, *timer_init; 183 void __iomem *timer_cfg, *timer_init;
184 184
185 timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); 185 timer_cfg = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG));
186 timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_CFG)); 186 timer_init = IOADDR(A_SCD_TIMER_REGISTER(cpu, R_SCD_TIMER_INIT));
187 187
188 __raw_writeq(0, timer_cfg); 188 __raw_writeq(0, timer_cfg);
189 __raw_writeq(delta, timer_init); 189 __raw_writeq(delta, timer_init);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a3ae8e6c8b3b..3bd2688bd443 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -390,8 +390,8 @@ void apply_paravirt(struct paravirt_patch_site *start,
390 BUG_ON(p->len > MAX_PATCH_LEN); 390 BUG_ON(p->len > MAX_PATCH_LEN);
391 /* prep the buffer with the original instructions */ 391 /* prep the buffer with the original instructions */
392 memcpy(insnbuf, p->instr, p->len); 392 memcpy(insnbuf, p->instr, p->len);
393 used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf, 393 used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
394 (unsigned long)p->instr, p->len); 394 (unsigned long)p->instr, p->len);
395 395
396 BUG_ON(used > p->len); 396 BUG_ON(used > p->len);
397 397
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 8029742c0fc1..f1b7cdda82b3 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -116,12 +116,14 @@ void foo(void)
116 116
117#ifdef CONFIG_PARAVIRT 117#ifdef CONFIG_PARAVIRT
118 BLANK(); 118 BLANK();
119 OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled); 119 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
120 OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable); 120 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
121 OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable); 121 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
122 OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit); 122 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
123 OFFSET(PARAVIRT_iret, paravirt_ops, iret); 123 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
124 OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); 124 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
125 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
126 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
125#endif 127#endif
126 128
127#ifdef CONFIG_XEN 129#ifdef CONFIG_XEN
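
The renamed OFFSET() entries above boil down to emitting offsetof() values so that assembly code (entry_32.S and the paravirt patch sites) can index into the new pv_*_ops structures by constant. A userspace illustration of the same idea, with a made-up structure name standing in for the real one:

    #include <stdio.h>
    #include <stddef.h>

    struct pv_irq_ops_example {
        void (*init_IRQ)(void);
        void (*irq_disable)(void);
        void (*irq_enable)(void);
    };

    /* The kernel turns its OFFSET() entries into a generated header of
     * #defines; printf stands in for that build machinery here. */
    #define OFFSET(sym, str, mem) \
        printf("#define %s %zu\n", #sym, offsetof(struct str, mem))

    int main(void)
    {
        OFFSET(PV_IRQ_irq_disable, pv_irq_ops_example, irq_disable);
        OFFSET(PV_IRQ_irq_enable,  pv_irq_ops_example, irq_enable);
        return 0;
    }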
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 8099fea0a72f..dc7f938e5015 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -437,7 +437,7 @@ ldt_ss:
437 * is still available to implement the setting of the high 437 * is still available to implement the setting of the high
438 * 16-bits in the INTERRUPT_RETURN paravirt-op. 438 * 16-bits in the INTERRUPT_RETURN paravirt-op.
439 */ 439 */
440 cmpl $0, paravirt_ops+PARAVIRT_enabled 440 cmpl $0, pv_info+PARAVIRT_enabled
441 jne restore_nocheck 441 jne restore_nocheck
442#endif 442#endif
443 443
diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c
index 739cfb207dd7..6a80d67c2121 100644
--- a/arch/x86/kernel/paravirt_32.c
+++ b/arch/x86/kernel/paravirt_32.c
@@ -42,32 +42,33 @@ void _paravirt_nop(void)
42static void __init default_banner(void) 42static void __init default_banner(void)
43{ 43{
44 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 44 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
45 paravirt_ops.name); 45 pv_info.name);
46} 46}
47 47
48char *memory_setup(void) 48char *memory_setup(void)
49{ 49{
50 return paravirt_ops.memory_setup(); 50 return pv_init_ops.memory_setup();
51} 51}
52 52
53/* Simple instruction patching code. */ 53/* Simple instruction patching code. */
54#define DEF_NATIVE(name, code) \ 54#define DEF_NATIVE(ops, name, code) \
55 extern const char start_##name[], end_##name[]; \ 55 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
56 asm("start_" #name ": " code "; end_" #name ":") 56 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
57 57
58DEF_NATIVE(irq_disable, "cli"); 58DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
59DEF_NATIVE(irq_enable, "sti"); 59DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
60DEF_NATIVE(restore_fl, "push %eax; popf"); 60DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
61DEF_NATIVE(save_fl, "pushf; pop %eax"); 61DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
62DEF_NATIVE(iret, "iret"); 62DEF_NATIVE(pv_cpu_ops, iret, "iret");
63DEF_NATIVE(irq_enable_sysexit, "sti; sysexit"); 63DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit");
64DEF_NATIVE(read_cr2, "mov %cr2, %eax"); 64DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
65DEF_NATIVE(write_cr3, "mov %eax, %cr3"); 65DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
66DEF_NATIVE(read_cr3, "mov %cr3, %eax"); 66DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
67DEF_NATIVE(clts, "clts"); 67DEF_NATIVE(pv_cpu_ops, clts, "clts");
68DEF_NATIVE(read_tsc, "rdtsc"); 68DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
69 69
70DEF_NATIVE(ud2a, "ud2a"); 70/* Undefined instruction for dealing with missing ops pointers. */
71static const unsigned char ud2a[] = { 0x0f, 0x0b };
71 72
72static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, 73static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
73 unsigned long addr, unsigned len) 74 unsigned long addr, unsigned len)
@@ -76,37 +77,29 @@ static unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
76 unsigned ret; 77 unsigned ret;
77 78
78 switch(type) { 79 switch(type) {
79#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site 80#define SITE(ops, x) \
80 SITE(irq_disable); 81 case PARAVIRT_PATCH(ops.x): \
81 SITE(irq_enable); 82 start = start_##ops##_##x; \
82 SITE(restore_fl); 83 end = end_##ops##_##x; \
83 SITE(save_fl); 84 goto patch_site
84 SITE(iret); 85
85 SITE(irq_enable_sysexit); 86 SITE(pv_irq_ops, irq_disable);
86 SITE(read_cr2); 87 SITE(pv_irq_ops, irq_enable);
87 SITE(read_cr3); 88 SITE(pv_irq_ops, restore_fl);
88 SITE(write_cr3); 89 SITE(pv_irq_ops, save_fl);
89 SITE(clts); 90 SITE(pv_cpu_ops, iret);
90 SITE(read_tsc); 91 SITE(pv_cpu_ops, irq_enable_sysexit);
92 SITE(pv_mmu_ops, read_cr2);
93 SITE(pv_mmu_ops, read_cr3);
94 SITE(pv_mmu_ops, write_cr3);
95 SITE(pv_cpu_ops, clts);
96 SITE(pv_cpu_ops, read_tsc);
91#undef SITE 97#undef SITE
92 98
93 patch_site: 99 patch_site:
94 ret = paravirt_patch_insns(ibuf, len, start, end); 100 ret = paravirt_patch_insns(ibuf, len, start, end);
95 break; 101 break;
96 102
97 case PARAVIRT_PATCH(make_pgd):
98 case PARAVIRT_PATCH(make_pte):
99 case PARAVIRT_PATCH(pgd_val):
100 case PARAVIRT_PATCH(pte_val):
101#ifdef CONFIG_X86_PAE
102 case PARAVIRT_PATCH(make_pmd):
103 case PARAVIRT_PATCH(pmd_val):
104#endif
105 /* These functions end up returning exactly what
106 they're passed, in the same registers. */
107 ret = paravirt_patch_nop();
108 break;
109
110 default: 103 default:
111 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); 104 ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
112 break; 105 break;
@@ -150,7 +143,7 @@ unsigned paravirt_patch_call(void *insnbuf,
150 return 5; 143 return 5;
151} 144}
152 145
153unsigned paravirt_patch_jmp(const void *target, void *insnbuf, 146unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
154 unsigned long addr, unsigned len) 147 unsigned long addr, unsigned len)
155{ 148{
156 struct branch *b = insnbuf; 149 struct branch *b = insnbuf;
@@ -165,22 +158,37 @@ unsigned paravirt_patch_jmp(const void *target, void *insnbuf,
165 return 5; 158 return 5;
166} 159}
167 160
161/* Neat trick to map patch type back to the call within the
162 * corresponding structure. */
163static void *get_call_destination(u8 type)
164{
165 struct paravirt_patch_template tmpl = {
166 .pv_init_ops = pv_init_ops,
167 .pv_time_ops = pv_time_ops,
168 .pv_cpu_ops = pv_cpu_ops,
169 .pv_irq_ops = pv_irq_ops,
170 .pv_apic_ops = pv_apic_ops,
171 .pv_mmu_ops = pv_mmu_ops,
172 };
173 return *((void **)&tmpl + type);
174}
175
168unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, 176unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
169 unsigned long addr, unsigned len) 177 unsigned long addr, unsigned len)
170{ 178{
171 void *opfunc = *((void **)&paravirt_ops + type); 179 void *opfunc = get_call_destination(type);
172 unsigned ret; 180 unsigned ret;
173 181
174 if (opfunc == NULL) 182 if (opfunc == NULL)
175 /* If there's no function, patch it with a ud2a (BUG) */ 183 /* If there's no function, patch it with a ud2a (BUG) */
176 ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a); 184 ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
177 else if (opfunc == paravirt_nop) 185 else if (opfunc == paravirt_nop)
178 /* If the operation is a nop, then nop the callsite */ 186 /* If the operation is a nop, then nop the callsite */
179 ret = paravirt_patch_nop(); 187 ret = paravirt_patch_nop();
180 else if (type == PARAVIRT_PATCH(iret) || 188 else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
181 type == PARAVIRT_PATCH(irq_enable_sysexit)) 189 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit))
182 /* If operation requires a jmp, then jmp */ 190 /* If operation requires a jmp, then jmp */
183 ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len); 191 ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
184 else 192 else
185 /* Otherwise call the function; assume target could 193 /* Otherwise call the function; assume target could
186 clobber any caller-save reg */ 194 clobber any caller-save reg */
@@ -205,7 +213,7 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
205 213
206void init_IRQ(void) 214void init_IRQ(void)
207{ 215{
208 paravirt_ops.init_IRQ(); 216 pv_irq_ops.init_IRQ();
209} 217}
210 218
211static void native_flush_tlb(void) 219static void native_flush_tlb(void)
@@ -233,7 +241,7 @@ extern void native_irq_enable_sysexit(void);
233 241
234static int __init print_banner(void) 242static int __init print_banner(void)
235{ 243{
236 paravirt_ops.banner(); 244 pv_init_ops.banner();
237 return 0; 245 return 0;
238} 246}
239core_initcall(print_banner); 247core_initcall(print_banner);
@@ -273,47 +281,96 @@ int paravirt_disable_iospace(void)
273 return ret; 281 return ret;
274} 282}
275 283
276struct paravirt_ops paravirt_ops = { 284static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
285
286static inline void enter_lazy(enum paravirt_lazy_mode mode)
287{
288 BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
289 BUG_ON(preemptible());
290
291 x86_write_percpu(paravirt_lazy_mode, mode);
292}
293
294void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
295{
296 BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode);
297 BUG_ON(preemptible());
298
299 x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
300}
301
302void paravirt_enter_lazy_mmu(void)
303{
304 enter_lazy(PARAVIRT_LAZY_MMU);
305}
306
307void paravirt_leave_lazy_mmu(void)
308{
309 paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
310}
311
312void paravirt_enter_lazy_cpu(void)
313{
314 enter_lazy(PARAVIRT_LAZY_CPU);
315}
316
317void paravirt_leave_lazy_cpu(void)
318{
319 paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
320}
321
322enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
323{
324 return x86_read_percpu(paravirt_lazy_mode);
325}
326
327struct pv_info pv_info = {
277 .name = "bare hardware", 328 .name = "bare hardware",
278 .paravirt_enabled = 0, 329 .paravirt_enabled = 0,
279 .kernel_rpl = 0, 330 .kernel_rpl = 0,
280 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ 331 .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
332};
281 333
282 .patch = native_patch, 334struct pv_init_ops pv_init_ops = {
335 .patch = native_patch,
283 .banner = default_banner, 336 .banner = default_banner,
284 .arch_setup = paravirt_nop, 337 .arch_setup = paravirt_nop,
285 .memory_setup = machine_specific_memory_setup, 338 .memory_setup = machine_specific_memory_setup,
339};
340
341struct pv_time_ops pv_time_ops = {
342 .time_init = hpet_time_init,
286 .get_wallclock = native_get_wallclock, 343 .get_wallclock = native_get_wallclock,
287 .set_wallclock = native_set_wallclock, 344 .set_wallclock = native_set_wallclock,
288 .time_init = hpet_time_init, 345 .sched_clock = native_sched_clock,
346 .get_cpu_khz = native_calculate_cpu_khz,
347};
348
349struct pv_irq_ops pv_irq_ops = {
289 .init_IRQ = native_init_IRQ, 350 .init_IRQ = native_init_IRQ,
351 .save_fl = native_save_fl,
352 .restore_fl = native_restore_fl,
353 .irq_disable = native_irq_disable,
354 .irq_enable = native_irq_enable,
355 .safe_halt = native_safe_halt,
356 .halt = native_halt,
357};
290 358
359struct pv_cpu_ops pv_cpu_ops = {
291 .cpuid = native_cpuid, 360 .cpuid = native_cpuid,
292 .get_debugreg = native_get_debugreg, 361 .get_debugreg = native_get_debugreg,
293 .set_debugreg = native_set_debugreg, 362 .set_debugreg = native_set_debugreg,
294 .clts = native_clts, 363 .clts = native_clts,
295 .read_cr0 = native_read_cr0, 364 .read_cr0 = native_read_cr0,
296 .write_cr0 = native_write_cr0, 365 .write_cr0 = native_write_cr0,
297 .read_cr2 = native_read_cr2,
298 .write_cr2 = native_write_cr2,
299 .read_cr3 = native_read_cr3,
300 .write_cr3 = native_write_cr3,
301 .read_cr4 = native_read_cr4, 366 .read_cr4 = native_read_cr4,
302 .read_cr4_safe = native_read_cr4_safe, 367 .read_cr4_safe = native_read_cr4_safe,
303 .write_cr4 = native_write_cr4, 368 .write_cr4 = native_write_cr4,
304 .save_fl = native_save_fl,
305 .restore_fl = native_restore_fl,
306 .irq_disable = native_irq_disable,
307 .irq_enable = native_irq_enable,
308 .safe_halt = native_safe_halt,
309 .halt = native_halt,
310 .wbinvd = native_wbinvd, 369 .wbinvd = native_wbinvd,
311 .read_msr = native_read_msr_safe, 370 .read_msr = native_read_msr_safe,
312 .write_msr = native_write_msr_safe, 371 .write_msr = native_write_msr_safe,
313 .read_tsc = native_read_tsc, 372 .read_tsc = native_read_tsc,
314 .read_pmc = native_read_pmc, 373 .read_pmc = native_read_pmc,
315 .sched_clock = native_sched_clock,
316 .get_cpu_khz = native_calculate_cpu_khz,
317 .load_tr_desc = native_load_tr_desc, 374 .load_tr_desc = native_load_tr_desc,
318 .set_ldt = native_set_ldt, 375 .set_ldt = native_set_ldt,
319 .load_gdt = native_load_gdt, 376 .load_gdt = native_load_gdt,
@@ -327,9 +384,19 @@ struct paravirt_ops paravirt_ops = {
327 .write_idt_entry = write_dt_entry, 384 .write_idt_entry = write_dt_entry,
328 .load_esp0 = native_load_esp0, 385 .load_esp0 = native_load_esp0,
329 386
387 .irq_enable_sysexit = native_irq_enable_sysexit,
388 .iret = native_iret,
389
330 .set_iopl_mask = native_set_iopl_mask, 390 .set_iopl_mask = native_set_iopl_mask,
331 .io_delay = native_io_delay, 391 .io_delay = native_io_delay,
332 392
393 .lazy_mode = {
394 .enter = paravirt_nop,
395 .leave = paravirt_nop,
396 },
397};
398
399struct pv_apic_ops pv_apic_ops = {
333#ifdef CONFIG_X86_LOCAL_APIC 400#ifdef CONFIG_X86_LOCAL_APIC
334 .apic_write = native_apic_write, 401 .apic_write = native_apic_write,
335 .apic_write_atomic = native_apic_write_atomic, 402 .apic_write_atomic = native_apic_write_atomic,
@@ -338,11 +405,17 @@ struct paravirt_ops paravirt_ops = {
338 .setup_secondary_clock = setup_secondary_APIC_clock, 405 .setup_secondary_clock = setup_secondary_APIC_clock,
339 .startup_ipi_hook = paravirt_nop, 406 .startup_ipi_hook = paravirt_nop,
340#endif 407#endif
341 .set_lazy_mode = paravirt_nop, 408};
342 409
410struct pv_mmu_ops pv_mmu_ops = {
343 .pagetable_setup_start = native_pagetable_setup_start, 411 .pagetable_setup_start = native_pagetable_setup_start,
344 .pagetable_setup_done = native_pagetable_setup_done, 412 .pagetable_setup_done = native_pagetable_setup_done,
345 413
414 .read_cr2 = native_read_cr2,
415 .write_cr2 = native_write_cr2,
416 .read_cr3 = native_read_cr3,
417 .write_cr3 = native_write_cr3,
418
346 .flush_tlb_user = native_flush_tlb, 419 .flush_tlb_user = native_flush_tlb,
347 .flush_tlb_kernel = native_flush_tlb_global, 420 .flush_tlb_kernel = native_flush_tlb_global,
348 .flush_tlb_single = native_flush_tlb_single, 421 .flush_tlb_single = native_flush_tlb_single,
@@ -381,12 +454,19 @@ struct paravirt_ops paravirt_ops = {
381 .make_pte = native_make_pte, 454 .make_pte = native_make_pte,
382 .make_pgd = native_make_pgd, 455 .make_pgd = native_make_pgd,
383 456
384 .irq_enable_sysexit = native_irq_enable_sysexit,
385 .iret = native_iret,
386
387 .dup_mmap = paravirt_nop, 457 .dup_mmap = paravirt_nop,
388 .exit_mmap = paravirt_nop, 458 .exit_mmap = paravirt_nop,
389 .activate_mm = paravirt_nop, 459 .activate_mm = paravirt_nop,
460
461 .lazy_mode = {
462 .enter = paravirt_nop,
463 .leave = paravirt_nop,
464 },
390}; 465};
391 466
392EXPORT_SYMBOL(paravirt_ops); 467EXPORT_SYMBOL_GPL(pv_time_ops);
468EXPORT_SYMBOL_GPL(pv_cpu_ops);
469EXPORT_SYMBOL_GPL(pv_mmu_ops);
470EXPORT_SYMBOL_GPL(pv_apic_ops);
471EXPORT_SYMBOL_GPL(pv_info);
472EXPORT_SYMBOL (pv_irq_ops);
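The net effect of this file is that the old monolithic paravirt_ops structure is replaced by the per-area structures exported above, so a backend overrides only the groups it cares about. A minimal illustrative sketch (not part of the patch; everything prefixed demo_ is invented, while pv_info, pv_cpu_ops, pv_apic_ops and paravirt_nop are the objects exported above):

/* Hypothetical backend overriding a few of the split ops. */
static void demo_io_delay(void)
{
        /* this hypervisor needs no port-0x80 style delay */
}

static void __init demo_paravirt_init(void)
{
        pv_info.name = "demo";
        pv_info.paravirt_enabled = 1;

        pv_cpu_ops.io_delay = demo_io_delay;            /* one cpu-level op */
        pv_apic_ops.startup_ipi_hook = paravirt_nop;    /* ignore AP hook */
        /* pv_time_ops, pv_irq_ops and pv_mmu_ops keep their native
         * defaults, which is the point of the split. */
}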
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 18673e0f193b..f02bad68abaa 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -134,21 +134,21 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
134 unsigned long eip, unsigned len) 134 unsigned long eip, unsigned len)
135{ 135{
136 switch (type) { 136 switch (type) {
137 case PARAVIRT_PATCH(irq_disable): 137 case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
138 return patch_internal(VMI_CALL_DisableInterrupts, len, 138 return patch_internal(VMI_CALL_DisableInterrupts, len,
139 insns, eip); 139 insns, eip);
140 case PARAVIRT_PATCH(irq_enable): 140 case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
141 return patch_internal(VMI_CALL_EnableInterrupts, len, 141 return patch_internal(VMI_CALL_EnableInterrupts, len,
142 insns, eip); 142 insns, eip);
143 case PARAVIRT_PATCH(restore_fl): 143 case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
144 return patch_internal(VMI_CALL_SetInterruptMask, len, 144 return patch_internal(VMI_CALL_SetInterruptMask, len,
145 insns, eip); 145 insns, eip);
146 case PARAVIRT_PATCH(save_fl): 146 case PARAVIRT_PATCH(pv_irq_ops.save_fl):
147 return patch_internal(VMI_CALL_GetInterruptMask, len, 147 return patch_internal(VMI_CALL_GetInterruptMask, len,
148 insns, eip); 148 insns, eip);
149 case PARAVIRT_PATCH(iret): 149 case PARAVIRT_PATCH(pv_cpu_ops.iret):
150 return patch_internal(VMI_CALL_IRET, len, insns, eip); 150 return patch_internal(VMI_CALL_IRET, len, insns, eip);
151 case PARAVIRT_PATCH(irq_enable_sysexit): 151 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
152 return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); 152 return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip);
153 default: 153 default:
154 break; 154 break;
@@ -552,24 +552,22 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
552} 552}
553#endif 553#endif
554 554
555static void vmi_set_lazy_mode(enum paravirt_lazy_mode mode) 555static void vmi_enter_lazy_cpu(void)
556{ 556{
557 static DEFINE_PER_CPU(enum paravirt_lazy_mode, lazy_mode); 557 paravirt_enter_lazy_cpu();
558 558 vmi_ops.set_lazy_mode(2);
559 if (!vmi_ops.set_lazy_mode) 559}
560 return;
561 560
562 /* Modes should never nest or overlap */ 561static void vmi_enter_lazy_mmu(void)
563 BUG_ON(__get_cpu_var(lazy_mode) && !(mode == PARAVIRT_LAZY_NONE || 562{
564 mode == PARAVIRT_LAZY_FLUSH)); 563 paravirt_enter_lazy_mmu();
564 vmi_ops.set_lazy_mode(1);
565}
565 566
566 if (mode == PARAVIRT_LAZY_FLUSH) { 567static void vmi_leave_lazy(void)
567 vmi_ops.set_lazy_mode(0); 568{
568 vmi_ops.set_lazy_mode(__get_cpu_var(lazy_mode)); 569 paravirt_leave_lazy(paravirt_get_lazy_mode());
569 } else { 570 vmi_ops.set_lazy_mode(0);
570 vmi_ops.set_lazy_mode(mode);
571 __get_cpu_var(lazy_mode) = mode;
572 }
573} 571}
574 572
575static inline int __init check_vmi_rom(struct vrom_header *rom) 573static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -690,9 +688,9 @@ do { \
690 reloc = call_vrom_long_func(vmi_rom, get_reloc, \ 688 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
691 VMI_CALL_##vmicall); \ 689 VMI_CALL_##vmicall); \
692 if (rel->type == VMI_RELOCATION_CALL_REL) \ 690 if (rel->type == VMI_RELOCATION_CALL_REL) \
693 paravirt_ops.opname = (void *)rel->eip; \ 691 opname = (void *)rel->eip; \
694 else if (rel->type == VMI_RELOCATION_NOP) \ 692 else if (rel->type == VMI_RELOCATION_NOP) \
695 paravirt_ops.opname = (void *)vmi_nop; \ 693 opname = (void *)vmi_nop; \
696 else if (rel->type != VMI_RELOCATION_NONE) \ 694 else if (rel->type != VMI_RELOCATION_NONE) \
697 printk(KERN_WARNING "VMI: Unknown relocation " \ 695 printk(KERN_WARNING "VMI: Unknown relocation " \
698 "type %d for " #vmicall"\n",\ 696 "type %d for " #vmicall"\n",\
@@ -712,7 +710,7 @@ do { \
712 VMI_CALL_##vmicall); \ 710 VMI_CALL_##vmicall); \
713 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \ 711 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
714 if (rel->type == VMI_RELOCATION_CALL_REL) { \ 712 if (rel->type == VMI_RELOCATION_CALL_REL) { \
715 paravirt_ops.opname = wrapper; \ 713 opname = wrapper; \
716 vmi_ops.cache = (void *)rel->eip; \ 714 vmi_ops.cache = (void *)rel->eip; \
717 } \ 715 } \
718} while (0) 716} while (0)
@@ -732,11 +730,11 @@ static inline int __init activate_vmi(void)
732 } 730 }
733 savesegment(cs, kernel_cs); 731 savesegment(cs, kernel_cs);
734 732
735 paravirt_ops.paravirt_enabled = 1; 733 pv_info.paravirt_enabled = 1;
736 paravirt_ops.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; 734 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
735 pv_info.name = "vmi";
737 736
738 paravirt_ops.patch = vmi_patch; 737 pv_init_ops.patch = vmi_patch;
739 paravirt_ops.name = "vmi";
740 738
741 /* 739 /*
742 * Many of these operations are ABI compatible with VMI. 740 * Many of these operations are ABI compatible with VMI.
@@ -754,26 +752,26 @@ static inline int __init activate_vmi(void)
754 */ 752 */
755 753
756 /* CPUID is special, so very special it gets wrapped like a present */ 754 /* CPUID is special, so very special it gets wrapped like a present */
757 para_wrap(cpuid, vmi_cpuid, cpuid, CPUID); 755 para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
758 756
759 para_fill(clts, CLTS); 757 para_fill(pv_cpu_ops.clts, CLTS);
760 para_fill(get_debugreg, GetDR); 758 para_fill(pv_cpu_ops.get_debugreg, GetDR);
761 para_fill(set_debugreg, SetDR); 759 para_fill(pv_cpu_ops.set_debugreg, SetDR);
762 para_fill(read_cr0, GetCR0); 760 para_fill(pv_cpu_ops.read_cr0, GetCR0);
763 para_fill(read_cr2, GetCR2); 761 para_fill(pv_mmu_ops.read_cr2, GetCR2);
764 para_fill(read_cr3, GetCR3); 762 para_fill(pv_mmu_ops.read_cr3, GetCR3);
765 para_fill(read_cr4, GetCR4); 763 para_fill(pv_cpu_ops.read_cr4, GetCR4);
766 para_fill(write_cr0, SetCR0); 764 para_fill(pv_cpu_ops.write_cr0, SetCR0);
767 para_fill(write_cr2, SetCR2); 765 para_fill(pv_mmu_ops.write_cr2, SetCR2);
768 para_fill(write_cr3, SetCR3); 766 para_fill(pv_mmu_ops.write_cr3, SetCR3);
769 para_fill(write_cr4, SetCR4); 767 para_fill(pv_cpu_ops.write_cr4, SetCR4);
770 para_fill(save_fl, GetInterruptMask); 768 para_fill(pv_irq_ops.save_fl, GetInterruptMask);
771 para_fill(restore_fl, SetInterruptMask); 769 para_fill(pv_irq_ops.restore_fl, SetInterruptMask);
772 para_fill(irq_disable, DisableInterrupts); 770 para_fill(pv_irq_ops.irq_disable, DisableInterrupts);
773 para_fill(irq_enable, EnableInterrupts); 771 para_fill(pv_irq_ops.irq_enable, EnableInterrupts);
774 772
775 para_fill(wbinvd, WBINVD); 773 para_fill(pv_cpu_ops.wbinvd, WBINVD);
776 para_fill(read_tsc, RDTSC); 774 para_fill(pv_cpu_ops.read_tsc, RDTSC);
777 775
778 /* The following we emulate with trap and emulate for now */ 776 /* The following we emulate with trap and emulate for now */
779 /* paravirt_ops.read_msr = vmi_rdmsr */ 777 /* paravirt_ops.read_msr = vmi_rdmsr */
@@ -781,29 +779,38 @@ static inline int __init activate_vmi(void)
781 /* paravirt_ops.rdpmc = vmi_rdpmc */ 779 /* paravirt_ops.rdpmc = vmi_rdpmc */
782 780
783 /* TR interface doesn't pass TR value, wrap */ 781 /* TR interface doesn't pass TR value, wrap */
784 para_wrap(load_tr_desc, vmi_set_tr, set_tr, SetTR); 782 para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
785 783
786 /* LDT is special, too */ 784 /* LDT is special, too */
787 para_wrap(set_ldt, vmi_set_ldt, _set_ldt, SetLDT); 785 para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
788 786
789 para_fill(load_gdt, SetGDT); 787 para_fill(pv_cpu_ops.load_gdt, SetGDT);
790 para_fill(load_idt, SetIDT); 788 para_fill(pv_cpu_ops.load_idt, SetIDT);
791 para_fill(store_gdt, GetGDT); 789 para_fill(pv_cpu_ops.store_gdt, GetGDT);
792 para_fill(store_idt, GetIDT); 790 para_fill(pv_cpu_ops.store_idt, GetIDT);
793 para_fill(store_tr, GetTR); 791 para_fill(pv_cpu_ops.store_tr, GetTR);
794 paravirt_ops.load_tls = vmi_load_tls; 792 pv_cpu_ops.load_tls = vmi_load_tls;
795 para_fill(write_ldt_entry, WriteLDTEntry); 793 para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry);
796 para_fill(write_gdt_entry, WriteGDTEntry); 794 para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry);
797 para_fill(write_idt_entry, WriteIDTEntry); 795 para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry);
798 para_wrap(load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); 796 para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack);
799 para_fill(set_iopl_mask, SetIOPLMask); 797 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
800 para_fill(io_delay, IODelay); 798 para_fill(pv_cpu_ops.io_delay, IODelay);
801 para_wrap(set_lazy_mode, vmi_set_lazy_mode, set_lazy_mode, SetLazyMode); 799
800 para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
801 set_lazy_mode, SetLazyMode);
802 para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy,
803 set_lazy_mode, SetLazyMode);
804
805 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
806 set_lazy_mode, SetLazyMode);
807 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy,
808 set_lazy_mode, SetLazyMode);
802 809
803 /* user and kernel flush are just handled with different flags to FlushTLB */ 810 /* user and kernel flush are just handled with different flags to FlushTLB */
804 para_wrap(flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB); 811 para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
805 para_wrap(flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB); 812 para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
806 para_fill(flush_tlb_single, InvalPage); 813 para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
807 814
808 /* 815 /*
809 * Until a standard flag format can be agreed on, we need to 816 * Until a standard flag format can be agreed on, we need to
@@ -819,41 +826,41 @@ static inline int __init activate_vmi(void)
819#endif 826#endif
820 827
821 if (vmi_ops.set_pte) { 828 if (vmi_ops.set_pte) {
822 paravirt_ops.set_pte = vmi_set_pte; 829 pv_mmu_ops.set_pte = vmi_set_pte;
823 paravirt_ops.set_pte_at = vmi_set_pte_at; 830 pv_mmu_ops.set_pte_at = vmi_set_pte_at;
824 paravirt_ops.set_pmd = vmi_set_pmd; 831 pv_mmu_ops.set_pmd = vmi_set_pmd;
825#ifdef CONFIG_X86_PAE 832#ifdef CONFIG_X86_PAE
826 paravirt_ops.set_pte_atomic = vmi_set_pte_atomic; 833 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
827 paravirt_ops.set_pte_present = vmi_set_pte_present; 834 pv_mmu_ops.set_pte_present = vmi_set_pte_present;
828 paravirt_ops.set_pud = vmi_set_pud; 835 pv_mmu_ops.set_pud = vmi_set_pud;
829 paravirt_ops.pte_clear = vmi_pte_clear; 836 pv_mmu_ops.pte_clear = vmi_pte_clear;
830 paravirt_ops.pmd_clear = vmi_pmd_clear; 837 pv_mmu_ops.pmd_clear = vmi_pmd_clear;
831#endif 838#endif
832 } 839 }
833 840
834 if (vmi_ops.update_pte) { 841 if (vmi_ops.update_pte) {
835 paravirt_ops.pte_update = vmi_update_pte; 842 pv_mmu_ops.pte_update = vmi_update_pte;
836 paravirt_ops.pte_update_defer = vmi_update_pte_defer; 843 pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
837 } 844 }
838 845
839 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); 846 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
840 if (vmi_ops.allocate_page) { 847 if (vmi_ops.allocate_page) {
841 paravirt_ops.alloc_pt = vmi_allocate_pt; 848 pv_mmu_ops.alloc_pt = vmi_allocate_pt;
842 paravirt_ops.alloc_pd = vmi_allocate_pd; 849 pv_mmu_ops.alloc_pd = vmi_allocate_pd;
843 paravirt_ops.alloc_pd_clone = vmi_allocate_pd_clone; 850 pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone;
844 } 851 }
845 852
846 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); 853 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
847 if (vmi_ops.release_page) { 854 if (vmi_ops.release_page) {
848 paravirt_ops.release_pt = vmi_release_pt; 855 pv_mmu_ops.release_pt = vmi_release_pt;
849 paravirt_ops.release_pd = vmi_release_pd; 856 pv_mmu_ops.release_pd = vmi_release_pd;
850 } 857 }
851 858
852 /* Set linear is needed in all cases */ 859 /* Set linear is needed in all cases */
853 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); 860 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
854#ifdef CONFIG_HIGHPTE 861#ifdef CONFIG_HIGHPTE
855 if (vmi_ops.set_linear_mapping) 862 if (vmi_ops.set_linear_mapping)
856 paravirt_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; 863 pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
857#endif 864#endif
858 865
859 /* 866 /*
@@ -863,17 +870,17 @@ static inline int __init activate_vmi(void)
863 * the backend. They are performance critical anyway, so requiring 870 * the backend. They are performance critical anyway, so requiring
864 * a patch is not a big problem. 871 * a patch is not a big problem.
865 */ 872 */
866 paravirt_ops.irq_enable_sysexit = (void *)0xfeedbab0; 873 pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
867 paravirt_ops.iret = (void *)0xbadbab0; 874 pv_cpu_ops.iret = (void *)0xbadbab0;
868 875
869#ifdef CONFIG_SMP 876#ifdef CONFIG_SMP
870 para_wrap(startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState); 877 para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
871#endif 878#endif
872 879
873#ifdef CONFIG_X86_LOCAL_APIC 880#ifdef CONFIG_X86_LOCAL_APIC
874 para_fill(apic_read, APICRead); 881 para_fill(pv_apic_ops.apic_read, APICRead);
875 para_fill(apic_write, APICWrite); 882 para_fill(pv_apic_ops.apic_write, APICWrite);
876 para_fill(apic_write_atomic, APICWrite); 883 para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
877#endif 884#endif
878 885
879 /* 886 /*
@@ -891,15 +898,15 @@ static inline int __init activate_vmi(void)
891 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm); 898 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
892 vmi_timer_ops.cancel_alarm = 899 vmi_timer_ops.cancel_alarm =
893 vmi_get_function(VMI_CALL_CancelAlarm); 900 vmi_get_function(VMI_CALL_CancelAlarm);
894 paravirt_ops.time_init = vmi_time_init; 901 pv_time_ops.time_init = vmi_time_init;
895 paravirt_ops.get_wallclock = vmi_get_wallclock; 902 pv_time_ops.get_wallclock = vmi_get_wallclock;
896 paravirt_ops.set_wallclock = vmi_set_wallclock; 903 pv_time_ops.set_wallclock = vmi_set_wallclock;
897#ifdef CONFIG_X86_LOCAL_APIC 904#ifdef CONFIG_X86_LOCAL_APIC
898 paravirt_ops.setup_boot_clock = vmi_time_bsp_init; 905 pv_apic_ops.setup_boot_clock = vmi_time_bsp_init;
899 paravirt_ops.setup_secondary_clock = vmi_time_ap_init; 906 pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
900#endif 907#endif
901 paravirt_ops.sched_clock = vmi_sched_clock; 908 pv_time_ops.sched_clock = vmi_sched_clock;
902 paravirt_ops.get_cpu_khz = vmi_cpu_khz; 909 pv_time_ops.get_cpu_khz = vmi_cpu_khz;
903 910
904 /* We have true wallclock functions; disable CMOS clock sync */ 911 /* We have true wallclock functions; disable CMOS clock sync */
905 no_sync_cmos_clock = 1; 912 no_sync_cmos_clock = 1;
@@ -908,7 +915,7 @@ static inline int __init activate_vmi(void)
908 disable_vmi_timer = 1; 915 disable_vmi_timer = 1;
909 } 916 }
910 917
911 para_fill(safe_halt, Halt); 918 para_fill(pv_irq_ops.safe_halt, Halt);
912 919
913 /* 920 /*
914 * Alternative instruction rewriting doesn't happen soon enough 921 * Alternative instruction rewriting doesn't happen soon enough
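With the single set_lazy_mode hook split into per-group enter/leave pairs, any backend ends up wiring its lazy handling in the same shape as the VMI code above. A rough sketch, assuming only the helpers visible in this patch (paravirt_enter_lazy_mmu, paravirt_leave_lazy, paravirt_get_lazy_mode); the demo_* names are invented:

static void demo_enter_lazy_mmu(void)
{
        paravirt_enter_lazy_mmu();      /* common nesting bookkeeping */
        /* tell the hypervisor to start batching page-table updates */
}

static void demo_leave_lazy(void)
{
        paravirt_leave_lazy(paravirt_get_lazy_mode());
        /* flush whatever was batched */
}

/* installed the same way the VMI code does above:
 *      pv_mmu_ops.lazy_mode.enter = demo_enter_lazy_mmu;
 *      pv_mmu_ops.lazy_mode.leave = demo_leave_lazy;
 */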
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index e4e37d4f4c52..c7d19471261d 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -748,24 +748,12 @@ struct kmem_cache *pmd_cache;
748 748
749void __init pgtable_cache_init(void) 749void __init pgtable_cache_init(void)
750{ 750{
751 size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t); 751 if (PTRS_PER_PMD > 1)
752
753 if (PTRS_PER_PMD > 1) {
754 pmd_cache = kmem_cache_create("pmd", 752 pmd_cache = kmem_cache_create("pmd",
755 PTRS_PER_PMD*sizeof(pmd_t), 753 PTRS_PER_PMD*sizeof(pmd_t),
756 PTRS_PER_PMD*sizeof(pmd_t), 754 PTRS_PER_PMD*sizeof(pmd_t),
757 SLAB_PANIC, 755 SLAB_PANIC,
758 pmd_ctor); 756 pmd_ctor);
759 if (!SHARED_KERNEL_PMD) {
760 /* If we're in PAE mode and have a non-shared
761 kernel pmd, then the pgd size must be a
762 page size. This is because the pgd_list
763 links through the page structure, so there
764 can only be one pgd per page for this to
765 work. */
766 pgd_size = PAGE_SIZE;
767 }
768 }
769} 757}
770 758
771/* 759/*
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 265f7dd3234b..94c39aaf695f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -51,11 +51,25 @@
51 51
52EXPORT_SYMBOL_GPL(hypercall_page); 52EXPORT_SYMBOL_GPL(hypercall_page);
53 53
54DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
55
56DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); 54DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
57DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 55DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
58DEFINE_PER_CPU(unsigned long, xen_cr3); 56
57/*
58 * Note about cr3 (pagetable base) values:
59 *
 60 * xen_cr3 contains the current logical cr3 value, i.e. the cr3 most
 61 * recently set. This may not be the current effective cr3, because
62 * its update may be being lazily deferred. However, a vcpu looking
 63 * at its own cr3 can use this value knowing that everything will
64 * be self-consistent.
65 *
66 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
67 * hypercall to set the vcpu cr3 is complete (so it may be a little
68 * out of date, but it will never be set early). If one vcpu is
69 * looking at another vcpu's cr3 value, it should use this variable.
70 */
71DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
72DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
59 73
60struct start_info *xen_start_info; 74struct start_info *xen_start_info;
61EXPORT_SYMBOL_GPL(xen_start_info); 75EXPORT_SYMBOL_GPL(xen_start_info);
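The access rule spelled out in the new comment reduces to two checks. A sketch using only helpers that appear elsewhere in this patch (x86_read_percpu, per_cpu, __pa); the demo_* function names are invented for illustration:

/* Own vcpu: the logical value is safe to use even if still pending. */
static inline bool demo_this_cpu_uses_pgd(pgd_t *pgd)
{
        return x86_read_percpu(xen_cr3) == __pa(pgd);
}

/* Another vcpu: only the post-hypercall value may be trusted. */
static inline bool demo_other_cpu_uses_pgd(int cpu, pgd_t *pgd)
{
        return per_cpu(xen_current_cr3, cpu) == __pa(pgd);
}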
@@ -99,7 +113,7 @@ static void __init xen_vcpu_setup(int cpu)
99 info.mfn = virt_to_mfn(vcpup); 113 info.mfn = virt_to_mfn(vcpup);
100 info.offset = offset_in_page(vcpup); 114 info.offset = offset_in_page(vcpup);
101 115
102 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", 116 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
103 cpu, vcpup, info.mfn, info.offset); 117 cpu, vcpup, info.mfn, info.offset);
104 118
105 /* Check to see if the hypervisor will put the vcpu_info 119 /* Check to see if the hypervisor will put the vcpu_info
@@ -123,7 +137,7 @@ static void __init xen_vcpu_setup(int cpu)
123static void __init xen_banner(void) 137static void __init xen_banner(void)
124{ 138{
125 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 139 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
126 paravirt_ops.name); 140 pv_info.name);
127 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); 141 printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
128} 142}
129 143
@@ -248,29 +262,10 @@ static void xen_halt(void)
248 xen_safe_halt(); 262 xen_safe_halt();
249} 263}
250 264
251static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) 265static void xen_leave_lazy(void)
252{ 266{
253 BUG_ON(preemptible()); 267 paravirt_leave_lazy(paravirt_get_lazy_mode());
254
255 switch (mode) {
256 case PARAVIRT_LAZY_NONE:
257 BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
258 break;
259
260 case PARAVIRT_LAZY_MMU:
261 case PARAVIRT_LAZY_CPU:
262 BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE);
263 break;
264
265 case PARAVIRT_LAZY_FLUSH:
266 /* flush if necessary, but don't change state */
267 if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE)
268 xen_mc_flush();
269 return;
270 }
271
272 xen_mc_flush(); 268 xen_mc_flush();
273 x86_write_percpu(xen_lazy_mode, mode);
274} 269}
275 270
276static unsigned long xen_store_tr(void) 271static unsigned long xen_store_tr(void)
@@ -357,7 +352,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
357 * loaded properly. This will go away as soon as Xen has been 352 * loaded properly. This will go away as soon as Xen has been
358 * modified to not save/restore %gs for normal hypercalls. 353 * modified to not save/restore %gs for normal hypercalls.
359 */ 354 */
360 if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) 355 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
361 loadsegment(gs, 0); 356 loadsegment(gs, 0);
362} 357}
363 358
@@ -631,32 +626,36 @@ static unsigned long xen_read_cr3(void)
631 return x86_read_percpu(xen_cr3); 626 return x86_read_percpu(xen_cr3);
632} 627}
633 628
629static void set_current_cr3(void *v)
630{
631 x86_write_percpu(xen_current_cr3, (unsigned long)v);
632}
633
634static void xen_write_cr3(unsigned long cr3) 634static void xen_write_cr3(unsigned long cr3)
635{ 635{
636 struct mmuext_op *op;
637 struct multicall_space mcs;
638 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
639
636 BUG_ON(preemptible()); 640 BUG_ON(preemptible());
637 641
638 if (cr3 == x86_read_percpu(xen_cr3)) { 642 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */
639 /* just a simple tlb flush */
640 xen_flush_tlb();
641 return;
642 }
643 643
644 /* Update while interrupts are disabled, so its atomic with
645 respect to ipis */
644 x86_write_percpu(xen_cr3, cr3); 646 x86_write_percpu(xen_cr3, cr3);
645 647
648 op = mcs.args;
649 op->cmd = MMUEXT_NEW_BASEPTR;
650 op->arg1.mfn = mfn;
646 651
647 { 652 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
648 struct mmuext_op *op;
649 struct multicall_space mcs = xen_mc_entry(sizeof(*op));
650 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
651
652 op = mcs.args;
653 op->cmd = MMUEXT_NEW_BASEPTR;
654 op->arg1.mfn = mfn;
655 653
 656 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 654 /* Update xen_current_cr3 once the batch has actually
655 been submitted. */
656 xen_mc_callback(set_current_cr3, (void *)cr3);
657 657
658 xen_mc_issue(PARAVIRT_LAZY_CPU); 658 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
659 }
660} 659}
661 660
662/* Early in boot, while setting up the initial pagetable, assume 661/* Early in boot, while setting up the initial pagetable, assume
@@ -667,6 +666,15 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
667 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 666 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
668} 667}
669 668
669static void pin_pagetable_pfn(unsigned level, unsigned long pfn)
670{
671 struct mmuext_op op;
672 op.cmd = level;
673 op.arg1.mfn = pfn_to_mfn(pfn);
674 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
675 BUG();
676}
677
670/* This needs to make sure the new pte page is pinned iff its being 678/* This needs to make sure the new pte page is pinned iff its being
671 attached to a pinned pagetable. */ 679 attached to a pinned pagetable. */
672static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) 680static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
@@ -676,9 +684,10 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
676 if (PagePinned(virt_to_page(mm->pgd))) { 684 if (PagePinned(virt_to_page(mm->pgd))) {
677 SetPagePinned(page); 685 SetPagePinned(page);
678 686
679 if (!PageHighMem(page)) 687 if (!PageHighMem(page)) {
680 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 688 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
681 else 689 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
690 } else
682 /* make sure there are no stray mappings of 691 /* make sure there are no stray mappings of
683 this page */ 692 this page */
684 kmap_flush_unused(); 693 kmap_flush_unused();
@@ -691,8 +700,10 @@ static void xen_release_pt(u32 pfn)
691 struct page *page = pfn_to_page(pfn); 700 struct page *page = pfn_to_page(pfn);
692 701
693 if (PagePinned(page)) { 702 if (PagePinned(page)) {
694 if (!PageHighMem(page)) 703 if (!PageHighMem(page)) {
704 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
695 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 705 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
706 }
696 } 707 }
697} 708}
698 709
@@ -737,7 +748,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base)
737 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; 748 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
738 749
739 /* special set_pte for pagetable initialization */ 750 /* special set_pte for pagetable initialization */
740 paravirt_ops.set_pte = xen_set_pte_init; 751 pv_mmu_ops.set_pte = xen_set_pte_init;
741 752
742 init_mm.pgd = base; 753 init_mm.pgd = base;
743 /* 754 /*
@@ -784,8 +795,8 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
784{ 795{
785 /* This will work as long as patching hasn't happened yet 796 /* This will work as long as patching hasn't happened yet
786 (which it hasn't) */ 797 (which it hasn't) */
787 paravirt_ops.alloc_pt = xen_alloc_pt; 798 pv_mmu_ops.alloc_pt = xen_alloc_pt;
788 paravirt_ops.set_pte = xen_set_pte; 799 pv_mmu_ops.set_pte = xen_set_pte;
789 800
790 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 801 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
791 /* 802 /*
@@ -807,15 +818,15 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
807 /* Actually pin the pagetable down, but we can't set PG_pinned 818 /* Actually pin the pagetable down, but we can't set PG_pinned
808 yet because the page structures don't exist yet. */ 819 yet because the page structures don't exist yet. */
809 { 820 {
810 struct mmuext_op op; 821 unsigned level;
822
811#ifdef CONFIG_X86_PAE 823#ifdef CONFIG_X86_PAE
812 op.cmd = MMUEXT_PIN_L3_TABLE; 824 level = MMUEXT_PIN_L3_TABLE;
813#else 825#else
814 op.cmd = MMUEXT_PIN_L3_TABLE; 826 level = MMUEXT_PIN_L2_TABLE;
815#endif 827#endif
816 op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); 828
817 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) 829 pin_pagetable_pfn(level, PFN_DOWN(__pa(base)));
818 BUG();
819 } 830 }
820} 831}
821 832
@@ -832,12 +843,12 @@ void __init xen_setup_vcpu_info_placement(void)
832 if (have_vcpu_info_placement) { 843 if (have_vcpu_info_placement) {
833 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 844 printk(KERN_INFO "Xen: using vcpu_info placement\n");
834 845
835 paravirt_ops.save_fl = xen_save_fl_direct; 846 pv_irq_ops.save_fl = xen_save_fl_direct;
836 paravirt_ops.restore_fl = xen_restore_fl_direct; 847 pv_irq_ops.restore_fl = xen_restore_fl_direct;
837 paravirt_ops.irq_disable = xen_irq_disable_direct; 848 pv_irq_ops.irq_disable = xen_irq_disable_direct;
838 paravirt_ops.irq_enable = xen_irq_enable_direct; 849 pv_irq_ops.irq_enable = xen_irq_enable_direct;
839 paravirt_ops.read_cr2 = xen_read_cr2_direct; 850 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
840 paravirt_ops.iret = xen_iret_direct; 851 pv_cpu_ops.iret = xen_iret_direct;
841 } 852 }
842} 853}
843 854
@@ -849,8 +860,8 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
849 860
850 start = end = reloc = NULL; 861 start = end = reloc = NULL;
851 862
852#define SITE(x) \ 863#define SITE(op, x) \
853 case PARAVIRT_PATCH(x): \ 864 case PARAVIRT_PATCH(op.x): \
854 if (have_vcpu_info_placement) { \ 865 if (have_vcpu_info_placement) { \
855 start = (char *)xen_##x##_direct; \ 866 start = (char *)xen_##x##_direct; \
856 end = xen_##x##_direct_end; \ 867 end = xen_##x##_direct_end; \
@@ -859,10 +870,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
859 goto patch_site 870 goto patch_site
860 871
861 switch (type) { 872 switch (type) {
862 SITE(irq_enable); 873 SITE(pv_irq_ops, irq_enable);
863 SITE(irq_disable); 874 SITE(pv_irq_ops, irq_disable);
864 SITE(save_fl); 875 SITE(pv_irq_ops, save_fl);
865 SITE(restore_fl); 876 SITE(pv_irq_ops, restore_fl);
866#undef SITE 877#undef SITE
867 878
868 patch_site: 879 patch_site:
@@ -894,26 +905,32 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
894 return ret; 905 return ret;
895} 906}
896 907
897static const struct paravirt_ops xen_paravirt_ops __initdata = { 908static const struct pv_info xen_info __initdata = {
898 .paravirt_enabled = 1, 909 .paravirt_enabled = 1,
899 .shared_kernel_pmd = 0, 910 .shared_kernel_pmd = 0,
900 911
901 .name = "Xen", 912 .name = "Xen",
902 .banner = xen_banner, 913};
903 914
915static const struct pv_init_ops xen_init_ops __initdata = {
904 .patch = xen_patch, 916 .patch = xen_patch,
905 917
918 .banner = xen_banner,
906 .memory_setup = xen_memory_setup, 919 .memory_setup = xen_memory_setup,
907 .arch_setup = xen_arch_setup, 920 .arch_setup = xen_arch_setup,
908 .init_IRQ = xen_init_IRQ,
909 .post_allocator_init = xen_mark_init_mm_pinned, 921 .post_allocator_init = xen_mark_init_mm_pinned,
922};
910 923
924static const struct pv_time_ops xen_time_ops __initdata = {
911 .time_init = xen_time_init, 925 .time_init = xen_time_init,
926
912 .set_wallclock = xen_set_wallclock, 927 .set_wallclock = xen_set_wallclock,
913 .get_wallclock = xen_get_wallclock, 928 .get_wallclock = xen_get_wallclock,
914 .get_cpu_khz = xen_cpu_khz, 929 .get_cpu_khz = xen_cpu_khz,
915 .sched_clock = xen_sched_clock, 930 .sched_clock = xen_sched_clock,
931};
916 932
933static const struct pv_cpu_ops xen_cpu_ops __initdata = {
917 .cpuid = xen_cpuid, 934 .cpuid = xen_cpuid,
918 935
919 .set_debugreg = xen_set_debugreg, 936 .set_debugreg = xen_set_debugreg,
@@ -924,22 +941,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
924 .read_cr0 = native_read_cr0, 941 .read_cr0 = native_read_cr0,
925 .write_cr0 = native_write_cr0, 942 .write_cr0 = native_write_cr0,
926 943
927 .read_cr2 = xen_read_cr2,
928 .write_cr2 = xen_write_cr2,
929
930 .read_cr3 = xen_read_cr3,
931 .write_cr3 = xen_write_cr3,
932
933 .read_cr4 = native_read_cr4, 944 .read_cr4 = native_read_cr4,
934 .read_cr4_safe = native_read_cr4_safe, 945 .read_cr4_safe = native_read_cr4_safe,
935 .write_cr4 = xen_write_cr4, 946 .write_cr4 = xen_write_cr4,
936 947
937 .save_fl = xen_save_fl,
938 .restore_fl = xen_restore_fl,
939 .irq_disable = xen_irq_disable,
940 .irq_enable = xen_irq_enable,
941 .safe_halt = xen_safe_halt,
942 .halt = xen_halt,
943 .wbinvd = native_wbinvd, 948 .wbinvd = native_wbinvd,
944 949
945 .read_msr = native_read_msr_safe, 950 .read_msr = native_read_msr_safe,
@@ -968,6 +973,23 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
968 .set_iopl_mask = xen_set_iopl_mask, 973 .set_iopl_mask = xen_set_iopl_mask,
969 .io_delay = xen_io_delay, 974 .io_delay = xen_io_delay,
970 975
976 .lazy_mode = {
977 .enter = paravirt_enter_lazy_cpu,
978 .leave = xen_leave_lazy,
979 },
980};
981
982static const struct pv_irq_ops xen_irq_ops __initdata = {
983 .init_IRQ = xen_init_IRQ,
984 .save_fl = xen_save_fl,
985 .restore_fl = xen_restore_fl,
986 .irq_disable = xen_irq_disable,
987 .irq_enable = xen_irq_enable,
988 .safe_halt = xen_safe_halt,
989 .halt = xen_halt,
990};
991
992static const struct pv_apic_ops xen_apic_ops __initdata = {
971#ifdef CONFIG_X86_LOCAL_APIC 993#ifdef CONFIG_X86_LOCAL_APIC
972 .apic_write = xen_apic_write, 994 .apic_write = xen_apic_write,
973 .apic_write_atomic = xen_apic_write, 995 .apic_write_atomic = xen_apic_write,
@@ -976,6 +998,17 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
976 .setup_secondary_clock = paravirt_nop, 998 .setup_secondary_clock = paravirt_nop,
977 .startup_ipi_hook = paravirt_nop, 999 .startup_ipi_hook = paravirt_nop,
978#endif 1000#endif
1001};
1002
1003static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1004 .pagetable_setup_start = xen_pagetable_setup_start,
1005 .pagetable_setup_done = xen_pagetable_setup_done,
1006
1007 .read_cr2 = xen_read_cr2,
1008 .write_cr2 = xen_write_cr2,
1009
1010 .read_cr3 = xen_read_cr3,
1011 .write_cr3 = xen_write_cr3,
979 1012
980 .flush_tlb_user = xen_flush_tlb, 1013 .flush_tlb_user = xen_flush_tlb,
981 .flush_tlb_kernel = xen_flush_tlb, 1014 .flush_tlb_kernel = xen_flush_tlb,
@@ -985,9 +1018,6 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
985 .pte_update = paravirt_nop, 1018 .pte_update = paravirt_nop,
986 .pte_update_defer = paravirt_nop, 1019 .pte_update_defer = paravirt_nop,
987 1020
988 .pagetable_setup_start = xen_pagetable_setup_start,
989 .pagetable_setup_done = xen_pagetable_setup_done,
990
991 .alloc_pt = xen_alloc_pt_init, 1021 .alloc_pt = xen_alloc_pt_init,
992 .release_pt = xen_release_pt, 1022 .release_pt = xen_release_pt,
993 .alloc_pd = paravirt_nop, 1023 .alloc_pd = paravirt_nop,
@@ -1023,7 +1053,10 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
1023 .dup_mmap = xen_dup_mmap, 1053 .dup_mmap = xen_dup_mmap,
1024 .exit_mmap = xen_exit_mmap, 1054 .exit_mmap = xen_exit_mmap,
1025 1055
1026 .set_lazy_mode = xen_set_lazy_mode, 1056 .lazy_mode = {
1057 .enter = paravirt_enter_lazy_mmu,
1058 .leave = xen_leave_lazy,
1059 },
1027}; 1060};
1028 1061
1029#ifdef CONFIG_SMP 1062#ifdef CONFIG_SMP
@@ -1079,6 +1112,17 @@ static const struct machine_ops __initdata xen_machine_ops = {
1079}; 1112};
1080 1113
1081 1114
1115static void __init xen_reserve_top(void)
1116{
1117 unsigned long top = HYPERVISOR_VIRT_START;
1118 struct xen_platform_parameters pp;
1119
1120 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1121 top = pp.virt_start;
1122
1123 reserve_top_address(-top + 2 * PAGE_SIZE);
1124}
1125
1082/* First C function to be called on Xen boot */ 1126/* First C function to be called on Xen boot */
1083asmlinkage void __init xen_start_kernel(void) 1127asmlinkage void __init xen_start_kernel(void)
1084{ 1128{
@@ -1090,7 +1134,14 @@ asmlinkage void __init xen_start_kernel(void)
1090 BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); 1134 BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
1091 1135
1092 /* Install Xen paravirt ops */ 1136 /* Install Xen paravirt ops */
1093 paravirt_ops = xen_paravirt_ops; 1137 pv_info = xen_info;
1138 pv_init_ops = xen_init_ops;
1139 pv_time_ops = xen_time_ops;
1140 pv_cpu_ops = xen_cpu_ops;
1141 pv_irq_ops = xen_irq_ops;
1142 pv_apic_ops = xen_apic_ops;
1143 pv_mmu_ops = xen_mmu_ops;
1144
1094 machine_ops = xen_machine_ops; 1145 machine_ops = xen_machine_ops;
1095 1146
1096#ifdef CONFIG_SMP 1147#ifdef CONFIG_SMP
@@ -1112,6 +1163,7 @@ asmlinkage void __init xen_start_kernel(void)
1112 /* keep using Xen gdt for now; no urgent need to change it */ 1163 /* keep using Xen gdt for now; no urgent need to change it */
1113 1164
1114 x86_write_percpu(xen_cr3, __pa(pgd)); 1165 x86_write_percpu(xen_cr3, __pa(pgd));
1166 x86_write_percpu(xen_current_cr3, __pa(pgd));
1115 1167
1116#ifdef CONFIG_SMP 1168#ifdef CONFIG_SMP
1117 /* Don't do the full vcpu_info placement stuff until we have a 1169 /* Don't do the full vcpu_info placement stuff until we have a
@@ -1123,12 +1175,12 @@ asmlinkage void __init xen_start_kernel(void)
1123 xen_setup_vcpu_info_placement(); 1175 xen_setup_vcpu_info_placement();
1124#endif 1176#endif
1125 1177
1126 paravirt_ops.kernel_rpl = 1; 1178 pv_info.kernel_rpl = 1;
1127 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1179 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1128 paravirt_ops.kernel_rpl = 0; 1180 pv_info.kernel_rpl = 0;
1129 1181
1130 /* set the limit of our address space */ 1182 /* set the limit of our address space */
1131 reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); 1183 xen_reserve_top();
1132 1184
1133 /* set up basic CPUID stuff */ 1185 /* set up basic CPUID stuff */
1134 cpu_detect(&new_cpu_data); 1186 cpu_detect(&new_cpu_data);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 0bb7f0019100..b2e32f9d0071 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -154,7 +154,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
154 pte_t *ptep, pte_t pteval) 154 pte_t *ptep, pte_t pteval)
155{ 155{
156 if (mm == current->mm || mm == &init_mm) { 156 if (mm == current->mm || mm == &init_mm) {
157 if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 157 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
158 struct multicall_space mcs; 158 struct multicall_space mcs;
159 mcs = xen_mc_entry(0); 159 mcs = xen_mc_entry(0);
160 160
@@ -303,7 +303,12 @@ pgd_t xen_make_pgd(unsigned long pgd)
303} 303}
304#endif /* CONFIG_X86_PAE */ 304#endif /* CONFIG_X86_PAE */
305 305
306 306enum pt_level {
307 PT_PGD,
308 PT_PUD,
309 PT_PMD,
310 PT_PTE
311};
307 312
308/* 313/*
309 (Yet another) pagetable walker. This one is intended for pinning a 314 (Yet another) pagetable walker. This one is intended for pinning a
@@ -315,7 +320,7 @@ pgd_t xen_make_pgd(unsigned long pgd)
315 FIXADDR_TOP. But the important bit is that we don't pin beyond 320 FIXADDR_TOP. But the important bit is that we don't pin beyond
316 there, because then we start getting into Xen's ptes. 321 there, because then we start getting into Xen's ptes.
317*/ 322*/
318static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), 323static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
319 unsigned long limit) 324 unsigned long limit)
320{ 325{
321 pgd_t *pgd = pgd_base; 326 pgd_t *pgd = pgd_base;
@@ -340,7 +345,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
340 pud = pud_offset(pgd, 0); 345 pud = pud_offset(pgd, 0);
341 346
342 if (PTRS_PER_PUD > 1) /* not folded */ 347 if (PTRS_PER_PUD > 1) /* not folded */
343 flush |= (*func)(virt_to_page(pud), 0); 348 flush |= (*func)(virt_to_page(pud), PT_PUD);
344 349
345 for (; addr != pud_limit; pud++, addr = pud_next) { 350 for (; addr != pud_limit; pud++, addr = pud_next) {
346 pmd_t *pmd; 351 pmd_t *pmd;
@@ -359,7 +364,7 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
359 pmd = pmd_offset(pud, 0); 364 pmd = pmd_offset(pud, 0);
360 365
361 if (PTRS_PER_PMD > 1) /* not folded */ 366 if (PTRS_PER_PMD > 1) /* not folded */
362 flush |= (*func)(virt_to_page(pmd), 0); 367 flush |= (*func)(virt_to_page(pmd), PT_PMD);
363 368
364 for (; addr != pmd_limit; pmd++) { 369 for (; addr != pmd_limit; pmd++) {
365 addr += (PAGE_SIZE * PTRS_PER_PTE); 370 addr += (PAGE_SIZE * PTRS_PER_PTE);
@@ -371,17 +376,47 @@ static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned),
371 if (pmd_none(*pmd)) 376 if (pmd_none(*pmd))
372 continue; 377 continue;
373 378
374 flush |= (*func)(pmd_page(*pmd), 0); 379 flush |= (*func)(pmd_page(*pmd), PT_PTE);
375 } 380 }
376 } 381 }
377 } 382 }
378 383
379 flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); 384 flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
380 385
381 return flush; 386 return flush;
382} 387}
383 388
384static int pin_page(struct page *page, unsigned flags) 389static spinlock_t *lock_pte(struct page *page)
390{
391 spinlock_t *ptl = NULL;
392
393#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
394 ptl = __pte_lockptr(page);
395 spin_lock(ptl);
396#endif
397
398 return ptl;
399}
400
401static void do_unlock(void *v)
402{
403 spinlock_t *ptl = v;
404 spin_unlock(ptl);
405}
406
407static void xen_do_pin(unsigned level, unsigned long pfn)
408{
409 struct mmuext_op *op;
410 struct multicall_space mcs;
411
412 mcs = __xen_mc_entry(sizeof(*op));
413 op = mcs.args;
414 op->cmd = level;
415 op->arg1.mfn = pfn_to_mfn(pfn);
416 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
417}
418
419static int pin_page(struct page *page, enum pt_level level)
385{ 420{
386 unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); 421 unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
387 int flush; 422 int flush;
@@ -396,12 +431,26 @@ static int pin_page(struct page *page, unsigned flags)
396 void *pt = lowmem_page_address(page); 431 void *pt = lowmem_page_address(page);
397 unsigned long pfn = page_to_pfn(page); 432 unsigned long pfn = page_to_pfn(page);
398 struct multicall_space mcs = __xen_mc_entry(0); 433 struct multicall_space mcs = __xen_mc_entry(0);
434 spinlock_t *ptl;
399 435
400 flush = 0; 436 flush = 0;
401 437
438 ptl = NULL;
439 if (level == PT_PTE)
440 ptl = lock_pte(page);
441
402 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 442 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
403 pfn_pte(pfn, PAGE_KERNEL_RO), 443 pfn_pte(pfn, PAGE_KERNEL_RO),
404 flags); 444 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
445
446 if (level == PT_PTE)
447 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
448
449 if (ptl) {
450 /* Queue a deferred unlock for when this batch
451 is completed. */
452 xen_mc_callback(do_unlock, ptl);
453 }
405 } 454 }
406 455
407 return flush; 456 return flush;
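The locking idiom above is worth spelling out: with split pte locks, the lock taken by lock_pte() must be held until the batched hypercalls have actually run, so the unlock is queued as a flush-time callback instead of being done inline. An outline (lock_pte, do_unlock, xen_do_pin, PT_PTE and xen_mc_callback are from this file; only the wrapper function name is invented):

static void demo_remap_ro_locked(struct page *page, enum pt_level level)
{
        spinlock_t *ptl = NULL;

        if (level == PT_PTE)
                ptl = lock_pte(page);            /* NULL without split pte locks */

        /* ... queue MULTI_update_va_mapping / xen_do_pin into the batch ... */

        if (ptl)
                xen_mc_callback(do_unlock, ptl); /* spin_unlock at flush time */
}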
@@ -412,8 +461,7 @@ static int pin_page(struct page *page, unsigned flags)
412 read-only, and can be pinned. */ 461 read-only, and can be pinned. */
413void xen_pgd_pin(pgd_t *pgd) 462void xen_pgd_pin(pgd_t *pgd)
414{ 463{
415 struct multicall_space mcs; 464 unsigned level;
416 struct mmuext_op *op;
417 465
418 xen_mc_batch(); 466 xen_mc_batch();
419 467
@@ -424,16 +472,13 @@ void xen_pgd_pin(pgd_t *pgd)
424 xen_mc_batch(); 472 xen_mc_batch();
425 } 473 }
426 474
427 mcs = __xen_mc_entry(sizeof(*op));
428 op = mcs.args;
429
430#ifdef CONFIG_X86_PAE 475#ifdef CONFIG_X86_PAE
431 op->cmd = MMUEXT_PIN_L3_TABLE; 476 level = MMUEXT_PIN_L3_TABLE;
432#else 477#else
433 op->cmd = MMUEXT_PIN_L2_TABLE; 478 level = MMUEXT_PIN_L2_TABLE;
434#endif 479#endif
435 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); 480
436 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 481 xen_do_pin(level, PFN_DOWN(__pa(pgd)));
437 482
438 xen_mc_issue(0); 483 xen_mc_issue(0);
439} 484}
@@ -441,7 +486,7 @@ void xen_pgd_pin(pgd_t *pgd)
441/* The init_mm pagetable is really pinned as soon as its created, but 486/* The init_mm pagetable is really pinned as soon as its created, but
442 that's before we have page structures to store the bits. So do all 487 that's before we have page structures to store the bits. So do all
443 the book-keeping now. */ 488 the book-keeping now. */
444static __init int mark_pinned(struct page *page, unsigned flags) 489static __init int mark_pinned(struct page *page, enum pt_level level)
445{ 490{
446 SetPagePinned(page); 491 SetPagePinned(page);
447 return 0; 492 return 0;
@@ -452,18 +497,32 @@ void __init xen_mark_init_mm_pinned(void)
452 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); 497 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
453} 498}
454 499
455static int unpin_page(struct page *page, unsigned flags) 500static int unpin_page(struct page *page, enum pt_level level)
456{ 501{
457 unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); 502 unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
458 503
459 if (pgfl && !PageHighMem(page)) { 504 if (pgfl && !PageHighMem(page)) {
460 void *pt = lowmem_page_address(page); 505 void *pt = lowmem_page_address(page);
461 unsigned long pfn = page_to_pfn(page); 506 unsigned long pfn = page_to_pfn(page);
462 struct multicall_space mcs = __xen_mc_entry(0); 507 spinlock_t *ptl = NULL;
508 struct multicall_space mcs;
509
510 if (level == PT_PTE) {
511 ptl = lock_pte(page);
512
513 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
514 }
515
516 mcs = __xen_mc_entry(0);
463 517
464 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 518 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
465 pfn_pte(pfn, PAGE_KERNEL), 519 pfn_pte(pfn, PAGE_KERNEL),
466 flags); 520 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
521
522 if (ptl) {
523 /* unlock when batch completed */
524 xen_mc_callback(do_unlock, ptl);
525 }
467 } 526 }
468 527
469 return 0; /* never need to flush on unpin */ 528 return 0; /* never need to flush on unpin */
@@ -472,18 +531,9 @@ static int unpin_page(struct page *page, unsigned flags)
472/* Release a pagetables pages back as normal RW */ 531/* Release a pagetables pages back as normal RW */
473static void xen_pgd_unpin(pgd_t *pgd) 532static void xen_pgd_unpin(pgd_t *pgd)
474{ 533{
475 struct mmuext_op *op;
476 struct multicall_space mcs;
477
478 xen_mc_batch(); 534 xen_mc_batch();
479 535
480 mcs = __xen_mc_entry(sizeof(*op)); 536 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
481
482 op = mcs.args;
483 op->cmd = MMUEXT_UNPIN_TABLE;
484 op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
485
486 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
487 537
488 pgd_walk(pgd, unpin_page, TASK_SIZE); 538 pgd_walk(pgd, unpin_page, TASK_SIZE);
489 539
@@ -514,20 +564,43 @@ static void drop_other_mm_ref(void *info)
514 564
515 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 565 if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
516 leave_mm(smp_processor_id()); 566 leave_mm(smp_processor_id());
567
568 /* If this cpu still has a stale cr3 reference, then make sure
569 it has been flushed. */
570 if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
571 load_cr3(swapper_pg_dir);
572 arch_flush_lazy_cpu_mode();
573 }
517} 574}
518 575
519static void drop_mm_ref(struct mm_struct *mm) 576static void drop_mm_ref(struct mm_struct *mm)
520{ 577{
578 cpumask_t mask;
579 unsigned cpu;
580
521 if (current->active_mm == mm) { 581 if (current->active_mm == mm) {
522 if (current->mm == mm) 582 if (current->mm == mm)
523 load_cr3(swapper_pg_dir); 583 load_cr3(swapper_pg_dir);
524 else 584 else
525 leave_mm(smp_processor_id()); 585 leave_mm(smp_processor_id());
586 arch_flush_lazy_cpu_mode();
526 } 587 }
527 588
528 if (!cpus_empty(mm->cpu_vm_mask)) 589 /* Get the "official" set of cpus referring to our pagetable. */
529 xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, 590 mask = mm->cpu_vm_mask;
530 mm, 1); 591
592 /* It's possible that a vcpu may have a stale reference to our
 593 cr3, because it's in lazy mode, and it hasn't yet flushed
 594 its set of pending hypercalls. In this case, we can
595 look at its actual current cr3 value, and force it to flush
596 if needed. */
597 for_each_online_cpu(cpu) {
598 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
599 cpu_set(cpu, mask);
600 }
601
602 if (!cpus_empty(mask))
603 xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
531} 604}
532#else 605#else
533static void drop_mm_ref(struct mm_struct *mm) 606static void drop_mm_ref(struct mm_struct *mm)
@@ -562,5 +635,6 @@ void xen_exit_mmap(struct mm_struct *mm)
562 /* pgd may not be pinned in the error exit path of execve */ 635 /* pgd may not be pinned in the error exit path of execve */
563 if (PagePinned(virt_to_page(mm->pgd))) 636 if (PagePinned(virt_to_page(mm->pgd)))
564 xen_pgd_unpin(mm->pgd); 637 xen_pgd_unpin(mm->pgd);
638
565 spin_unlock(&mm->page_table_lock); 639 spin_unlock(&mm->page_table_lock);
566} 640}
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index c837e8e463db..5e6f36f6d876 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -26,13 +26,22 @@
26 26
27#include "multicalls.h" 27#include "multicalls.h"
28 28
29#define MC_DEBUG 1
30
29#define MC_BATCH 32 31#define MC_BATCH 32
30#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) 32#define MC_ARGS (MC_BATCH * 16 / sizeof(u64))
31 33
32struct mc_buffer { 34struct mc_buffer {
33 struct multicall_entry entries[MC_BATCH]; 35 struct multicall_entry entries[MC_BATCH];
36#if MC_DEBUG
37 struct multicall_entry debug[MC_BATCH];
38#endif
34 u64 args[MC_ARGS]; 39 u64 args[MC_ARGS];
35 unsigned mcidx, argidx; 40 struct callback {
41 void (*fn)(void *);
42 void *data;
43 } callbacks[MC_BATCH];
44 unsigned mcidx, argidx, cbidx;
36}; 45};
37 46
38static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); 47static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
@@ -43,6 +52,7 @@ void xen_mc_flush(void)
43 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 52 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
44 int ret = 0; 53 int ret = 0;
45 unsigned long flags; 54 unsigned long flags;
55 int i;
46 56
47 BUG_ON(preemptible()); 57 BUG_ON(preemptible());
48 58
@@ -51,13 +61,31 @@ void xen_mc_flush(void)
51 local_irq_save(flags); 61 local_irq_save(flags);
52 62
53 if (b->mcidx) { 63 if (b->mcidx) {
54 int i; 64#if MC_DEBUG
65 memcpy(b->debug, b->entries,
66 b->mcidx * sizeof(struct multicall_entry));
67#endif
55 68
56 if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) 69 if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
57 BUG(); 70 BUG();
58 for (i = 0; i < b->mcidx; i++) 71 for (i = 0; i < b->mcidx; i++)
59 if (b->entries[i].result < 0) 72 if (b->entries[i].result < 0)
60 ret++; 73 ret++;
74
75#if MC_DEBUG
76 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id());
79 for(i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx,
82 b->debug[i].op,
83 b->debug[i].args[0],
84 b->entries[i].result);
85 }
86 }
87#endif
88
61 b->mcidx = 0; 89 b->mcidx = 0;
62 b->argidx = 0; 90 b->argidx = 0;
63 } else 91 } else
@@ -65,6 +93,13 @@ void xen_mc_flush(void)
65 93
66 local_irq_restore(flags); 94 local_irq_restore(flags);
67 95
96 for(i = 0; i < b->cbidx; i++) {
97 struct callback *cb = &b->callbacks[i];
98
99 (*cb->fn)(cb->data);
100 }
101 b->cbidx = 0;
102
68 BUG_ON(ret); 103 BUG_ON(ret);
69} 104}
70 105
@@ -88,3 +123,16 @@ struct multicall_space __xen_mc_entry(size_t args)
88 123
89 return ret; 124 return ret;
90} 125}
126
127void xen_mc_callback(void (*fn)(void *), void *data)
128{
129 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
130 struct callback *cb;
131
132 if (b->cbidx == MC_BATCH)
133 xen_mc_flush();
134
135 cb = &b->callbacks[b->cbidx++];
136 cb->fn = fn;
137 cb->data = data;
138}
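Putting the new hook together with the existing batch helpers, a caller that needs work done only after its entries have reached the hypervisor would look roughly like this (do_log and demo_queue_work are invented; xen_mc_batch, __xen_mc_entry, xen_mc_callback and xen_mc_issue are the interfaces used in this patch):

static void do_log(void *data)
{
        printk(KERN_DEBUG "multicall batch carrying %p was flushed\n", data);
}

static void demo_queue_work(void *tag)
{
        xen_mc_batch();                  /* save flags, disable interrupts */

        /* ... reserve entries with __xen_mc_entry() and fill them in ... */

        xen_mc_callback(do_log, tag);    /* runs once the batch is flushed */

        xen_mc_issue(PARAVIRT_LAZY_MMU); /* flush now unless in lazy MMU mode */
}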
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index e6f7530b156c..8bae996d99a3 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -35,11 +35,14 @@ void xen_mc_flush(void);
35/* Issue a multicall if we're not in a lazy mode */ 35/* Issue a multicall if we're not in a lazy mode */
36static inline void xen_mc_issue(unsigned mode) 36static inline void xen_mc_issue(unsigned mode)
37{ 37{
38 if ((xen_get_lazy_mode() & mode) == 0) 38 if ((paravirt_get_lazy_mode() & mode) == 0)
39 xen_mc_flush(); 39 xen_mc_flush();
40 40
41 /* restore flags saved in xen_mc_batch */ 41 /* restore flags saved in xen_mc_batch */
42 local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); 42 local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
43} 43}
44 44
45/* Set up a callback to be called when the current batch is flushed */
46void xen_mc_callback(void (*fn)(void *), void *data);
47
45#endif /* _XEN_MULTICALLS_H */ 48#endif /* _XEN_MULTICALLS_H */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 6c058585459c..c1b131bcdcbe 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -371,7 +371,8 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
371 void *info, int wait) 371 void *info, int wait)
372{ 372{
373 struct call_data_struct data; 373 struct call_data_struct data;
374 int cpus; 374 int cpus, cpu;
375 bool yield;
375 376
376 /* Holding any lock stops cpus from going down. */ 377 /* Holding any lock stops cpus from going down. */
377 spin_lock(&call_lock); 378 spin_lock(&call_lock);
@@ -400,9 +401,14 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *),
400 /* Send a message to other CPUs and wait for them to respond */ 401 /* Send a message to other CPUs and wait for them to respond */
401 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); 402 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
402 403
403 /* Make sure other vcpus get a chance to run. 404 /* Make sure other vcpus get a chance to run if they need to. */
404 XXX too severe? Maybe we should check the other CPU's states? */ 405 yield = false;
405 HYPERVISOR_sched_op(SCHEDOP_yield, 0); 406 for_each_cpu_mask(cpu, mask)
407 if (xen_vcpu_stolen(cpu))
408 yield = true;
409
410 if (yield)
411 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
406 412
407 /* Wait for response */ 413 /* Wait for response */
408 while (atomic_read(&data.started) != cpus || 414 while (atomic_read(&data.started) != cpus ||
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index dfd6db69ead5..d083ff5ef088 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -105,6 +105,12 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
105 } while (get64(&state->state_entry_time) != state_time); 105 } while (get64(&state->state_entry_time) != state_time);
106} 106}
107 107
108/* return true when a vcpu could run but has no real cpu to run on */
109bool xen_vcpu_stolen(int vcpu)
110{
111 return per_cpu(runstate, vcpu).state == RUNSTATE_runnable;
112}
113
108static void setup_runstate_info(int cpu) 114static void setup_runstate_info(int cpu)
109{ 115{
110 struct vcpu_register_runstate_memory_area area; 116 struct vcpu_register_runstate_memory_area area;
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index b9aaea45f07f..b02a909bfd4c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -11,6 +11,7 @@ void xen_copy_trap_info(struct trap_info *traps);
11 11
12DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); 12DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
13DECLARE_PER_CPU(unsigned long, xen_cr3); 13DECLARE_PER_CPU(unsigned long, xen_cr3);
14DECLARE_PER_CPU(unsigned long, xen_current_cr3);
14 15
15extern struct start_info *xen_start_info; 16extern struct start_info *xen_start_info;
16extern struct shared_info *HYPERVISOR_shared_info; 17extern struct shared_info *HYPERVISOR_shared_info;
@@ -27,14 +28,9 @@ unsigned long xen_get_wallclock(void);
27int xen_set_wallclock(unsigned long time); 28int xen_set_wallclock(unsigned long time);
28unsigned long long xen_sched_clock(void); 29unsigned long long xen_sched_clock(void);
29 30
30void xen_mark_init_mm_pinned(void); 31bool xen_vcpu_stolen(int vcpu);
31
32DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode);
33 32
34static inline unsigned xen_get_lazy_mode(void) 33void xen_mark_init_mm_pinned(void);
35{
36 return x86_read_percpu(xen_lazy_mode);
37}
38 34
39void __init xen_fill_possible_map(void); 35void __init xen_fill_possible_map(void);
40 36