 arch/x86/include/asm/paravirt.h      | 435
 arch/x86/include/asm/processor.h     |   3
 arch/x86/kernel/cpu/common.c         |  25
 arch/x86/kernel/entry_64.S           |   2
 arch/x86/kernel/paravirt.c           |  54
 arch/x86/kernel/paravirt_patch_32.c  |  12
 arch/x86/kernel/paravirt_patch_64.c  |  15
 arch/x86/kernel/setup_percpu.c       |   2
 arch/x86/kernel/smpboot.c            |   2
 arch/x86/kernel/tlb_uv.c             |   2
 arch/x86/kernel/vmi_32.c             |   9
 arch/x86/kernel/vmlinux_64.lds.S     |   5
 arch/x86/kernel/vsmp_64.c            |  12
 arch/x86/lguest/boot.c               |  13
 arch/x86/mach-voyager/voyager_smp.c  |   4
 arch/x86/xen/Makefile                |   3
 arch/x86/xen/enlighten.c             | 763
 arch/x86/xen/irq.c                   |  14
 arch/x86/xen/mmu.c                   | 745
 arch/x86/xen/mmu.h                   |   3
 arch/x86/xen/smp.c                   |   7
 arch/x86/xen/xen-asm.S               | 140
 arch/x86/xen/xen-asm.h               |  12
 arch/x86/xen/xen-asm_32.S            | 111
 arch/x86/xen/xen-asm_64.S            | 134
 arch/x86/xen/xen-ops.h               |  10
 include/asm-generic/vmlinux.lds.h    |  29
 27 files changed, 1399 insertions(+), 1167 deletions(-)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 7e674ea80f0d..ff691736f5e9 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -12,21 +12,38 @@
 #define CLBR_EAX  (1 << 0)
 #define CLBR_ECX  (1 << 1)
 #define CLBR_EDX  (1 << 2)
+#define CLBR_EDI  (1 << 3)
 
-#ifdef CONFIG_X86_64
-#define CLBR_RSI  (1 << 3)
-#define CLBR_RDI  (1 << 4)
+#ifdef CONFIG_X86_32
+/* CLBR_ANY should match all regs platform has. For i386, that's just it */
+#define CLBR_ANY  ((1 << 4) - 1)
+
+#define CLBR_ARG_REGS	(CLBR_EAX | CLBR_EDX | CLBR_ECX)
+#define CLBR_RET_REG	(CLBR_EAX | CLBR_EDX)
+#define CLBR_SCRATCH	(0)
+#else
+#define CLBR_RAX  CLBR_EAX
+#define CLBR_RCX  CLBR_ECX
+#define CLBR_RDX  CLBR_EDX
+#define CLBR_RDI  CLBR_EDI
+#define CLBR_RSI  (1 << 4)
 #define CLBR_R8   (1 << 5)
 #define CLBR_R9   (1 << 6)
 #define CLBR_R10  (1 << 7)
 #define CLBR_R11  (1 << 8)
+
 #define CLBR_ANY  ((1 << 9) - 1)
+
+#define CLBR_ARG_REGS	(CLBR_RDI | CLBR_RSI | CLBR_RDX | \
+			 CLBR_RCX | CLBR_R8 | CLBR_R9)
+#define CLBR_RET_REG	(CLBR_RAX)
+#define CLBR_SCRATCH	(CLBR_R10 | CLBR_R11)
+
 #include <asm/desc_defs.h>
-#else
-/* CLBR_ANY should match all regs platform has. For i386, that's just it */
-#define CLBR_ANY  ((1 << 3) - 1)
 #endif /* X86_64 */
 
+#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG)
+
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 #include <linux/cpumask.h>
@@ -40,6 +57,14 @@ struct tss_struct;
 struct mm_struct;
 struct desc_struct;
 
+/*
+ * Wrapper type for pointers to code which uses the non-standard
+ * calling convention.  See PV_CALL_SAVE_REGS_THUNK below.
+ */
+struct paravirt_callee_save {
+	void *func;
+};
+
 /* general info */
 struct pv_info {
 	unsigned int kernel_rpl;
@@ -189,11 +214,15 @@ struct pv_irq_ops {
 	 * expected to use X86_EFLAGS_IF; all other bits
 	 * returned from save_fl are undefined, and may be ignored by
 	 * restore_fl.
+	 *
+	 * NOTE: These functions callers expect the callee to preserve
+	 * more registers than the standard C calling convention.
 	 */
-	unsigned long (*save_fl)(void);
-	void (*restore_fl)(unsigned long);
-	void (*irq_disable)(void);
-	void (*irq_enable)(void);
+	struct paravirt_callee_save save_fl;
+	struct paravirt_callee_save restore_fl;
+	struct paravirt_callee_save irq_disable;
+	struct paravirt_callee_save irq_enable;
+
 	void (*safe_halt)(void);
 	void (*halt)(void);
 
@@ -279,11 +308,11 @@ struct pv_mmu_ops {
 	void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
 					pte_t *ptep, pte_t pte);
 
-	pteval_t (*pte_val)(pte_t);
-	pte_t (*make_pte)(pteval_t pte);
+	struct paravirt_callee_save pte_val;
+	struct paravirt_callee_save make_pte;
 
-	pgdval_t (*pgd_val)(pgd_t);
-	pgd_t (*make_pgd)(pgdval_t pgd);
+	struct paravirt_callee_save pgd_val;
+	struct paravirt_callee_save make_pgd;
 
 #if PAGETABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
@@ -298,12 +327,12 @@ struct pv_mmu_ops {
 
 	void (*set_pud)(pud_t *pudp, pud_t pudval);
 
-	pmdval_t (*pmd_val)(pmd_t);
-	pmd_t (*make_pmd)(pmdval_t pmd);
+	struct paravirt_callee_save pmd_val;
+	struct paravirt_callee_save make_pmd;
 
 #if PAGETABLE_LEVELS == 4
-	pudval_t (*pud_val)(pud_t);
-	pud_t (*make_pud)(pudval_t pud);
+	struct paravirt_callee_save pud_val;
+	struct paravirt_callee_save make_pud;
 
 	void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
 #endif	/* PAGETABLE_LEVELS == 4 */
@@ -388,6 +417,8 @@ extern struct pv_lock_ops pv_lock_ops;
 	asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
 
 unsigned paravirt_patch_nop(void);
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
 unsigned paravirt_patch_ignore(unsigned len);
 unsigned paravirt_patch_call(void *insnbuf,
 			     const void *target, u16 tgt_clobbers,
@@ -479,25 +510,45 @@ int paravirt_disable_iospace(void);
  * makes sure the incoming and outgoing types are always correct.
  */
 #ifdef CONFIG_X86_32
-#define PVOP_VCALL_ARGS			unsigned long __eax, __edx, __ecx
+#define PVOP_VCALL_ARGS				\
+	unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx
 #define PVOP_CALL_ARGS			PVOP_VCALL_ARGS
+
+#define PVOP_CALL_ARG1(x)		"a" ((unsigned long)(x))
+#define PVOP_CALL_ARG2(x)		"d" ((unsigned long)(x))
+#define PVOP_CALL_ARG3(x)		"c" ((unsigned long)(x))
+
 #define PVOP_VCALL_CLOBBERS		"=a" (__eax), "=d" (__edx),	\
 					"=c" (__ecx)
 #define PVOP_CALL_CLOBBERS		PVOP_VCALL_CLOBBERS
+
+#define PVOP_VCALLEE_CLOBBERS		"=a" (__eax), "=d" (__edx)
+#define PVOP_CALLEE_CLOBBERS		PVOP_VCALLEE_CLOBBERS
+
 #define EXTRA_CLOBBERS
 #define VEXTRA_CLOBBERS
-#else
-#define PVOP_VCALL_ARGS		unsigned long __edi, __esi, __edx, __ecx
+#else  /* CONFIG_X86_64 */
+#define PVOP_VCALL_ARGS					\
+	unsigned long __edi = __edi, __esi = __esi,	\
+		__edx = __edx, __ecx = __ecx
 #define PVOP_CALL_ARGS		PVOP_VCALL_ARGS, __eax
+
+#define PVOP_CALL_ARG1(x)		"D" ((unsigned long)(x))
+#define PVOP_CALL_ARG2(x)		"S" ((unsigned long)(x))
+#define PVOP_CALL_ARG3(x)		"d" ((unsigned long)(x))
+#define PVOP_CALL_ARG4(x)		"c" ((unsigned long)(x))
+
 #define PVOP_VCALL_CLOBBERS	"=D" (__edi),				\
 				"=S" (__esi), "=d" (__edx),		\
 				"=c" (__ecx)
-
 #define PVOP_CALL_CLOBBERS	PVOP_VCALL_CLOBBERS, "=a" (__eax)
 
+#define PVOP_VCALLEE_CLOBBERS	"=a" (__eax)
+#define PVOP_CALLEE_CLOBBERS	PVOP_VCALLEE_CLOBBERS
+
 #define EXTRA_CLOBBERS	 , "r8", "r9", "r10", "r11"
 #define VEXTRA_CLOBBERS	 , "rax", "r8", "r9", "r10", "r11"
-#endif
+#endif	/* CONFIG_X86_32 */
 
 #ifdef CONFIG_PARAVIRT_DEBUG
 #define PVOP_TEST_NULL(op)	BUG_ON(op == NULL)
@@ -505,10 +556,11 @@ int paravirt_disable_iospace(void);
 #define PVOP_TEST_NULL(op)	((void)op)
 #endif
 
-#define __PVOP_CALL(rettype, op, pre, post, ...)			\
+#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr,		\
+		      pre, post, ...)					\
 	({								\
 		rettype __ret;						\
 		PVOP_CALL_ARGS;						\
 		PVOP_TEST_NULL(op);					\
 		/* This is 32-bit specific, but is okay in 64-bit */	\
 		/* since this condition will never hold */		\
@@ -516,70 +568,113 @@ int paravirt_disable_iospace(void);
 			asm volatile(pre				\
 				     paravirt_alt(PARAVIRT_CALL)	\
 				     post				\
-				     : PVOP_CALL_CLOBBERS		\
+				     : call_clbr			\
 				     : paravirt_type(op),		\
-				       paravirt_clobber(CLBR_ANY),	\
+				       paravirt_clobber(clbr),		\
 				       ##__VA_ARGS__			\
-				     : "memory", "cc" EXTRA_CLOBBERS);	\
+				     : "memory", "cc" extra_clbr);	\
 			__ret = (rettype)((((u64)__edx) << 32) | __eax); \
 		} else {						\
 			asm volatile(pre				\
 				     paravirt_alt(PARAVIRT_CALL)	\
 				     post				\
-				     : PVOP_CALL_CLOBBERS		\
+				     : call_clbr			\
 				     : paravirt_type(op),		\
-				       paravirt_clobber(CLBR_ANY),	\
+				       paravirt_clobber(clbr),		\
 				       ##__VA_ARGS__			\
-				     : "memory", "cc" EXTRA_CLOBBERS);	\
+				     : "memory", "cc" extra_clbr);	\
 			__ret = (rettype)__eax;				\
 		}							\
 		__ret;							\
 	})
-#define __PVOP_VCALL(op, pre, post, ...)				\
+
+#define __PVOP_CALL(rettype, op, pre, post, ...)			\
+	____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS,	\
+		      EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__)
+
+#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...)			\
+	____PVOP_CALL(rettype, op.func, CLBR_RET_REG,			\
+		      PVOP_CALLEE_CLOBBERS, ,				\
+		      pre, post, ##__VA_ARGS__)
+
+
+#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...)	\
 	({								\
 		PVOP_VCALL_ARGS;					\
 		PVOP_TEST_NULL(op);					\
 		asm volatile(pre					\
 			     paravirt_alt(PARAVIRT_CALL)		\
 			     post					\
-			     : PVOP_VCALL_CLOBBERS			\
+			     : call_clbr				\
 			     : paravirt_type(op),			\
-			       paravirt_clobber(CLBR_ANY),		\
+			       paravirt_clobber(clbr),			\
 			       ##__VA_ARGS__				\
-			     : "memory", "cc" VEXTRA_CLOBBERS);		\
+			     : "memory", "cc" extra_clbr);		\
 	})
 
+#define __PVOP_VCALL(op, pre, post, ...)				\
+	____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS,		\
+		       VEXTRA_CLOBBERS,					\
+		       pre, post, ##__VA_ARGS__)
+
+#define __PVOP_VCALLEESAVE(rettype, op, pre, post, ...)			\
+	____PVOP_CALL(rettype, op.func, CLBR_RET_REG,			\
+		      PVOP_VCALLEE_CLOBBERS, ,				\
+		      pre, post, ##__VA_ARGS__)
+
+
+
 #define PVOP_CALL0(rettype, op)						\
 	__PVOP_CALL(rettype, op, "", "")
 #define PVOP_VCALL0(op)							\
 	__PVOP_VCALL(op, "", "")
 
+#define PVOP_CALLEE0(rettype, op)					\
+	__PVOP_CALLEESAVE(rettype, op, "", "")
+#define PVOP_VCALLEE0(op)						\
+	__PVOP_VCALLEESAVE(op, "", "")
+
+
 #define PVOP_CALL1(rettype, op, arg1)					\
-	__PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)))
+	__PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
 #define PVOP_VCALL1(op, arg1)						\
-	__PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)))
+	__PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1))
+
+#define PVOP_CALLEE1(rettype, op, arg1)					\
+	__PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
+#define PVOP_VCALLEE1(op, arg1)						\
+	__PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1))
+
 
 #define PVOP_CALL2(rettype, op, arg1, arg2)				\
-	__PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)),	\
-		    "1" ((unsigned long)(arg2)))
+	__PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1),		\
+		    PVOP_CALL_ARG2(arg2))
 #define PVOP_VCALL2(op, arg1, arg2)					\
-	__PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)),		\
-		     "1" ((unsigned long)(arg2)))
+	__PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1),			\
+		     PVOP_CALL_ARG2(arg2))
+
+#define PVOP_CALLEE2(rettype, op, arg1, arg2)				\
+	__PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1),	\
+			  PVOP_CALL_ARG2(arg2))
+#define PVOP_VCALLEE2(op, arg1, arg2)					\
+	__PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1),		\
+			   PVOP_CALL_ARG2(arg2))
+
 
 #define PVOP_CALL3(rettype, op, arg1, arg2, arg3)			\
-	__PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)),	\
-		    "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
+	__PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1),		\
+		    PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
 #define PVOP_VCALL3(op, arg1, arg2, arg3)				\
-	__PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)),		\
-		     "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
+	__PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1),			\
+		     PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
 
 /* This is the only difference in x86_64. We can make it much simpler */
 #ifdef CONFIG_X86_32
 #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4)			\
 	__PVOP_CALL(rettype, op,					\
 		    "push %[_arg4];", "lea 4(%%esp),%%esp;",		\
-		    "0" ((u32)(arg1)), "1" ((u32)(arg2)),		\
-		    "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
+		    PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2),		\
+		    PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4)))
 #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4)				\
 	__PVOP_VCALL(op,						\
 		     "push %[_arg4];", "lea 4(%%esp),%%esp;",		\
@@ -587,13 +682,13 @@ int paravirt_disable_iospace(void);
 		     "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
 #else
 #define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4)			\
-	__PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)),	\
-		    "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)),	\
-		    "3"((unsigned long)(arg4)))
+	__PVOP_CALL(rettype, op, "", "",				\
+		    PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2),		\
+		    PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
 #define PVOP_VCALL4(op, arg1, arg2, arg3, arg4)				\
-	__PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)),		\
-		     "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)),	\
-		     "3"((unsigned long)(arg4)))
+	__PVOP_VCALL(op, "", "",					\
+		     PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2),	\
+		     PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
 #endif
 
 static inline int paravirt_enabled(void)
@@ -1060,13 +1155,13 @@ static inline pte_t __pte(pteval_t val)
 	pteval_t ret;
 
 	if (sizeof(pteval_t) > sizeof(long))
-		ret = PVOP_CALL2(pteval_t,
-				 pv_mmu_ops.make_pte,
-				 val, (u64)val >> 32);
+		ret = PVOP_CALLEE2(pteval_t,
+				   pv_mmu_ops.make_pte,
+				   val, (u64)val >> 32);
 	else
-		ret = PVOP_CALL1(pteval_t,
-				 pv_mmu_ops.make_pte,
-				 val);
+		ret = PVOP_CALLEE1(pteval_t,
+				   pv_mmu_ops.make_pte,
+				   val);
 
 	return (pte_t) { .pte = ret };
 }
@@ -1076,11 +1171,11 @@ static inline pteval_t pte_val(pte_t pte)
 	pteval_t ret;
 
 	if (sizeof(pteval_t) > sizeof(long))
-		ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val,
-				 pte.pte, (u64)pte.pte >> 32);
+		ret = PVOP_CALLEE2(pteval_t, pv_mmu_ops.pte_val,
+				   pte.pte, (u64)pte.pte >> 32);
 	else
-		ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val,
-				 pte.pte);
+		ret = PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val,
+				   pte.pte);
 
 	return ret;
 }
@@ -1090,11 +1185,11 @@ static inline pgd_t __pgd(pgdval_t val)
 	pgdval_t ret;
 
 	if (sizeof(pgdval_t) > sizeof(long))
-		ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd,
-				 val, (u64)val >> 32);
+		ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.make_pgd,
+				   val, (u64)val >> 32);
 	else
-		ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd,
-				 val);
+		ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.make_pgd,
+				   val);
 
 	return (pgd_t) { ret };
 }
@@ -1104,11 +1199,11 @@ static inline pgdval_t pgd_val(pgd_t pgd)
 	pgdval_t ret;
 
 	if (sizeof(pgdval_t) > sizeof(long))
-		ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val,
-				 pgd.pgd, (u64)pgd.pgd >> 32);
+		ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.pgd_val,
+				   pgd.pgd, (u64)pgd.pgd >> 32);
 	else
-		ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val,
-				 pgd.pgd);
+		ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.pgd_val,
+				   pgd.pgd);
 
 	return ret;
 }
@@ -1172,11 +1267,11 @@ static inline pmd_t __pmd(pmdval_t val)
 	pmdval_t ret;
 
 	if (sizeof(pmdval_t) > sizeof(long))
-		ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd,
-				 val, (u64)val >> 32);
+		ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.make_pmd,
+				   val, (u64)val >> 32);
 	else
-		ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd,
-				 val);
+		ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.make_pmd,
+				   val);
 
 	return (pmd_t) { ret };
 }
@@ -1186,11 +1281,11 @@ static inline pmdval_t pmd_val(pmd_t pmd)
 	pmdval_t ret;
 
 	if (sizeof(pmdval_t) > sizeof(long))
-		ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val,
-				 pmd.pmd, (u64)pmd.pmd >> 32);
+		ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.pmd_val,
+				   pmd.pmd, (u64)pmd.pmd >> 32);
 	else
-		ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val,
-				 pmd.pmd);
+		ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.pmd_val,
+				   pmd.pmd);
 
 	return ret;
 }
@@ -1212,11 +1307,11 @@ static inline pud_t __pud(pudval_t val)
 	pudval_t ret;
 
 	if (sizeof(pudval_t) > sizeof(long))
-		ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud,
-				 val, (u64)val >> 32);
+		ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.make_pud,
+				   val, (u64)val >> 32);
 	else
-		ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud,
-				 val);
+		ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.make_pud,
+				   val);
 
 	return (pud_t) { ret };
 }
@@ -1226,11 +1321,11 @@ static inline pudval_t pud_val(pud_t pud)
 	pudval_t ret;
 
 	if (sizeof(pudval_t) > sizeof(long))
-		ret = PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val,
-				 pud.pud, (u64)pud.pud >> 32);
+		ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.pud_val,
+				   pud.pud, (u64)pud.pud >> 32);
 	else
-		ret = PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val,
-				 pud.pud);
+		ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.pud_val,
+				   pud.pud);
 
 	return ret;
 }
@@ -1371,6 +1466,9 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
 }
 
 void _paravirt_nop(void);
+u32 _paravirt_ident_32(u32);
+u64 _paravirt_ident_64(u64);
+
 #define paravirt_nop	((void *)_paravirt_nop)
 
 #ifdef CONFIG_SMP
@@ -1420,12 +1518,37 @@ extern struct paravirt_patch_site __parainstructions[],
 	__parainstructions_end[];
 
 #ifdef CONFIG_X86_32
-#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;"
-#define PV_RESTORE_REGS "popl %%edx; popl %%ecx"
+#define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
+#define PV_RESTORE_REGS "popl %edx; popl %ecx;"
+
+/* save and restore all caller-save registers, except return value */
+#define PV_SAVE_ALL_CALLER_REGS		"pushl %ecx;"
+#define PV_RESTORE_ALL_CALLER_REGS	"popl %ecx;"
+
 #define PV_FLAGS_ARG "0"
 #define PV_EXTRA_CLOBBERS
 #define PV_VEXTRA_CLOBBERS
 #else
+/* save and restore all caller-save registers, except return value */
+#define PV_SAVE_ALL_CALLER_REGS						\
+	"push %rcx;"							\
+	"push %rdx;"							\
+	"push %rsi;"							\
+	"push %rdi;"							\
+	"push %r8;"							\
+	"push %r9;"							\
+	"push %r10;"							\
+	"push %r11;"
+#define PV_RESTORE_ALL_CALLER_REGS					\
+	"pop %r11;"							\
+	"pop %r10;"							\
+	"pop %r9;"							\
+	"pop %r8;"							\
+	"pop %rdi;"							\
+	"pop %rsi;"							\
+	"pop %rdx;"							\
+	"pop %rcx;"
+
 /* We save some registers, but all of them, that's too much. We clobber all
  * caller saved registers but the argument parameter */
 #define PV_SAVE_REGS "pushq %%rdi;"
@@ -1435,52 +1558,76 @@ extern struct paravirt_patch_site __parainstructions[],
 #define PV_FLAGS_ARG "D"
 #endif
 
+/*
+ * Generate a thunk around a function which saves all caller-save
+ * registers except for the return value. This allows C functions to
+ * be called from assembler code where fewer than normal registers are
+ * available. It may also help code generation around calls from C
+ * code if the common case doesn't use many registers.
+ *
+ * When a callee is wrapped in a thunk, the caller can assume that all
+ * arg regs and all scratch registers are preserved across the
+ * call. The return value in rax/eax will not be saved, even for void
+ * functions.
+ */
+#define PV_CALLEE_SAVE_REGS_THUNK(func)					\
+	extern typeof(func) __raw_callee_save_##func;			\
+	static void *__##func##__ __used = func;			\
+									\
+	asm(".pushsection .text;"					\
+	    "__raw_callee_save_" #func ": "				\
+	    PV_SAVE_ALL_CALLER_REGS					\
+	    "call " #func ";"						\
+	    PV_RESTORE_ALL_CALLER_REGS					\
+	    "ret;"							\
+	    ".popsection")
+
+/* Get a reference to a callee-save function */
+#define PV_CALLEE_SAVE(func)						\
+	((struct paravirt_callee_save) { __raw_callee_save_##func })
+
+/* Promise that "func" already uses the right calling convention */
+#define __PV_IS_CALLEE_SAVE(func)					\
+	((struct paravirt_callee_save) { func })
+
 static inline unsigned long __raw_local_save_flags(void)
 {
 	unsigned long f;
 
-	asm volatile(paravirt_alt(PV_SAVE_REGS
-				  PARAVIRT_CALL
-				  PV_RESTORE_REGS)
+	asm volatile(paravirt_alt(PARAVIRT_CALL)
		     : "=a"(f)
		     : paravirt_type(pv_irq_ops.save_fl),
		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "cc" PV_VEXTRA_CLOBBERS);
+		     : "memory", "cc");
 	return f;
 }
 
 static inline void raw_local_irq_restore(unsigned long f)
 {
-	asm volatile(paravirt_alt(PV_SAVE_REGS
-				  PARAVIRT_CALL
-				  PV_RESTORE_REGS)
+	asm volatile(paravirt_alt(PARAVIRT_CALL)
		     : "=a"(f)
		     : PV_FLAGS_ARG(f),
		       paravirt_type(pv_irq_ops.restore_fl),
		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "cc" PV_EXTRA_CLOBBERS);
+		     : "memory", "cc");
 }
 
 static inline void raw_local_irq_disable(void)
 {
-	asm volatile(paravirt_alt(PV_SAVE_REGS
-				  PARAVIRT_CALL
-				  PV_RESTORE_REGS)
+	asm volatile(paravirt_alt(PARAVIRT_CALL)
		     :
		     : paravirt_type(pv_irq_ops.irq_disable),
		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
+		     : "memory", "eax", "cc");
 }
 
 static inline void raw_local_irq_enable(void)
 {
-	asm volatile(paravirt_alt(PV_SAVE_REGS
-				  PARAVIRT_CALL
-				  PV_RESTORE_REGS)
+	asm volatile(paravirt_alt(PARAVIRT_CALL)
		     :
		     : paravirt_type(pv_irq_ops.irq_enable),
		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
+		     : "memory", "eax", "cc");
 }
 
 static inline unsigned long __raw_local_irq_save(void)
@@ -1523,33 +1670,49 @@ static inline unsigned long __raw_local_irq_save(void)
 	.popsection
 
 
+#define COND_PUSH(set, mask, reg)			\
+	.if ((~(set)) & mask); push %reg; .endif
+#define COND_POP(set, mask, reg)			\
+	.if ((~(set)) & mask); pop %reg; .endif
+
 #ifdef CONFIG_X86_64
-#define PV_SAVE_REGS				\
-	push %rax;				\
-	push %rcx;				\
-	push %rdx;				\
-	push %rsi;				\
-	push %rdi;				\
-	push %r8;				\
-	push %r9;				\
-	push %r10;				\
-	push %r11
-#define PV_RESTORE_REGS				\
-	pop %r11;				\
-	pop %r10;				\
-	pop %r9;				\
-	pop %r8;				\
-	pop %rdi;				\
-	pop %rsi;				\
-	pop %rdx;				\
-	pop %rcx;				\
-	pop %rax
+
+#define PV_SAVE_REGS(set)			\
+	COND_PUSH(set, CLBR_RAX, rax);		\
+	COND_PUSH(set, CLBR_RCX, rcx);		\
+	COND_PUSH(set, CLBR_RDX, rdx);		\
+	COND_PUSH(set, CLBR_RSI, rsi);		\
+	COND_PUSH(set, CLBR_RDI, rdi);		\
+	COND_PUSH(set, CLBR_R8, r8);		\
+	COND_PUSH(set, CLBR_R9, r9);		\
+	COND_PUSH(set, CLBR_R10, r10);		\
+	COND_PUSH(set, CLBR_R11, r11)
+#define PV_RESTORE_REGS(set)			\
+	COND_POP(set, CLBR_R11, r11);		\
+	COND_POP(set, CLBR_R10, r10);		\
+	COND_POP(set, CLBR_R9, r9);		\
+	COND_POP(set, CLBR_R8, r8);		\
+	COND_POP(set, CLBR_RDI, rdi);		\
+	COND_POP(set, CLBR_RSI, rsi);		\
+	COND_POP(set, CLBR_RDX, rdx);		\
+	COND_POP(set, CLBR_RCX, rcx);		\
+	COND_POP(set, CLBR_RAX, rax)
+
 #define PARA_PATCH(struct, off)        ((PARAVIRT_PATCH_##struct + (off)) / 8)
 #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
 #define PARA_INDIRECT(addr)	*addr(%rip)
 #else
-#define PV_SAVE_REGS   pushl %eax; pushl %edi; pushl %ecx; pushl %edx
-#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax
+#define PV_SAVE_REGS(set)			\
+	COND_PUSH(set, CLBR_EAX, eax);		\
+	COND_PUSH(set, CLBR_EDI, edi);		\
+	COND_PUSH(set, CLBR_ECX, ecx);		\
+	COND_PUSH(set, CLBR_EDX, edx)
+#define PV_RESTORE_REGS(set)			\
+	COND_POP(set, CLBR_EDX, edx);		\
+	COND_POP(set, CLBR_ECX, ecx);		\
+	COND_POP(set, CLBR_EDI, edi);		\
+	COND_POP(set, CLBR_EAX, eax)
+
 #define PARA_PATCH(struct, off)        ((PARAVIRT_PATCH_##struct + (off)) / 4)
 #define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
 #define PARA_INDIRECT(addr)	*%cs:addr
@@ -1561,15 +1724,15 @@ static inline unsigned long __raw_local_irq_save(void)
 
 #define DISABLE_INTERRUPTS(clobbers)					\
 	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
-		  PV_SAVE_REGS;						\
+		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);		\
 		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable);	\
-		  PV_RESTORE_REGS;)					\
+		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 
 #define ENABLE_INTERRUPTS(clobbers)					\
 	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers,	\
-		  PV_SAVE_REGS;						\
+		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);		\
 		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable);	\
-		  PV_RESTORE_REGS;)
+		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 
 #define USERGS_SYSRET32							\
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32),	\
@@ -1599,11 +1762,15 @@ static inline unsigned long __raw_local_irq_save(void)
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE,	\
 		  swapgs)
 
+/*
+ * Note: swapgs is very special, and in practise is either going to be
+ * implemented with a single "swapgs" instruction or something very
+ * special.  Either way, we don't need to save any registers for
+ * it.
+ */
 #define SWAPGS								\
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE,	\
-		  PV_SAVE_REGS;						\
-		  call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs);		\
-		  PV_RESTORE_REGS					\
+		  call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs)		\
 		 )
 
 #define GET_CR2_INTO_RCX				\
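
The core of this header change is the callee-save machinery: pv_irq_ops and the pagetable value accessors now sit behind thunks that preserve every caller-save register except the return register, so a patched call site only has to treat the return register as clobbered. Below is a minimal user-space sketch of the same idea, assuming the x86-64 SysV ABI and GCC extended asm; my_save_fl and __raw_callee_save_my_save_fl are illustrative names, not kernel symbols.

    #include <stdio.h>

    struct callee_save { void *func; };     /* mirrors struct paravirt_callee_save */

    /* Stand-in for an interrupt-flag read; global so the thunk can call it. */
    unsigned long my_save_fl(void)
    {
        return 0x200;                       /* pretend X86_EFLAGS_IF is set */
    }

    /* The thunk: push all caller-save registers except %rax, call the C
     * function, pop them back.  Callers may then assume only %rax changes. */
    asm(".pushsection .text\n"
        ".globl __raw_callee_save_my_save_fl\n"
        "__raw_callee_save_my_save_fl:\n"
        "   push %rcx; push %rdx; push %rsi; push %rdi\n"
        "   push %r8;  push %r9;  push %r10; push %r11\n"
        "   call my_save_fl\n"
        "   pop %r11; pop %r10; pop %r9;  pop %r8\n"
        "   pop %rdi; pop %rsi; pop %rdx; pop %rcx\n"
        "   ret\n"
        ".popsection");
    extern unsigned long __raw_callee_save_my_save_fl(void);

    int main(void)
    {
        struct callee_save save_fl = { (void *)__raw_callee_save_my_save_fl };
        unsigned long flags = ((unsigned long (*)(void))save_fl.func)();

        printf("flags = %#lx\n", flags);
        return 0;
    }

The kernel macro additionally keeps a __used static reference to the wrapped function so the compiler cannot discard it; the sketch skips that detail.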
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 84afa0d4d717..d211f9551998 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -768,7 +768,8 @@ extern int sysenter_setup(void);
 extern struct desc_ptr early_gdt_descr;
 
 extern void cpu_set_gdt(int);
-extern void switch_to_new_gdt(void);
+extern void switch_to_new_gdt(int);
+extern void load_percpu_segment(int);
 extern void cpu_init(void);
 
 static inline unsigned long get_debugctlmsr(void)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c4bdc7f00207..cbcdb796d47f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -296,23 +296,28 @@ static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
 
 __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
 
+void load_percpu_segment(int cpu)
+{
+#ifdef CONFIG_X86_32
+	loadsegment(fs, __KERNEL_PERCPU);
+#else
+	loadsegment(gs, 0);
+	wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
+#endif
+}
+
 /* Current gdt points %fs at the "master" per-cpu area: after this,
  * it's on the real one. */
-void switch_to_new_gdt(void)
+void switch_to_new_gdt(int cpu)
 {
 	struct desc_ptr gdt_descr;
-	int cpu = smp_processor_id();
 
 	gdt_descr.address = (long)get_cpu_gdt_table(cpu);
 	gdt_descr.size = GDT_SIZE - 1;
 	load_gdt(&gdt_descr);
 	/* Reload the per-cpu base */
-#ifdef CONFIG_X86_32
-	loadsegment(fs, __KERNEL_PERCPU);
-#else
-	loadsegment(gs, 0);
-	wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
-#endif
+
+	load_percpu_segment(cpu);
 }
 
 static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
@@ -1029,7 +1034,7 @@ void __cpuinit cpu_init(void)
 	 * and set up the GDT descriptor:
 	 */
 
-	switch_to_new_gdt();
+	switch_to_new_gdt(cpu);
 	loadsegment(fs, 0);
 
 	load_idt((const struct desc_ptr *)&idt_descr);
@@ -1131,7 +1136,7 @@ void __cpuinit cpu_init(void)
 	clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
 
 	load_idt(&idt_descr);
-	switch_to_new_gdt();
+	switch_to_new_gdt(cpu);
 
 	/*
 	 * Set up and load the per-CPU TSS and LDT
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1f7d697b5c00..fbcf96b295ff 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1143,7 +1143,7 @@ ENTRY(native_load_gs_index)
 	CFI_STARTPROC
 	pushf
 	CFI_ADJUST_CFA_OFFSET 8
-	DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
+	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
 	SWAPGS
gs_change:
 	movl %edi,%gs
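
The one-line entry_64.S change is a standalone bitmask fix: CLBR_ANY | ~(CLBR_RDI) evaluates to all ones, so %rdi was never actually removed from the clobber set, while CLBR_ANY & ~CLBR_RDI clears exactly the RDI bit. A tiny standalone check, using the bit values from the patched paravirt.h (where CLBR_RDI aliases CLBR_EDI, bit 3):

    #include <assert.h>

    #define CLBR_RDI    (1 << 3)
    #define CLBR_ANY    ((1 << 9) - 1)

    int main(void)
    {
        /* Old expression: OR-ing with a complement sets nearly every bit,
         * so the RDI bit stayed in the mask. */
        assert(((CLBR_ANY | ~(CLBR_RDI)) & CLBR_RDI) != 0);

        /* Fixed expression: AND with the complement clears only RDI. */
        assert(((CLBR_ANY & ~CLBR_RDI) & CLBR_RDI) == 0);
        return 0;
    }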
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 202514be5923..cea11c8e3049 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -44,6 +44,17 @@ void _paravirt_nop(void)
 {
 }
 
+/* identity function, which can be inlined */
+u32 _paravirt_ident_32(u32 x)
+{
+	return x;
+}
+
+u64 _paravirt_ident_64(u64 x)
+{
+	return x;
+}
+
 static void __init default_banner(void)
 {
 	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
@@ -138,9 +149,16 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
 	if (opfunc == NULL)
 		/* If there's no function, patch it with a ud2a (BUG) */
 		ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a));
-	else if (opfunc == paravirt_nop)
+	else if (opfunc == _paravirt_nop)
 		/* If the operation is a nop, then nop the callsite */
 		ret = paravirt_patch_nop();
+
+	/* identity functions just return their single argument */
+	else if (opfunc == _paravirt_ident_32)
+		ret = paravirt_patch_ident_32(insnbuf, len);
+	else if (opfunc == _paravirt_ident_64)
+		ret = paravirt_patch_ident_64(insnbuf, len);
+
 	else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
 		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
 		 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
@@ -292,10 +310,10 @@ struct pv_time_ops pv_time_ops = {
 
 struct pv_irq_ops pv_irq_ops = {
 	.init_IRQ = native_init_IRQ,
-	.save_fl = native_save_fl,
-	.restore_fl = native_restore_fl,
-	.irq_disable = native_irq_disable,
-	.irq_enable = native_irq_enable,
+	.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
+	.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
+	.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
+	.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
 	.safe_halt = native_safe_halt,
 	.halt = native_halt,
 #ifdef CONFIG_X86_64
@@ -373,6 +391,14 @@ struct pv_apic_ops pv_apic_ops = {
 #endif
 };
 
+#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE)
+/* 32-bit pagetable entries */
+#define PTE_IDENT	__PV_IS_CALLEE_SAVE(_paravirt_ident_32)
+#else
+/* 64-bit pagetable entries */
+#define PTE_IDENT	__PV_IS_CALLEE_SAVE(_paravirt_ident_64)
+#endif
+
 struct pv_mmu_ops pv_mmu_ops = {
 #ifndef CONFIG_X86_64
 	.pagetable_setup_start = native_pagetable_setup_start,
@@ -424,21 +450,23 @@ struct pv_mmu_ops pv_mmu_ops = {
 	.pmd_clear = native_pmd_clear,
 #endif
 	.set_pud = native_set_pud,
-	.pmd_val = native_pmd_val,
-	.make_pmd = native_make_pmd,
+
+	.pmd_val = PTE_IDENT,
+	.make_pmd = PTE_IDENT,
 
 #if PAGETABLE_LEVELS == 4
-	.pud_val = native_pud_val,
-	.make_pud = native_make_pud,
+	.pud_val = PTE_IDENT,
+	.make_pud = PTE_IDENT,
+
 	.set_pgd = native_set_pgd,
 #endif
 #endif /* PAGETABLE_LEVELS >= 3 */
 
-	.pte_val = native_pte_val,
-	.pgd_val = native_pgd_val,
+	.pte_val = PTE_IDENT,
+	.pgd_val = PTE_IDENT,
 
-	.make_pte = native_make_pte,
-	.make_pgd = native_make_pgd,
+	.make_pte = PTE_IDENT,
+	.make_pgd = PTE_IDENT,
 
 	.dup_mmap = paravirt_nop,
 	.exit_mmap = paravirt_nop,
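
Pointing the native pte/pgd/pmd/pud ops at _paravirt_ident_32/_paravirt_ident_64 is what lets paravirt_patch_default() above recognize them by address and shrink the call site. The following is a stand-alone sketch of just that dispatch decision, not kernel code; patch_kind and the local identity stubs are made-up names:

    #include <stdio.h>

    typedef unsigned int u32;
    typedef unsigned long long u64;

    static u32 ident_32(u32 x) { return x; }
    static u64 ident_64(u64 x) { return x; }

    /* How would a call site through this op pointer be patched? */
    static const char *patch_kind(void *opfunc)
    {
        if (opfunc == NULL)
            return "ud2a (BUG)";
        if (opfunc == (void *)ident_32)
            /* on 32-bit the arg and result already share %eax (and
             * %edx:%eax for 64-bit values), so the call site can be
             * nopped out entirely */
            return "identity: nop the call site";
        if (opfunc == (void *)ident_64)
            return "identity: mov %rdi, %rax";
        return "leave a (possibly direct) call";
    }

    int main(void)
    {
        printf("%s\n", patch_kind((void *)ident_64));
        return 0;
    }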
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 9fe644f4861d..d9f32e6d6ab6 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,18 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
 
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
+{
+	/* arg in %eax, return in %eax */
+	return 0;
+}
+
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
+{
+	/* arg in %edx:%eax, return in %edx:%eax */
+	return 0;
+}
+
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		      unsigned long addr, unsigned len)
 {
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 061d01df9ae6..3f08f34f93eb 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -19,6 +19,21 @@ DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
 DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl");
 DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
 
+DEF_NATIVE(, mov32, "mov %edi, %eax");
+DEF_NATIVE(, mov64, "mov %rdi, %rax");
+
+unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
+{
+	return paravirt_patch_insns(insnbuf, len,
+				    start__mov32, end__mov32);
+}
+
+unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
+{
+	return paravirt_patch_insns(insnbuf, len,
+				    start__mov64, end__mov64);
+}
+
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		      unsigned long addr, unsigned len)
 {
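
On 64-bit the identity op really is a single instruction: the SysV argument register moved into the return register. The DEF_NATIVE templates above (mov %edi, %eax and mov %rdi, %rax) are the bytes that get copied over the call site. A quick user-space sanity check of those exact instruction sequences (stub names invented for the test):

    #include <assert.h>

    /* The two patch templates, emitted as directly callable stubs
     * (SysV AMD64 ABI: argument in %rdi/%edi, result in %rax/%eax). */
    asm(".pushsection .text\n"
        ".globl ident32_stub; ident32_stub: mov %edi, %eax; ret\n"
        ".globl ident64_stub; ident64_stub: mov %rdi, %rax; ret\n"
        ".popsection");
    extern unsigned int ident32_stub(unsigned int);
    extern unsigned long ident64_stub(unsigned long);

    int main(void)
    {
        assert(ident32_stub(0x12345678u) == 0x12345678u);
        assert(ident64_stub(0xdeadbeefcafef00dUL) == 0xdeadbeefcafef00dUL);
        return 0;
    }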
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 0d1e7ac439f4..ef91747bbed5 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -122,7 +122,7 @@ void __init setup_per_cpu_areas(void)
 	 * area.  Reload any changed state for the boot CPU.
 	 */
 	if (cpu == boot_cpu_id)
-		switch_to_new_gdt();
+		switch_to_new_gdt(cpu);
 
 	DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
 }
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 96f7d304f5c9..af57f88186e7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1188,7 +1188,7 @@ out:
 void __init native_smp_prepare_boot_cpu(void)
 {
 	int me = smp_processor_id();
-	switch_to_new_gdt();
+	switch_to_new_gdt(me);
 	/* already set me in cpu_online_mask in boot_cpu_init() */
 	cpumask_set_cpu(me, cpu_callout_mask);
 	per_cpu(cpu_state, me) = CPU_ONLINE;
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 755ede02b13f..f396e61bcb34 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -259,7 +259,7 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
 	 * the cpu's, all of which are still in the mask.
 	 */
 	__get_cpu_var(ptcstats).ptc_i++;
-	return 0;
+	return flush_mask;
 }
 
 /*
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 1d3302cc2ddf..eb9e7347928e 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -670,10 +670,11 @@ static inline int __init activate_vmi(void)
 	para_fill(pv_mmu_ops.write_cr2, SetCR2);
 	para_fill(pv_mmu_ops.write_cr3, SetCR3);
 	para_fill(pv_cpu_ops.write_cr4, SetCR4);
-	para_fill(pv_irq_ops.save_fl, GetInterruptMask);
-	para_fill(pv_irq_ops.restore_fl, SetInterruptMask);
-	para_fill(pv_irq_ops.irq_disable, DisableInterrupts);
-	para_fill(pv_irq_ops.irq_enable, EnableInterrupts);
+
+	para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
+	para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
+	para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
+	para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
 
 	para_fill(pv_cpu_ops.wbinvd, WBINVD);
 	para_fill(pv_cpu_ops.read_tsc, RDTSC);
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index c9740996430a..07f62d287ff0 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -22,6 +22,7 @@ PHDRS {
 #ifdef CONFIG_SMP
 	percpu PT_LOAD FLAGS(7);	/* RWE */
 #endif
+	data.init2 PT_LOAD FLAGS(7);	/* RWE */
 	note PT_NOTE FLAGS(0);	/* ___ */
 }
 SECTIONS
@@ -215,7 +216,7 @@ SECTIONS
 	/*
 	 * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
 	 * output PHDR, so the next output section - __data_nosave - should
-	 * switch it back to data.init.  Also, pda should be at the head of
+	 * start another section data.init2.  Also, pda should be at the head of
 	 * percpu area.  Preallocate it and define the percpu offset symbol
 	 * so that it can be accessed as a percpu variable.
 	 */
@@ -232,7 +233,7 @@ SECTIONS
 	__nosave_begin = .;
 	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
 		*(.data.nosave)
-	} :data.init	/* switch back to data.init, see PERCPU_VADDR() above */
+	} :data.init2	/* use another section data.init2, see PERCPU_VADDR() above */
 	. = ALIGN(PAGE_SIZE);
 	__nosave_end = .;
 
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a688f3bfaec2..c609205df594 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -37,6 +37,7 @@ static unsigned long vsmp_save_fl(void)
 		flags &= ~X86_EFLAGS_IF;
 	return flags;
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
 
 static void vsmp_restore_fl(unsigned long flags)
 {
@@ -46,6 +47,7 @@ static void vsmp_restore_fl(unsigned long flags)
 		flags |= X86_EFLAGS_AC;
 	native_restore_fl(flags);
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
 
 static void vsmp_irq_disable(void)
 {
@@ -53,6 +55,7 @@ static void vsmp_irq_disable(void)
 
 	native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
 
 static void vsmp_irq_enable(void)
 {
@@ -60,6 +63,7 @@ static void vsmp_irq_enable(void)
 
 	native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
 
 static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
 					    unsigned long addr, unsigned len)
@@ -90,10 +94,10 @@ static void __init set_vsmp_pv_ops(void)
 			cap, ctl);
 	if (cap & ctl & (1 << 4)) {
 		/* Setup irq ops and turn on vSMP  IRQ fastpath handling */
-		pv_irq_ops.irq_disable = vsmp_irq_disable;
-		pv_irq_ops.irq_enable  = vsmp_irq_enable;
-		pv_irq_ops.save_fl  = vsmp_save_fl;
-		pv_irq_ops.restore_fl  = vsmp_restore_fl;
+		pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
+		pv_irq_ops.irq_enable  = PV_CALLEE_SAVE(vsmp_irq_enable);
+		pv_irq_ops.save_fl  = PV_CALLEE_SAVE(vsmp_save_fl);
+		pv_irq_ops.restore_fl  = PV_CALLEE_SAVE(vsmp_restore_fl);
 		pv_init_ops.patch = vsmp_patch;
 
 		ctl &= ~(1 << 4);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 92f1c6f3e19d..19e33b6cd593 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -173,24 +173,29 @@ static unsigned long save_fl(void)
 {
 	return lguest_data.irq_enabled;
 }
+PV_CALLEE_SAVE_REGS_THUNK(save_fl);
 
 /* restore_flags() just sets the flags back to the value given. */
 static void restore_fl(unsigned long flags)
 {
 	lguest_data.irq_enabled = flags;
 }
+PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
 
 /* Interrupts go off... */
 static void irq_disable(void)
 {
 	lguest_data.irq_enabled = 0;
 }
+PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
 
 /* Interrupts go on... */
 static void irq_enable(void)
 {
 	lguest_data.irq_enabled = X86_EFLAGS_IF;
 }
+PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+
 /*:*/
 /*M:003 Note that we don't check for outstanding interrupts when we re-enable
  * them (or when we unmask an interrupt). This seems to work for the moment,
@@ -984,10 +989,10 @@ __init void lguest_init(void)
 
 	/* interrupt-related operations */
 	pv_irq_ops.init_IRQ = lguest_init_IRQ;
-	pv_irq_ops.save_fl = save_fl;
-	pv_irq_ops.restore_fl = restore_fl;
-	pv_irq_ops.irq_disable = irq_disable;
-	pv_irq_ops.irq_enable = irq_enable;
+	pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
+	pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+	pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
+	pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
 	pv_irq_ops.safe_halt = lguest_safe_halt;
 
 	/* init-time operations */
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 328cb0ce62f0..6f5a38c7f900 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -1744,13 +1744,13 @@ static void __init voyager_smp_prepare_cpus(unsigned int max_cpus)
 
 static void __cpuinit voyager_smp_prepare_boot_cpu(void)
 {
-	switch_to_new_gdt();
+	int cpu = smp_processor_id();
+	switch_to_new_gdt(cpu);
 
 	cpu_online_map = cpumask_of_cpu(smp_processor_id());
 	cpu_callout_map = cpumask_of_cpu(smp_processor_id());
 	cpu_callin_map = CPU_MASK_NONE;
 	cpu_present_map = cpumask_of_cpu(smp_processor_id());
-
 }
 
 static int __cpuinit voyager_cpu_up(unsigned int cpu)
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 6dcefba7836f..3b767d03fd6a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -6,7 +6,8 @@ CFLAGS_REMOVE_irq.o = -pg
 endif
 
 obj-y		:= enlighten.o setup.o multicalls.o mmu.o irq.o \
-			time.o xen-asm_$(BITS).o grant-table.o suspend.o
+			time.o xen-asm.o xen-asm_$(BITS).o \
+			grant-table.o suspend.o
 
 obj-$(CONFIG_SMP)		+= smp.o spinlock.o
 obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
\ No newline at end of file
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 6b3f7eef57e3..37230342c2c4 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -61,40 +61,13 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
61enum xen_domain_type xen_domain_type = XEN_NATIVE; 61enum xen_domain_type xen_domain_type = XEN_NATIVE;
62EXPORT_SYMBOL_GPL(xen_domain_type); 62EXPORT_SYMBOL_GPL(xen_domain_type);
63 63
64/*
65 * Identity map, in addition to plain kernel map. This needs to be
66 * large enough to allocate page table pages to allocate the rest.
67 * Each page can map 2MB.
68 */
69static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
70
71#ifdef CONFIG_X86_64
72/* l3 pud for userspace vsyscall mapping */
73static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
74#endif /* CONFIG_X86_64 */
75
76/*
77 * Note about cr3 (pagetable base) values:
78 *
79 * xen_cr3 contains the current logical cr3 value; it contains the
80 * last set cr3. This may not be the current effective cr3, because
81 * its update may be being lazily deferred. However, a vcpu looking
82 * at its own cr3 can use this value knowing that it everything will
83 * be self-consistent.
84 *
85 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
86 * hypercall to set the vcpu cr3 is complete (so it may be a little
87 * out of date, but it will never be set early). If one vcpu is
88 * looking at another vcpu's cr3 value, it should use this variable.
89 */
90DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
91DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
92
93struct start_info *xen_start_info; 64struct start_info *xen_start_info;
94EXPORT_SYMBOL_GPL(xen_start_info); 65EXPORT_SYMBOL_GPL(xen_start_info);
95 66
96struct shared_info xen_dummy_shared_info; 67struct shared_info xen_dummy_shared_info;
97 68
69void *xen_initial_gdt;
70
98/* 71/*
99 * Point at some empty memory to start with. We map the real shared_info 72 * Point at some empty memory to start with. We map the real shared_info
100 * page as soon as fixmap is up and running. 73 * page as soon as fixmap is up and running.
@@ -114,14 +87,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
114 * 87 *
115 * 0: not available, 1: available 88 * 0: not available, 1: available
116 */ 89 */
117static int have_vcpu_info_placement = 90static int have_vcpu_info_placement = 1;
118#ifdef CONFIG_X86_32
119 1
120#else
121 0
122#endif
123 ;
124
125 91
126static void xen_vcpu_setup(int cpu) 92static void xen_vcpu_setup(int cpu)
127{ 93{
@@ -237,7 +203,7 @@ static unsigned long xen_get_debugreg(int reg)
237 return HYPERVISOR_get_debugreg(reg); 203 return HYPERVISOR_get_debugreg(reg);
238} 204}
239 205
240static void xen_leave_lazy(void) 206void xen_leave_lazy(void)
241{ 207{
242 paravirt_leave_lazy(paravirt_get_lazy_mode()); 208 paravirt_leave_lazy(paravirt_get_lazy_mode());
243 xen_mc_flush(); 209 xen_mc_flush();
@@ -598,76 +564,6 @@ static struct apic_ops xen_basic_apic_ops = {
598 564
599#endif 565#endif
600 566
601static void xen_flush_tlb(void)
602{
603 struct mmuext_op *op;
604 struct multicall_space mcs;
605
606 preempt_disable();
607
608 mcs = xen_mc_entry(sizeof(*op));
609
610 op = mcs.args;
611 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
612 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
613
614 xen_mc_issue(PARAVIRT_LAZY_MMU);
615
616 preempt_enable();
617}
618
619static void xen_flush_tlb_single(unsigned long addr)
620{
621 struct mmuext_op *op;
622 struct multicall_space mcs;
623
624 preempt_disable();
625
626 mcs = xen_mc_entry(sizeof(*op));
627 op = mcs.args;
628 op->cmd = MMUEXT_INVLPG_LOCAL;
629 op->arg1.linear_addr = addr & PAGE_MASK;
630 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
631
632 xen_mc_issue(PARAVIRT_LAZY_MMU);
633
634 preempt_enable();
635}
636
637static void xen_flush_tlb_others(const struct cpumask *cpus,
638 struct mm_struct *mm, unsigned long va)
639{
640 struct {
641 struct mmuext_op op;
642 DECLARE_BITMAP(mask, NR_CPUS);
643 } *args;
644 struct multicall_space mcs;
645
646 BUG_ON(cpumask_empty(cpus));
647 BUG_ON(!mm);
648
649 mcs = xen_mc_entry(sizeof(*args));
650 args = mcs.args;
651 args->op.arg2.vcpumask = to_cpumask(args->mask);
652
653 /* Remove us, and any offline CPUS. */
654 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
655 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
656 if (unlikely(cpumask_empty(to_cpumask(args->mask))))
657 goto issue;
658
659 if (va == TLB_FLUSH_ALL) {
660 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
661 } else {
662 args->op.cmd = MMUEXT_INVLPG_MULTI;
663 args->op.arg1.linear_addr = va;
664 }
665
666 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
667
668issue:
669 xen_mc_issue(PARAVIRT_LAZY_MMU);
670}
671 567
672static void xen_clts(void) 568static void xen_clts(void)
673{ 569{
@@ -693,21 +589,6 @@ static void xen_write_cr0(unsigned long cr0)
693 xen_mc_issue(PARAVIRT_LAZY_CPU); 589 xen_mc_issue(PARAVIRT_LAZY_CPU);
694} 590}
695 591
696static void xen_write_cr2(unsigned long cr2)
697{
698 percpu_read(xen_vcpu)->arch.cr2 = cr2;
699}
700
701static unsigned long xen_read_cr2(void)
702{
703 return percpu_read(xen_vcpu)->arch.cr2;
704}
705
706static unsigned long xen_read_cr2_direct(void)
707{
708 return percpu_read(xen_vcpu_info.arch.cr2);
709}
710
711static void xen_write_cr4(unsigned long cr4) 592static void xen_write_cr4(unsigned long cr4)
712{ 593{
713 cr4 &= ~X86_CR4_PGE; 594 cr4 &= ~X86_CR4_PGE;
@@ -716,71 +597,6 @@ static void xen_write_cr4(unsigned long cr4)
716 native_write_cr4(cr4); 597 native_write_cr4(cr4);
717} 598}
718 599
719static unsigned long xen_read_cr3(void)
720{
721 return percpu_read(xen_cr3);
722}
723
724static void set_current_cr3(void *v)
725{
726 percpu_write(xen_current_cr3, (unsigned long)v);
727}
728
729static void __xen_write_cr3(bool kernel, unsigned long cr3)
730{
731 struct mmuext_op *op;
732 struct multicall_space mcs;
733 unsigned long mfn;
734
735 if (cr3)
736 mfn = pfn_to_mfn(PFN_DOWN(cr3));
737 else
738 mfn = 0;
739
740 WARN_ON(mfn == 0 && kernel);
741
742 mcs = __xen_mc_entry(sizeof(*op));
743
744 op = mcs.args;
745 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
746 op->arg1.mfn = mfn;
747
748 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
749
750 if (kernel) {
751 percpu_write(xen_cr3, cr3);
752
753 /* Update xen_current_cr3 once the batch has actually
754 been submitted. */
755 xen_mc_callback(set_current_cr3, (void *)cr3);
756 }
757}
758
759static void xen_write_cr3(unsigned long cr3)
760{
761 BUG_ON(preemptible());
762
763 xen_mc_batch(); /* disables interrupts */
764
765 /* Update while interrupts are disabled, so it's atomic with
766 respect to IPIs */
767 percpu_write(xen_cr3, cr3);
768
769 __xen_write_cr3(true, cr3);
770
771#ifdef CONFIG_X86_64
772 {
773 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
774 if (user_pgd)
775 __xen_write_cr3(false, __pa(user_pgd));
776 else
777 __xen_write_cr3(false, 0);
778 }
779#endif
780
781 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
782}
783
784static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) 600static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
785{ 601{
786 int ret; 602 int ret;
@@ -822,185 +638,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
822 return ret; 638 return ret;
823} 639}
824 640
825/* Early in boot, while setting up the initial pagetable, assume
826 everything is pinned. */
827static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
828{
829#ifdef CONFIG_FLATMEM
830 BUG_ON(mem_map); /* should only be used early */
831#endif
832 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
833}
834
835/* Early release_pte assumes that all pts are pinned, since there's
836 only init_mm and anything attached to that is pinned. */
837static void xen_release_pte_init(unsigned long pfn)
838{
839 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
840}
841
842static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
843{
844 struct mmuext_op op;
845 op.cmd = cmd;
846 op.arg1.mfn = pfn_to_mfn(pfn);
847 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
848 BUG();
849}
850
851/* This needs to make sure the new pte page is pinned iff it's being
852 attached to a pinned pagetable. */
853static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
854{
855 struct page *page = pfn_to_page(pfn);
856
857 if (PagePinned(virt_to_page(mm->pgd))) {
858 SetPagePinned(page);
859
860 vm_unmap_aliases();
861 if (!PageHighMem(page)) {
862 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
863 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
864 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
865 } else {
866 /* make sure there are no stray mappings of
867 this page */
868 kmap_flush_unused();
869 }
870 }
871}
872
873static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
874{
875 xen_alloc_ptpage(mm, pfn, PT_PTE);
876}
877
878static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
879{
880 xen_alloc_ptpage(mm, pfn, PT_PMD);
881}
882
883static int xen_pgd_alloc(struct mm_struct *mm)
884{
885 pgd_t *pgd = mm->pgd;
886 int ret = 0;
887
888 BUG_ON(PagePinned(virt_to_page(pgd)));
889
890#ifdef CONFIG_X86_64
891 {
892 struct page *page = virt_to_page(pgd);
893 pgd_t *user_pgd;
894
895 BUG_ON(page->private != 0);
896
897 ret = -ENOMEM;
898
899 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
900 page->private = (unsigned long)user_pgd;
901
902 if (user_pgd != NULL) {
903 user_pgd[pgd_index(VSYSCALL_START)] =
904 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
905 ret = 0;
906 }
907
908 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
909 }
910#endif
911
912 return ret;
913}
914
915static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
916{
917#ifdef CONFIG_X86_64
918 pgd_t *user_pgd = xen_get_user_pgd(pgd);
919
920 if (user_pgd)
921 free_page((unsigned long)user_pgd);
922#endif
923}
924
925/* This should never happen until we're OK to use struct page */
926static void xen_release_ptpage(unsigned long pfn, unsigned level)
927{
928 struct page *page = pfn_to_page(pfn);
929
930 if (PagePinned(page)) {
931 if (!PageHighMem(page)) {
932 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
933 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
934 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
935 }
936 ClearPagePinned(page);
937 }
938}
939
940static void xen_release_pte(unsigned long pfn)
941{
942 xen_release_ptpage(pfn, PT_PTE);
943}
944
945static void xen_release_pmd(unsigned long pfn)
946{
947 xen_release_ptpage(pfn, PT_PMD);
948}
949
950#if PAGETABLE_LEVELS == 4
951static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
952{
953 xen_alloc_ptpage(mm, pfn, PT_PUD);
954}
955
956static void xen_release_pud(unsigned long pfn)
957{
958 xen_release_ptpage(pfn, PT_PUD);
959}
960#endif
961
962#ifdef CONFIG_HIGHPTE
963static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
964{
965 pgprot_t prot = PAGE_KERNEL;
966
967 if (PagePinned(page))
968 prot = PAGE_KERNEL_RO;
969
970 if (0 && PageHighMem(page))
971 printk("mapping highpte %lx type %d prot %s\n",
972 page_to_pfn(page), type,
973 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
974
975 return kmap_atomic_prot(page, type, prot);
976}
977#endif
978
979#ifdef CONFIG_X86_32
980static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
981{
982 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
983 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
984 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
985 pte_val_ma(pte));
986
987 return pte;
988}
989
990/* Init-time set_pte while constructing initial pagetables, which
991 doesn't allow RO pagetable pages to be remapped RW */
992static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
993{
994 pte = mask_rw_pte(ptep, pte);
995
996 xen_set_pte(ptep, pte);
997}
998#endif
999
1000static __init void xen_pagetable_setup_start(pgd_t *base)
1001{
1002}
1003
1004void xen_setup_shared_info(void) 641void xen_setup_shared_info(void)
1005{ 642{
1006 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 643 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1021,37 +658,6 @@ void xen_setup_shared_info(void)
1021 xen_setup_mfn_list_list(); 658 xen_setup_mfn_list_list();
1022} 659}
1023 660
1024static __init void xen_pagetable_setup_done(pgd_t *base)
1025{
1026 xen_setup_shared_info();
1027}
1028
1029static __init void xen_post_allocator_init(void)
1030{
1031 pv_mmu_ops.set_pte = xen_set_pte;
1032 pv_mmu_ops.set_pmd = xen_set_pmd;
1033 pv_mmu_ops.set_pud = xen_set_pud;
1034#if PAGETABLE_LEVELS == 4
1035 pv_mmu_ops.set_pgd = xen_set_pgd;
1036#endif
1037
1038 /* This will work as long as patching hasn't happened yet
1039 (which it hasn't) */
1040 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1041 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1042 pv_mmu_ops.release_pte = xen_release_pte;
1043 pv_mmu_ops.release_pmd = xen_release_pmd;
1044#if PAGETABLE_LEVELS == 4
1045 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1046 pv_mmu_ops.release_pud = xen_release_pud;
1047#endif
1048
1049#ifdef CONFIG_X86_64
1050 SetPagePinned(virt_to_page(level3_user_vsyscall));
1051#endif
1052 xen_mark_init_mm_pinned();
1053}
1054
1055/* This is called once we have the cpu_possible_map */ 661/* This is called once we have the cpu_possible_map */
1056void xen_setup_vcpu_info_placement(void) 662void xen_setup_vcpu_info_placement(void)
1057{ 663{
@@ -1065,10 +671,10 @@ void xen_setup_vcpu_info_placement(void)
1065 if (have_vcpu_info_placement) { 671 if (have_vcpu_info_placement) {
1066 printk(KERN_INFO "Xen: using vcpu_info placement\n"); 672 printk(KERN_INFO "Xen: using vcpu_info placement\n");
1067 673
1068 pv_irq_ops.save_fl = xen_save_fl_direct; 674 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
1069 pv_irq_ops.restore_fl = xen_restore_fl_direct; 675 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
1070 pv_irq_ops.irq_disable = xen_irq_disable_direct; 676 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
1071 pv_irq_ops.irq_enable = xen_irq_enable_direct; 677 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
1072 pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 678 pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
1073 } 679 }
1074} 680}
@@ -1126,49 +732,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
1126 return ret; 732 return ret;
1127} 733}
1128 734
1129static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1130{
1131 pte_t pte;
1132
1133 phys >>= PAGE_SHIFT;
1134
1135 switch (idx) {
1136 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1137#ifdef CONFIG_X86_F00F_BUG
1138 case FIX_F00F_IDT:
1139#endif
1140#ifdef CONFIG_X86_32
1141 case FIX_WP_TEST:
1142 case FIX_VDSO:
1143# ifdef CONFIG_HIGHMEM
1144 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1145# endif
1146#else
1147 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1148#endif
1149#ifdef CONFIG_X86_LOCAL_APIC
1150 case FIX_APIC_BASE: /* maps dummy local APIC */
1151#endif
1152 pte = pfn_pte(phys, prot);
1153 break;
1154
1155 default:
1156 pte = mfn_pte(phys, prot);
1157 break;
1158 }
1159
1160 __native_set_fixmap(idx, pte);
1161
1162#ifdef CONFIG_X86_64
1163 /* Replicate changes to map the vsyscall page into the user
1164 pagetable vsyscall mapping. */
1165 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1166 unsigned long vaddr = __fix_to_virt(idx);
1167 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1168 }
1169#endif
1170}
1171
1172static const struct pv_info xen_info __initdata = { 735static const struct pv_info xen_info __initdata = {
1173 .paravirt_enabled = 1, 736 .paravirt_enabled = 1,
1174 .shared_kernel_pmd = 0, 737 .shared_kernel_pmd = 0,
@@ -1264,86 +827,6 @@ static const struct pv_apic_ops xen_apic_ops __initdata = {
1264#endif 827#endif
1265}; 828};
1266 829
1267static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1268 .pagetable_setup_start = xen_pagetable_setup_start,
1269 .pagetable_setup_done = xen_pagetable_setup_done,
1270
1271 .read_cr2 = xen_read_cr2,
1272 .write_cr2 = xen_write_cr2,
1273
1274 .read_cr3 = xen_read_cr3,
1275 .write_cr3 = xen_write_cr3,
1276
1277 .flush_tlb_user = xen_flush_tlb,
1278 .flush_tlb_kernel = xen_flush_tlb,
1279 .flush_tlb_single = xen_flush_tlb_single,
1280 .flush_tlb_others = xen_flush_tlb_others,
1281
1282 .pte_update = paravirt_nop,
1283 .pte_update_defer = paravirt_nop,
1284
1285 .pgd_alloc = xen_pgd_alloc,
1286 .pgd_free = xen_pgd_free,
1287
1288 .alloc_pte = xen_alloc_pte_init,
1289 .release_pte = xen_release_pte_init,
1290 .alloc_pmd = xen_alloc_pte_init,
1291 .alloc_pmd_clone = paravirt_nop,
1292 .release_pmd = xen_release_pte_init,
1293
1294#ifdef CONFIG_HIGHPTE
1295 .kmap_atomic_pte = xen_kmap_atomic_pte,
1296#endif
1297
1298#ifdef CONFIG_X86_64
1299 .set_pte = xen_set_pte,
1300#else
1301 .set_pte = xen_set_pte_init,
1302#endif
1303 .set_pte_at = xen_set_pte_at,
1304 .set_pmd = xen_set_pmd_hyper,
1305
1306 .ptep_modify_prot_start = __ptep_modify_prot_start,
1307 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1308
1309 .pte_val = xen_pte_val,
1310 .pgd_val = xen_pgd_val,
1311
1312 .make_pte = xen_make_pte,
1313 .make_pgd = xen_make_pgd,
1314
1315#ifdef CONFIG_X86_PAE
1316 .set_pte_atomic = xen_set_pte_atomic,
1317 .set_pte_present = xen_set_pte_at,
1318 .pte_clear = xen_pte_clear,
1319 .pmd_clear = xen_pmd_clear,
1320#endif /* CONFIG_X86_PAE */
1321 .set_pud = xen_set_pud_hyper,
1322
1323 .make_pmd = xen_make_pmd,
1324 .pmd_val = xen_pmd_val,
1325
1326#if PAGETABLE_LEVELS == 4
1327 .pud_val = xen_pud_val,
1328 .make_pud = xen_make_pud,
1329 .set_pgd = xen_set_pgd_hyper,
1330
1331 .alloc_pud = xen_alloc_pte_init,
1332 .release_pud = xen_release_pte_init,
1333#endif /* PAGETABLE_LEVELS == 4 */
1334
1335 .activate_mm = xen_activate_mm,
1336 .dup_mmap = xen_dup_mmap,
1337 .exit_mmap = xen_exit_mmap,
1338
1339 .lazy_mode = {
1340 .enter = paravirt_enter_lazy_mmu,
1341 .leave = xen_leave_lazy,
1342 },
1343
1344 .set_fixmap = xen_set_fixmap,
1345};
1346
1347static void xen_reboot(int reason) 830static void xen_reboot(int reason)
1348{ 831{
1349 struct sched_shutdown r = { .reason = reason }; 832 struct sched_shutdown r = { .reason = reason };
@@ -1386,223 +869,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
1386}; 869};
1387 870
1388 871
1389static void __init xen_reserve_top(void)
1390{
1391#ifdef CONFIG_X86_32
1392 unsigned long top = HYPERVISOR_VIRT_START;
1393 struct xen_platform_parameters pp;
1394
1395 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1396 top = pp.virt_start;
1397
1398 reserve_top_address(-top);
1399#endif /* CONFIG_X86_32 */
1400}
1401
1402/*
1403 * Like __va(), but returns address in the kernel mapping (which is
1404 * all we have until the physical memory mapping has been set up).
1405 */
1406static void *__ka(phys_addr_t paddr)
1407{
1408#ifdef CONFIG_X86_64
1409 return (void *)(paddr + __START_KERNEL_map);
1410#else
1411 return __va(paddr);
1412#endif
1413}
1414
1415/* Convert a machine address to physical address */
1416static unsigned long m2p(phys_addr_t maddr)
1417{
1418 phys_addr_t paddr;
1419
1420 maddr &= PTE_PFN_MASK;
1421 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1422
1423 return paddr;
1424}
1425
1426/* Convert a machine address to kernel virtual */
1427static void *m2v(phys_addr_t maddr)
1428{
1429 return __ka(m2p(maddr));
1430}
1431
1432static void set_page_prot(void *addr, pgprot_t prot)
1433{
1434 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1435 pte_t pte = pfn_pte(pfn, prot);
1436
1437 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1438 BUG();
1439}
1440
1441static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1442{
1443 unsigned pmdidx, pteidx;
1444 unsigned ident_pte;
1445 unsigned long pfn;
1446
1447 ident_pte = 0;
1448 pfn = 0;
1449 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1450 pte_t *pte_page;
1451
1452 /* Reuse or allocate a page of ptes */
1453 if (pmd_present(pmd[pmdidx]))
1454 pte_page = m2v(pmd[pmdidx].pmd);
1455 else {
1456 /* Check for free pte pages */
1457 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1458 break;
1459
1460 pte_page = &level1_ident_pgt[ident_pte];
1461 ident_pte += PTRS_PER_PTE;
1462
1463 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1464 }
1465
1466 /* Install mappings */
1467 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1468 pte_t pte;
1469
1470 if (pfn > max_pfn_mapped)
1471 max_pfn_mapped = pfn;
1472
1473 if (!pte_none(pte_page[pteidx]))
1474 continue;
1475
1476 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1477 pte_page[pteidx] = pte;
1478 }
1479 }
1480
1481 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1482 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1483
1484 set_page_prot(pmd, PAGE_KERNEL_RO);
1485}
1486
1487#ifdef CONFIG_X86_64
1488static void convert_pfn_mfn(void *v)
1489{
1490 pte_t *pte = v;
1491 int i;
1492
1493 /* All levels are converted the same way, so just treat them
1494 as ptes. */
1495 for (i = 0; i < PTRS_PER_PTE; i++)
1496 pte[i] = xen_make_pte(pte[i].pte);
1497}
1498
1499/*
1500 * Set up the initial kernel pagetable.
1501 *
1502 * We can construct this by grafting the Xen provided pagetable into
1503 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1504 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1505 * means that only the kernel has a physical mapping to start with -
1506 * but that's enough to get __va working. We need to fill in the rest
1507 * of the physical mapping once some sort of allocator has been set
1508 * up.
1509 */
1510static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1511 unsigned long max_pfn)
1512{
1513 pud_t *l3;
1514 pmd_t *l2;
1515
1516 /* Zap identity mapping */
1517 init_level4_pgt[0] = __pgd(0);
1518
1519 /* Pre-constructed entries are in pfn, so convert to mfn */
1520 convert_pfn_mfn(init_level4_pgt);
1521 convert_pfn_mfn(level3_ident_pgt);
1522 convert_pfn_mfn(level3_kernel_pgt);
1523
1524 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1525 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1526
1527 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1528 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1529
1530 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1531 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1532 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1533
1534 /* Set up identity map */
1535 xen_map_identity_early(level2_ident_pgt, max_pfn);
1536
1537 /* Make pagetable pieces RO */
1538 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1539 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1540 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1541 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1542 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1543 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1544
1545 /* Pin down new L4 */
1546 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1547 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1548
1549 /* Unpin Xen-provided one */
1550 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1551
1552 /* Switch over */
1553 pgd = init_level4_pgt;
1554
1555 /*
1556 * At this stage there can be no user pgd, and no page
1557 * structure to attach it to, so make sure we just set kernel
1558 * pgd.
1559 */
1560 xen_mc_batch();
1561 __xen_write_cr3(true, __pa(pgd));
1562 xen_mc_issue(PARAVIRT_LAZY_CPU);
1563
1564 reserve_early(__pa(xen_start_info->pt_base),
1565 __pa(xen_start_info->pt_base +
1566 xen_start_info->nr_pt_frames * PAGE_SIZE),
1567 "XEN PAGETABLES");
1568
1569 return pgd;
1570}
1571#else /* !CONFIG_X86_64 */
1572static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1573
1574static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1575 unsigned long max_pfn)
1576{
1577 pmd_t *kernel_pmd;
1578
1579 init_pg_tables_start = __pa(pgd);
1580 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1581 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1582
1583 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1584 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1585
1586 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1587
1588 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1589 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1590 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1591
1592 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1593 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1594 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1595
1596 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1597
1598 xen_write_cr3(__pa(swapper_pg_dir));
1599
1600 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1601
1602 return swapper_pg_dir;
1603}
1604#endif /* CONFIG_X86_64 */
1605
1606/* First C function to be called on Xen boot */ 872/* First C function to be called on Xen boot */
1607asmlinkage void __init xen_start_kernel(void) 873asmlinkage void __init xen_start_kernel(void)
1608{ 874{
@@ -1642,9 +908,18 @@ asmlinkage void __init xen_start_kernel(void)
1642 machine_ops = xen_machine_ops; 908 machine_ops = xen_machine_ops;
1643 909
1644#ifdef CONFIG_X86_64 910#ifdef CONFIG_X86_64
1645 /* Disable until direct per-cpu data access. */ 911 /*
1646 have_vcpu_info_placement = 0; 912 * Set up percpu state. We only need to do this for 64-bit
913 * because 32-bit already has %fs set properly.
914 */
915 load_percpu_segment(0);
1647#endif 916#endif
917 /*
918 * The only reliable way to retain the initial address of the
919 * percpu gdt_page is to remember it here, so we can go and
920 * mark it RW later, when the initial percpu area is freed.
921 */
922 xen_initial_gdt = &per_cpu(gdt_page, 0);
1648 923
1649 xen_smp_init(); 924 xen_smp_init();
1650 925
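The xen_initial_gdt bookkeeping added above pairs with the smp.c hunk further below: Xen makes the boot GDT page read-only, and once the initial per-cpu area is freed there is no other way to find that page again. A minimal sketch of the two halves, assuming only the declarations introduced by this patch (the helper names here are hypothetical):

    #include <asm/desc.h>
    #include <asm/percpu.h>

    void *xen_initial_gdt;    /* declared in enlighten.c by this patch */

    /* Boot path (enlighten.c): stash the address while per_cpu() still
     * resolves to the initial per-cpu area. */
    static void __init remember_boot_gdt(void)
    {
            xen_initial_gdt = &per_cpu(gdt_page, 0);
    }

    /* CPU bring-up (smp.c): the saved pointer lets the old GDT page be
     * made writable again so its memory can be recycled. */
    static void __init recycle_boot_gdt(void)
    {
            make_lowmem_page_readwrite(xen_initial_gdt);
    }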
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 2e8271431e1a..5a070900ad35 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -50,6 +50,7 @@ static unsigned long xen_save_fl(void)
50 */ 50 */
51 return (-flags) & X86_EFLAGS_IF; 51 return (-flags) & X86_EFLAGS_IF;
52} 52}
53PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
53 54
54static void xen_restore_fl(unsigned long flags) 55static void xen_restore_fl(unsigned long flags)
55{ 56{
@@ -76,6 +77,7 @@ static void xen_restore_fl(unsigned long flags)
76 xen_force_evtchn_callback(); 77 xen_force_evtchn_callback();
77 } 78 }
78} 79}
80PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
79 81
80static void xen_irq_disable(void) 82static void xen_irq_disable(void)
81{ 83{
@@ -86,6 +88,7 @@ static void xen_irq_disable(void)
86 percpu_read(xen_vcpu)->evtchn_upcall_mask = 1; 88 percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
87 preempt_enable_no_resched(); 89 preempt_enable_no_resched();
88} 90}
91PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
89 92
90static void xen_irq_enable(void) 93static void xen_irq_enable(void)
91{ 94{
@@ -106,6 +109,7 @@ static void xen_irq_enable(void)
106 if (unlikely(vcpu->evtchn_upcall_pending)) 109 if (unlikely(vcpu->evtchn_upcall_pending))
107 xen_force_evtchn_callback(); 110 xen_force_evtchn_callback();
108} 111}
112PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
109 113
110static void xen_safe_halt(void) 114static void xen_safe_halt(void)
111{ 115{
@@ -124,10 +128,12 @@ static void xen_halt(void)
124 128
125static const struct pv_irq_ops xen_irq_ops __initdata = { 129static const struct pv_irq_ops xen_irq_ops __initdata = {
126 .init_IRQ = __xen_init_IRQ, 130 .init_IRQ = __xen_init_IRQ,
127 .save_fl = xen_save_fl, 131
128 .restore_fl = xen_restore_fl, 132 .save_fl = PV_CALLEE_SAVE(xen_save_fl),
129 .irq_disable = xen_irq_disable, 133 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
130 .irq_enable = xen_irq_enable, 134 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
135 .irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
136
131 .safe_halt = xen_safe_halt, 137 .safe_halt = xen_safe_halt,
132 .halt = xen_halt, 138 .halt = xen_halt,
133#ifdef CONFIG_X86_64 139#ifdef CONFIG_X86_64
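Each conversion in irq.c follows the same two-step pattern, repeated for the pvop accessors in mmu.c below: PV_CALLEE_SAVE_REGS_THUNK() emits a register-preserving asm thunk around the C function, and the ops table stores PV_CALLEE_SAVE(func) instead of the bare function pointer. A sketch of the pattern with a hypothetical op (my_save_fl is not from the patch):

    #include <asm/paravirt.h>

    static unsigned long my_save_fl(void)
    {
            return 0;               /* placeholder body */
    }
    /* Emits a thunk that saves and restores the caller-clobbered
     * registers around the call, so patched call sites can keep live
     * values in registers. */
    PV_CALLEE_SAVE_REGS_THUNK(my_save_fl);

    static const struct pv_irq_ops my_irq_ops __initdata = {
            .save_fl = PV_CALLEE_SAVE(my_save_fl),
    };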
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 98cb9869eb24..d2e8ed1aff3d 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -47,6 +47,7 @@
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/fixmap.h> 48#include <asm/fixmap.h>
49#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
50#include <asm/setup.h>
50#include <asm/paravirt.h> 51#include <asm/paravirt.h>
51#include <asm/linkage.h> 52#include <asm/linkage.h>
52 53
@@ -55,6 +56,8 @@
55 56
56#include <xen/page.h> 57#include <xen/page.h>
57#include <xen/interface/xen.h> 58#include <xen/interface/xen.h>
59#include <xen/interface/version.h>
60#include <xen/hvc-console.h>
58 61
59#include "multicalls.h" 62#include "multicalls.h"
60#include "mmu.h" 63#include "mmu.h"
@@ -114,6 +117,37 @@ static inline void check_zero(void)
114 117
115#endif /* CONFIG_XEN_DEBUG_FS */ 118#endif /* CONFIG_XEN_DEBUG_FS */
116 119
120
121/*
122 * Identity map, in addition to plain kernel map. This needs to be
 123 * large enough to allocate the page table pages needed to map the rest.
124 * Each page can map 2MB.
125 */
126static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
127
128#ifdef CONFIG_X86_64
129/* l3 pud for userspace vsyscall mapping */
130static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
131#endif /* CONFIG_X86_64 */
132
133/*
134 * Note about cr3 (pagetable base) values:
135 *
136 * xen_cr3 contains the current logical cr3 value; it contains the
137 * last set cr3. This may not be the current effective cr3, because
138 * its update may be being lazily deferred. However, a vcpu looking
 139 * at its own cr3 can use this value knowing that everything will
140 * be self-consistent.
141 *
142 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
143 * hypercall to set the vcpu cr3 is complete (so it may be a little
144 * out of date, but it will never be set early). If one vcpu is
145 * looking at another vcpu's cr3 value, it should use this variable.
146 */
147DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
148DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
149
150
117/* 151/*
118 * Just beyond the highest usermode address. STACK_TOP_MAX has a 152 * Just beyond the highest usermode address. STACK_TOP_MAX has a
119 * redzone above it, so round it up to a PGD boundary. 153 * redzone above it, so round it up to a PGD boundary.
@@ -458,28 +492,33 @@ pteval_t xen_pte_val(pte_t pte)
458{ 492{
459 return pte_mfn_to_pfn(pte.pte); 493 return pte_mfn_to_pfn(pte.pte);
460} 494}
495PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
461 496
462pgdval_t xen_pgd_val(pgd_t pgd) 497pgdval_t xen_pgd_val(pgd_t pgd)
463{ 498{
464 return pte_mfn_to_pfn(pgd.pgd); 499 return pte_mfn_to_pfn(pgd.pgd);
465} 500}
501PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
466 502
467pte_t xen_make_pte(pteval_t pte) 503pte_t xen_make_pte(pteval_t pte)
468{ 504{
469 pte = pte_pfn_to_mfn(pte); 505 pte = pte_pfn_to_mfn(pte);
470 return native_make_pte(pte); 506 return native_make_pte(pte);
471} 507}
508PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
472 509
473pgd_t xen_make_pgd(pgdval_t pgd) 510pgd_t xen_make_pgd(pgdval_t pgd)
474{ 511{
475 pgd = pte_pfn_to_mfn(pgd); 512 pgd = pte_pfn_to_mfn(pgd);
476 return native_make_pgd(pgd); 513 return native_make_pgd(pgd);
477} 514}
515PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
478 516
479pmdval_t xen_pmd_val(pmd_t pmd) 517pmdval_t xen_pmd_val(pmd_t pmd)
480{ 518{
481 return pte_mfn_to_pfn(pmd.pmd); 519 return pte_mfn_to_pfn(pmd.pmd);
482} 520}
521PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
483 522
484void xen_set_pud_hyper(pud_t *ptr, pud_t val) 523void xen_set_pud_hyper(pud_t *ptr, pud_t val)
485{ 524{
@@ -556,12 +595,14 @@ pmd_t xen_make_pmd(pmdval_t pmd)
556 pmd = pte_pfn_to_mfn(pmd); 595 pmd = pte_pfn_to_mfn(pmd);
557 return native_make_pmd(pmd); 596 return native_make_pmd(pmd);
558} 597}
598PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
559 599
560#if PAGETABLE_LEVELS == 4 600#if PAGETABLE_LEVELS == 4
561pudval_t xen_pud_val(pud_t pud) 601pudval_t xen_pud_val(pud_t pud)
562{ 602{
563 return pte_mfn_to_pfn(pud.pud); 603 return pte_mfn_to_pfn(pud.pud);
564} 604}
605PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
565 606
566pud_t xen_make_pud(pudval_t pud) 607pud_t xen_make_pud(pudval_t pud)
567{ 608{
@@ -569,6 +610,7 @@ pud_t xen_make_pud(pudval_t pud)
569 610
570 return native_make_pud(pud); 611 return native_make_pud(pud);
571} 612}
613PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
572 614
573pgd_t *xen_get_user_pgd(pgd_t *pgd) 615pgd_t *xen_get_user_pgd(pgd_t *pgd)
574{ 616{
@@ -1152,6 +1194,709 @@ void xen_exit_mmap(struct mm_struct *mm)
1152 spin_unlock(&mm->page_table_lock); 1194 spin_unlock(&mm->page_table_lock);
1153} 1195}
1154 1196
1197static __init void xen_pagetable_setup_start(pgd_t *base)
1198{
1199}
1200
1201static __init void xen_pagetable_setup_done(pgd_t *base)
1202{
1203 xen_setup_shared_info();
1204}
1205
1206static void xen_write_cr2(unsigned long cr2)
1207{
1208 percpu_read(xen_vcpu)->arch.cr2 = cr2;
1209}
1210
1211static unsigned long xen_read_cr2(void)
1212{
1213 return percpu_read(xen_vcpu)->arch.cr2;
1214}
1215
1216unsigned long xen_read_cr2_direct(void)
1217{
1218 return percpu_read(xen_vcpu_info.arch.cr2);
1219}
1220
1221static void xen_flush_tlb(void)
1222{
1223 struct mmuext_op *op;
1224 struct multicall_space mcs;
1225
1226 preempt_disable();
1227
1228 mcs = xen_mc_entry(sizeof(*op));
1229
1230 op = mcs.args;
1231 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1232 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1233
1234 xen_mc_issue(PARAVIRT_LAZY_MMU);
1235
1236 preempt_enable();
1237}
1238
1239static void xen_flush_tlb_single(unsigned long addr)
1240{
1241 struct mmuext_op *op;
1242 struct multicall_space mcs;
1243
1244 preempt_disable();
1245
1246 mcs = xen_mc_entry(sizeof(*op));
1247 op = mcs.args;
1248 op->cmd = MMUEXT_INVLPG_LOCAL;
1249 op->arg1.linear_addr = addr & PAGE_MASK;
1250 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1251
1252 xen_mc_issue(PARAVIRT_LAZY_MMU);
1253
1254 preempt_enable();
1255}
1256
1257static void xen_flush_tlb_others(const struct cpumask *cpus,
1258 struct mm_struct *mm, unsigned long va)
1259{
1260 struct {
1261 struct mmuext_op op;
1262 DECLARE_BITMAP(mask, NR_CPUS);
1263 } *args;
1264 struct multicall_space mcs;
1265
1266 BUG_ON(cpumask_empty(cpus));
1267 BUG_ON(!mm);
1268
1269 mcs = xen_mc_entry(sizeof(*args));
1270 args = mcs.args;
1271 args->op.arg2.vcpumask = to_cpumask(args->mask);
1272
1273 /* Remove us, and any offline CPUS. */
1274 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1275 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1276 if (unlikely(cpumask_empty(to_cpumask(args->mask))))
1277 goto issue;
1278
1279 if (va == TLB_FLUSH_ALL) {
1280 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1281 } else {
1282 args->op.cmd = MMUEXT_INVLPG_MULTI;
1283 args->op.arg1.linear_addr = va;
1284 }
1285
1286 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1287
1288issue:
1289 xen_mc_issue(PARAVIRT_LAZY_MMU);
1290}
1291
1292static unsigned long xen_read_cr3(void)
1293{
1294 return percpu_read(xen_cr3);
1295}
1296
1297static void set_current_cr3(void *v)
1298{
1299 percpu_write(xen_current_cr3, (unsigned long)v);
1300}
1301
1302static void __xen_write_cr3(bool kernel, unsigned long cr3)
1303{
1304 struct mmuext_op *op;
1305 struct multicall_space mcs;
1306 unsigned long mfn;
1307
1308 if (cr3)
1309 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1310 else
1311 mfn = 0;
1312
1313 WARN_ON(mfn == 0 && kernel);
1314
1315 mcs = __xen_mc_entry(sizeof(*op));
1316
1317 op = mcs.args;
1318 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1319 op->arg1.mfn = mfn;
1320
1321 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1322
1323 if (kernel) {
1324 percpu_write(xen_cr3, cr3);
1325
1326 /* Update xen_current_cr3 once the batch has actually
1327 been submitted. */
1328 xen_mc_callback(set_current_cr3, (void *)cr3);
1329 }
1330}
1331
1332static void xen_write_cr3(unsigned long cr3)
1333{
1334 BUG_ON(preemptible());
1335
1336 xen_mc_batch(); /* disables interrupts */
1337
 1338 /* Update while interrupts are disabled, so it's atomic with
 1339 respect to IPIs */
1340 percpu_write(xen_cr3, cr3);
1341
1342 __xen_write_cr3(true, cr3);
1343
1344#ifdef CONFIG_X86_64
1345 {
1346 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1347 if (user_pgd)
1348 __xen_write_cr3(false, __pa(user_pgd));
1349 else
1350 __xen_write_cr3(false, 0);
1351 }
1352#endif
1353
1354 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1355}
1356
1357static int xen_pgd_alloc(struct mm_struct *mm)
1358{
1359 pgd_t *pgd = mm->pgd;
1360 int ret = 0;
1361
1362 BUG_ON(PagePinned(virt_to_page(pgd)));
1363
1364#ifdef CONFIG_X86_64
1365 {
1366 struct page *page = virt_to_page(pgd);
1367 pgd_t *user_pgd;
1368
1369 BUG_ON(page->private != 0);
1370
1371 ret = -ENOMEM;
1372
1373 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1374 page->private = (unsigned long)user_pgd;
1375
1376 if (user_pgd != NULL) {
1377 user_pgd[pgd_index(VSYSCALL_START)] =
1378 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1379 ret = 0;
1380 }
1381
1382 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1383 }
1384#endif
1385
1386 return ret;
1387}
1388
1389static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1390{
1391#ifdef CONFIG_X86_64
1392 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1393
1394 if (user_pgd)
1395 free_page((unsigned long)user_pgd);
1396#endif
1397}
1398
1399#ifdef CONFIG_HIGHPTE
1400static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
1401{
1402 pgprot_t prot = PAGE_KERNEL;
1403
1404 if (PagePinned(page))
1405 prot = PAGE_KERNEL_RO;
1406
1407 if (0 && PageHighMem(page))
1408 printk("mapping highpte %lx type %d prot %s\n",
1409 page_to_pfn(page), type,
1410 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
1411
1412 return kmap_atomic_prot(page, type, prot);
1413}
1414#endif
1415
1416#ifdef CONFIG_X86_32
1417static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1418{
1419 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1420 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1421 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1422 pte_val_ma(pte));
1423
1424 return pte;
1425}
1426
1427/* Init-time set_pte while constructing initial pagetables, which
1428 doesn't allow RO pagetable pages to be remapped RW */
1429static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1430{
1431 pte = mask_rw_pte(ptep, pte);
1432
1433 xen_set_pte(ptep, pte);
1434}
1435#endif
1436
1437/* Early in boot, while setting up the initial pagetable, assume
1438 everything is pinned. */
1439static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1440{
1441#ifdef CONFIG_FLATMEM
1442 BUG_ON(mem_map); /* should only be used early */
1443#endif
1444 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1445}
1446
1447/* Early release_pte assumes that all pts are pinned, since there's
1448 only init_mm and anything attached to that is pinned. */
1449static void xen_release_pte_init(unsigned long pfn)
1450{
1451 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1452}
1453
1454static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1455{
1456 struct mmuext_op op;
1457 op.cmd = cmd;
1458 op.arg1.mfn = pfn_to_mfn(pfn);
1459 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1460 BUG();
1461}
1462
 1463/* This needs to make sure the new pte page is pinned iff it's being
1464 attached to a pinned pagetable. */
1465static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1466{
1467 struct page *page = pfn_to_page(pfn);
1468
1469 if (PagePinned(virt_to_page(mm->pgd))) {
1470 SetPagePinned(page);
1471
1472 vm_unmap_aliases();
1473 if (!PageHighMem(page)) {
1474 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1475 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1476 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1477 } else {
1478 /* make sure there are no stray mappings of
1479 this page */
1480 kmap_flush_unused();
1481 }
1482 }
1483}
1484
1485static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1486{
1487 xen_alloc_ptpage(mm, pfn, PT_PTE);
1488}
1489
1490static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1491{
1492 xen_alloc_ptpage(mm, pfn, PT_PMD);
1493}
1494
1495/* This should never happen until we're OK to use struct page */
1496static void xen_release_ptpage(unsigned long pfn, unsigned level)
1497{
1498 struct page *page = pfn_to_page(pfn);
1499
1500 if (PagePinned(page)) {
1501 if (!PageHighMem(page)) {
1502 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1503 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1504 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1505 }
1506 ClearPagePinned(page);
1507 }
1508}
1509
1510static void xen_release_pte(unsigned long pfn)
1511{
1512 xen_release_ptpage(pfn, PT_PTE);
1513}
1514
1515static void xen_release_pmd(unsigned long pfn)
1516{
1517 xen_release_ptpage(pfn, PT_PMD);
1518}
1519
1520#if PAGETABLE_LEVELS == 4
1521static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1522{
1523 xen_alloc_ptpage(mm, pfn, PT_PUD);
1524}
1525
1526static void xen_release_pud(unsigned long pfn)
1527{
1528 xen_release_ptpage(pfn, PT_PUD);
1529}
1530#endif
1531
1532void __init xen_reserve_top(void)
1533{
1534#ifdef CONFIG_X86_32
1535 unsigned long top = HYPERVISOR_VIRT_START;
1536 struct xen_platform_parameters pp;
1537
1538 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1539 top = pp.virt_start;
1540
1541 reserve_top_address(-top);
1542#endif /* CONFIG_X86_32 */
1543}
1544
1545/*
1546 * Like __va(), but returns address in the kernel mapping (which is
 1547 * all we have until the physical memory mapping has been set up).
1548 */
1549static void *__ka(phys_addr_t paddr)
1550{
1551#ifdef CONFIG_X86_64
1552 return (void *)(paddr + __START_KERNEL_map);
1553#else
1554 return __va(paddr);
1555#endif
1556}
1557
1558/* Convert a machine address to physical address */
1559static unsigned long m2p(phys_addr_t maddr)
1560{
1561 phys_addr_t paddr;
1562
1563 maddr &= PTE_PFN_MASK;
1564 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1565
1566 return paddr;
1567}
1568
1569/* Convert a machine address to kernel virtual */
1570static void *m2v(phys_addr_t maddr)
1571{
1572 return __ka(m2p(maddr));
1573}
1574
1575static void set_page_prot(void *addr, pgprot_t prot)
1576{
1577 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1578 pte_t pte = pfn_pte(pfn, prot);
1579
1580 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1581 BUG();
1582}
1583
1584static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1585{
1586 unsigned pmdidx, pteidx;
1587 unsigned ident_pte;
1588 unsigned long pfn;
1589
1590 ident_pte = 0;
1591 pfn = 0;
1592 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1593 pte_t *pte_page;
1594
1595 /* Reuse or allocate a page of ptes */
1596 if (pmd_present(pmd[pmdidx]))
1597 pte_page = m2v(pmd[pmdidx].pmd);
1598 else {
1599 /* Check for free pte pages */
1600 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1601 break;
1602
1603 pte_page = &level1_ident_pgt[ident_pte];
1604 ident_pte += PTRS_PER_PTE;
1605
1606 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1607 }
1608
1609 /* Install mappings */
1610 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1611 pte_t pte;
1612
1613 if (pfn > max_pfn_mapped)
1614 max_pfn_mapped = pfn;
1615
1616 if (!pte_none(pte_page[pteidx]))
1617 continue;
1618
1619 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1620 pte_page[pteidx] = pte;
1621 }
1622 }
1623
1624 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1625 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1626
1627 set_page_prot(pmd, PAGE_KERNEL_RO);
1628}
1629
1630#ifdef CONFIG_X86_64
1631static void convert_pfn_mfn(void *v)
1632{
1633 pte_t *pte = v;
1634 int i;
1635
1636 /* All levels are converted the same way, so just treat them
1637 as ptes. */
1638 for (i = 0; i < PTRS_PER_PTE; i++)
1639 pte[i] = xen_make_pte(pte[i].pte);
1640}
1641
1642/*
 1643 * Set up the initial kernel pagetable.
1644 *
1645 * We can construct this by grafting the Xen provided pagetable into
1646 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1647 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1648 * means that only the kernel has a physical mapping to start with -
1649 * but that's enough to get __va working. We need to fill in the rest
1650 * of the physical mapping once some sort of allocator has been set
1651 * up.
1652 */
1653__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1654 unsigned long max_pfn)
1655{
1656 pud_t *l3;
1657 pmd_t *l2;
1658
1659 /* Zap identity mapping */
1660 init_level4_pgt[0] = __pgd(0);
1661
1662 /* Pre-constructed entries are in pfn, so convert to mfn */
1663 convert_pfn_mfn(init_level4_pgt);
1664 convert_pfn_mfn(level3_ident_pgt);
1665 convert_pfn_mfn(level3_kernel_pgt);
1666
1667 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1668 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1669
1670 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1671 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1672
1673 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1674 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1675 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1676
1677 /* Set up identity map */
1678 xen_map_identity_early(level2_ident_pgt, max_pfn);
1679
1680 /* Make pagetable pieces RO */
1681 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1682 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1683 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1684 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1685 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1686 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1687
1688 /* Pin down new L4 */
1689 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1690 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1691
1692 /* Unpin Xen-provided one */
1693 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1694
1695 /* Switch over */
1696 pgd = init_level4_pgt;
1697
1698 /*
1699 * At this stage there can be no user pgd, and no page
1700 * structure to attach it to, so make sure we just set kernel
1701 * pgd.
1702 */
1703 xen_mc_batch();
1704 __xen_write_cr3(true, __pa(pgd));
1705 xen_mc_issue(PARAVIRT_LAZY_CPU);
1706
1707 reserve_early(__pa(xen_start_info->pt_base),
1708 __pa(xen_start_info->pt_base +
1709 xen_start_info->nr_pt_frames * PAGE_SIZE),
1710 "XEN PAGETABLES");
1711
1712 return pgd;
1713}
1714#else /* !CONFIG_X86_64 */
1715static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1716
1717__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1718 unsigned long max_pfn)
1719{
1720 pmd_t *kernel_pmd;
1721
1722 init_pg_tables_start = __pa(pgd);
1723 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1724 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1725
1726 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1727 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1728
1729 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1730
1731 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1732 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1733 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1734
1735 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1736 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1737 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1738
1739 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1740
1741 xen_write_cr3(__pa(swapper_pg_dir));
1742
1743 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1744
1745 return swapper_pg_dir;
1746}
1747#endif /* CONFIG_X86_64 */
1748
1749static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1750{
1751 pte_t pte;
1752
1753 phys >>= PAGE_SHIFT;
1754
1755 switch (idx) {
1756 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1757#ifdef CONFIG_X86_F00F_BUG
1758 case FIX_F00F_IDT:
1759#endif
1760#ifdef CONFIG_X86_32
1761 case FIX_WP_TEST:
1762 case FIX_VDSO:
1763# ifdef CONFIG_HIGHMEM
1764 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1765# endif
1766#else
1767 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1768#endif
1769#ifdef CONFIG_X86_LOCAL_APIC
1770 case FIX_APIC_BASE: /* maps dummy local APIC */
1771#endif
1772 pte = pfn_pte(phys, prot);
1773 break;
1774
1775 default:
1776 pte = mfn_pte(phys, prot);
1777 break;
1778 }
1779
1780 __native_set_fixmap(idx, pte);
1781
1782#ifdef CONFIG_X86_64
1783 /* Replicate changes to map the vsyscall page into the user
1784 pagetable vsyscall mapping. */
1785 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1786 unsigned long vaddr = __fix_to_virt(idx);
1787 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1788 }
1789#endif
1790}
1791
1792__init void xen_post_allocator_init(void)
1793{
1794 pv_mmu_ops.set_pte = xen_set_pte;
1795 pv_mmu_ops.set_pmd = xen_set_pmd;
1796 pv_mmu_ops.set_pud = xen_set_pud;
1797#if PAGETABLE_LEVELS == 4
1798 pv_mmu_ops.set_pgd = xen_set_pgd;
1799#endif
1800
1801 /* This will work as long as patching hasn't happened yet
1802 (which it hasn't) */
1803 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1804 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1805 pv_mmu_ops.release_pte = xen_release_pte;
1806 pv_mmu_ops.release_pmd = xen_release_pmd;
1807#if PAGETABLE_LEVELS == 4
1808 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1809 pv_mmu_ops.release_pud = xen_release_pud;
1810#endif
1811
1812#ifdef CONFIG_X86_64
1813 SetPagePinned(virt_to_page(level3_user_vsyscall));
1814#endif
1815 xen_mark_init_mm_pinned();
1816}
1817
1818
1819const struct pv_mmu_ops xen_mmu_ops __initdata = {
1820 .pagetable_setup_start = xen_pagetable_setup_start,
1821 .pagetable_setup_done = xen_pagetable_setup_done,
1822
1823 .read_cr2 = xen_read_cr2,
1824 .write_cr2 = xen_write_cr2,
1825
1826 .read_cr3 = xen_read_cr3,
1827 .write_cr3 = xen_write_cr3,
1828
1829 .flush_tlb_user = xen_flush_tlb,
1830 .flush_tlb_kernel = xen_flush_tlb,
1831 .flush_tlb_single = xen_flush_tlb_single,
1832 .flush_tlb_others = xen_flush_tlb_others,
1833
1834 .pte_update = paravirt_nop,
1835 .pte_update_defer = paravirt_nop,
1836
1837 .pgd_alloc = xen_pgd_alloc,
1838 .pgd_free = xen_pgd_free,
1839
1840 .alloc_pte = xen_alloc_pte_init,
1841 .release_pte = xen_release_pte_init,
1842 .alloc_pmd = xen_alloc_pte_init,
1843 .alloc_pmd_clone = paravirt_nop,
1844 .release_pmd = xen_release_pte_init,
1845
1846#ifdef CONFIG_HIGHPTE
1847 .kmap_atomic_pte = xen_kmap_atomic_pte,
1848#endif
1849
1850#ifdef CONFIG_X86_64
1851 .set_pte = xen_set_pte,
1852#else
1853 .set_pte = xen_set_pte_init,
1854#endif
1855 .set_pte_at = xen_set_pte_at,
1856 .set_pmd = xen_set_pmd_hyper,
1857
1858 .ptep_modify_prot_start = __ptep_modify_prot_start,
1859 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1860
1861 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
1862 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
1863
1864 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
1865 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
1866
1867#ifdef CONFIG_X86_PAE
1868 .set_pte_atomic = xen_set_pte_atomic,
1869 .set_pte_present = xen_set_pte_at,
1870 .pte_clear = xen_pte_clear,
1871 .pmd_clear = xen_pmd_clear,
1872#endif /* CONFIG_X86_PAE */
1873 .set_pud = xen_set_pud_hyper,
1874
1875 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
1876 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
1877
1878#if PAGETABLE_LEVELS == 4
1879 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
1880 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
1881 .set_pgd = xen_set_pgd_hyper,
1882
1883 .alloc_pud = xen_alloc_pte_init,
1884 .release_pud = xen_release_pte_init,
1885#endif /* PAGETABLE_LEVELS == 4 */
1886
1887 .activate_mm = xen_activate_mm,
1888 .dup_mmap = xen_dup_mmap,
1889 .exit_mmap = xen_exit_mmap,
1890
1891 .lazy_mode = {
1892 .enter = paravirt_enter_lazy_mmu,
1893 .leave = xen_leave_lazy,
1894 },
1895
1896 .set_fixmap = xen_set_fixmap,
1897};
1898
1899
1155#ifdef CONFIG_XEN_DEBUG_FS 1900#ifdef CONFIG_XEN_DEBUG_FS
1156 1901
1157static struct dentry *d_mmu_debug; 1902static struct dentry *d_mmu_debug;
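Nearly all of the MMU code consolidated into mmu.c funnels through one multicall idiom, visible in the xen_flush_tlb* bodies above: reserve space in the per-cpu batch, fill in an mmuext_op, and issue the batch, which is deferred if we are inside a lazy-MMU section. A condensed sketch of that idiom (the helper name is hypothetical):

    #include <xen/interface/xen.h>
    #include "multicalls.h"

    /* Queue one argument-less mmuext op, batched with any other pending
     * hypercalls (mirrors xen_flush_tlb above). */
    static void queue_mmuext(unsigned int cmd)
    {
            struct mmuext_op *op;
            struct multicall_space mcs;

            preempt_disable();

            mcs = xen_mc_entry(sizeof(*op));        /* reserve batch space */
            op = mcs.args;
            op->cmd = cmd;
            MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

            /* Issues immediately unless in lazy MMU mode, in which case
             * the op is flushed with the rest of the batch. */
            xen_mc_issue(PARAVIRT_LAZY_MMU);

            preempt_enable();
    }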
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 98d71659da5a..24d1b44a337d 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t
54void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 54void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
55 pte_t *ptep, pte_t pte); 55 pte_t *ptep, pte_t pte);
56 56
57unsigned long xen_read_cr2_direct(void);
58
59extern const struct pv_mmu_ops xen_mmu_ops;
57#endif /* _XEN_MMU_H */ 60#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 7735e3dd359c..035582ae815d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -170,7 +170,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
170 170
171 /* We've switched to the "real" per-cpu gdt, so make sure the 171 /* We've switched to the "real" per-cpu gdt, so make sure the
172 old memory can be recycled */ 172 old memory can be recycled */
173 make_lowmem_page_readwrite(&per_cpu_var(gdt_page)); 173 make_lowmem_page_readwrite(xen_initial_gdt);
174 174
175 xen_setup_vcpu_info_placement(); 175 xen_setup_vcpu_info_placement();
176} 176}
@@ -235,6 +235,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
235 ctxt->user_regs.ss = __KERNEL_DS; 235 ctxt->user_regs.ss = __KERNEL_DS;
236#ifdef CONFIG_X86_32 236#ifdef CONFIG_X86_32
237 ctxt->user_regs.fs = __KERNEL_PERCPU; 237 ctxt->user_regs.fs = __KERNEL_PERCPU;
238#else
239 ctxt->gs_base_kernel = per_cpu_offset(cpu);
238#endif 240#endif
239 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 241 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
240 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 242 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
@@ -284,6 +286,9 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
284 irq_ctx_init(cpu); 286 irq_ctx_init(cpu);
285#else 287#else
286 clear_tsk_thread_flag(idle, TIF_FORK); 288 clear_tsk_thread_flag(idle, TIF_FORK);
289 per_cpu(kernel_stack, cpu) =
290 (unsigned long)task_stack_page(idle) -
291 KERNEL_STACK_OFFSET + THREAD_SIZE;
287#endif 292#endif
288 xen_setup_timer(cpu); 293 xen_setup_timer(cpu);
289 xen_init_lock_cpu(cpu); 294 xen_init_lock_cpu(cpu);
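The two 64-bit additions in smp.c exist because x86-64 reaches per-cpu data through the kernel %gs base rather than a segment register loaded with __KERNEL_PERCPU. A sketch of what the new vcpu inherits, condensed from the hunks above (not a complete cpu_initialize_context):

    /* Per-cpu accesses on x86-64 go through the kernel %gs base, so a
     * freshly created vcpu must start with it pointing at its own
     * per-cpu area. */
    ctxt->gs_base_kernel = per_cpu_offset(cpu);

    /* The entry code expects the per-cpu kernel_stack to point into
     * the idle task's stack for this cpu. */
    per_cpu(kernel_stack, cpu) =
            (unsigned long)task_stack_page(idle) -
            KERNEL_STACK_OFFSET + THREAD_SIZE;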
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
new file mode 100644
index 000000000000..4c6f96799131
--- /dev/null
+++ b/arch/x86/xen/xen-asm.S
@@ -0,0 +1,140 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in percpu data) of
10 the operations here; the indirect forms are better handled in
11 C, since they're generally too large to inline anyway.
12 */
13
14#include <asm/asm-offsets.h>
15#include <asm/percpu.h>
16#include <asm/processor-flags.h>
17
18#include "xen-asm.h"
19
20/*
21 Enable events. This clears the event mask and tests the pending
 22 event status with a single 'and' operation. If there are pending
23 events, then enter the hypervisor to get them handled.
24 */
25ENTRY(xen_irq_enable_direct)
26 /* Unmask events */
27 movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
28
29 /* Preempt here doesn't matter because that will deal with
30 any pending interrupts. The pending check may end up being
31 run on the wrong CPU, but that doesn't hurt. */
32
33 /* Test for pending */
34 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
35 jz 1f
36
372: call check_events
381:
39ENDPATCH(xen_irq_enable_direct)
40 ret
41 ENDPROC(xen_irq_enable_direct)
42 RELOC(xen_irq_enable_direct, 2b+1)
43
44
45/*
46 Disabling events is simply a matter of making the event mask
47 non-zero.
48 */
49ENTRY(xen_irq_disable_direct)
50 movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
51ENDPATCH(xen_irq_disable_direct)
52 ret
53 ENDPROC(xen_irq_disable_direct)
54 RELOC(xen_irq_disable_direct, 0)
55
56/*
57 (xen_)save_fl is used to get the current interrupt enable status.
58 Callers expect the status to be in X86_EFLAGS_IF, and other bits
59 may be set in the return value. We take advantage of this by
60 making sure that X86_EFLAGS_IF has the right value (and other bits
61 in that byte are 0), but other bits in the return value are
62 undefined. We need to toggle the state of the bit, because
63 Xen and x86 use opposite senses (mask vs enable).
64 */
65ENTRY(xen_save_fl_direct)
66 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
67 setz %ah
68 addb %ah,%ah
69ENDPATCH(xen_save_fl_direct)
70 ret
71 ENDPROC(xen_save_fl_direct)
72 RELOC(xen_save_fl_direct, 0)
73
74
75/*
 76 In principle the caller should be passing us a value returned
 77 from xen_save_fl_direct, but for robustness' sake we test only
78 the X86_EFLAGS_IF flag rather than the whole byte. After
79 setting the interrupt mask state, it checks for unmasked
80 pending events and enters the hypervisor to get them delivered
81 if so.
82 */
83ENTRY(xen_restore_fl_direct)
84#ifdef CONFIG_X86_64
85 testw $X86_EFLAGS_IF, %di
86#else
87 testb $X86_EFLAGS_IF>>8, %ah
88#endif
89 setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
90 /* Preempt here doesn't matter because that will deal with
91 any pending interrupts. The pending check may end up being
92 run on the wrong CPU, but that doesn't hurt. */
93
94 /* check for unmasked and pending */
95 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
96 jz 1f
972: call check_events
981:
99ENDPATCH(xen_restore_fl_direct)
100 ret
101 ENDPROC(xen_restore_fl_direct)
102 RELOC(xen_restore_fl_direct, 2b+1)
103
104
105/*
106 Force an event check by making a hypercall,
107 but preserve regs before making the call.
108 */
109check_events:
110#ifdef CONFIG_X86_32
111 push %eax
112 push %ecx
113 push %edx
114 call xen_force_evtchn_callback
115 pop %edx
116 pop %ecx
117 pop %eax
118#else
119 push %rax
120 push %rcx
121 push %rdx
122 push %rsi
123 push %rdi
124 push %r8
125 push %r9
126 push %r10
127 push %r11
128 call xen_force_evtchn_callback
129 pop %r11
130 pop %r10
131 pop %r9
132 pop %r8
133 pop %rdi
134 pop %rsi
135 pop %rdx
136 pop %rcx
137 pop %rax
138#endif
139 ret
140
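The setz/addb sequence in xen_save_fl_direct is a branch-free inversion between Xen's event-mask byte and the x86 IF convention, which have opposite senses. Rendered as C (a sketch, not code from the patch):

    #include <xen/interface/xen.h>
    #include <asm/processor-flags.h>

    /* Xen: evtchn_upcall_mask != 0 means event delivery is blocked.
     * x86: X86_EFLAGS_IF set means interrupts are enabled. */
    static unsigned long save_fl_in_c(const struct vcpu_info *v)
    {
            return v->evtchn_upcall_mask ? 0 : X86_EFLAGS_IF;
    }

    static void restore_fl_in_c(struct vcpu_info *v, unsigned long flags)
    {
            v->evtchn_upcall_mask = (flags & X86_EFLAGS_IF) ? 0 : 1;
            /* The real asm then checks evtchn_upcall_pending and calls
             * check_events to force delivery of anything outstanding. */
    }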
diff --git a/arch/x86/xen/xen-asm.h b/arch/x86/xen/xen-asm.h
new file mode 100644
index 000000000000..465276467a47
--- /dev/null
+++ b/arch/x86/xen/xen-asm.h
@@ -0,0 +1,12 @@
1#ifndef _XEN_XEN_ASM_H
2#define _XEN_XEN_ASM_H
3
4#include <linux/linkage.h>
5
6#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
7#define ENDPATCH(x) .globl x##_end; x##_end=.
8
9/* Pseudo-flag used for virtual NMI, which we don't implement yet */
10#define XEN_EFLAGS_NMI 0x80000000
11
12#endif
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 42786f59d9c0..082d173caaf3 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -11,101 +11,28 @@
11 generally too large to inline anyway. 11 generally too large to inline anyway.
12 */ 12 */
13 13
14#include <linux/linkage.h> 14//#include <asm/asm-offsets.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/thread_info.h> 15#include <asm/thread_info.h>
18#include <asm/percpu.h>
19#include <asm/processor-flags.h> 16#include <asm/processor-flags.h>
20#include <asm/segment.h> 17#include <asm/segment.h>
21 18
22#include <xen/interface/xen.h> 19#include <xen/interface/xen.h>
23 20
24#define RELOC(x, v) .globl x##_reloc; x##_reloc=v 21#include "xen-asm.h"
25#define ENDPATCH(x) .globl x##_end; x##_end=.
26
27/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28#define XEN_EFLAGS_NMI 0x80000000
29
30/*
31 Enable events. This clears the event mask and tests the pending
32 event status with one and operation. If there are pending
33 events, then enter the hypervisor to get them handled.
34 */
35ENTRY(xen_irq_enable_direct)
36 /* Unmask events */
37 movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
38
39 /* Preempt here doesn't matter because that will deal with
40 any pending interrupts. The pending check may end up being
41 run on the wrong CPU, but that doesn't hurt. */
42
43 /* Test for pending */
44 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
45 jz 1f
46
472: call check_events
481:
49ENDPATCH(xen_irq_enable_direct)
50 ret
51 ENDPROC(xen_irq_enable_direct)
52 RELOC(xen_irq_enable_direct, 2b+1)
53
54 22
55/* 23/*
56 Disabling events is simply a matter of making the event mask 24 Force an event check by making a hypercall,
57 non-zero. 25 but preserve regs before making the call.
58 */
59ENTRY(xen_irq_disable_direct)
60 movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
61ENDPATCH(xen_irq_disable_direct)
62 ret
63 ENDPROC(xen_irq_disable_direct)
64 RELOC(xen_irq_disable_direct, 0)
65
66/*
67 (xen_)save_fl is used to get the current interrupt enable status.
68 Callers expect the status to be in X86_EFLAGS_IF, and other bits
69 may be set in the return value. We take advantage of this by
70 making sure that X86_EFLAGS_IF has the right value (and other bits
71 in that byte are 0), but other bits in the return value are
72 undefined. We need to toggle the state of the bit, because
73 Xen and x86 use opposite senses (mask vs enable).
74 */
75ENTRY(xen_save_fl_direct)
76 testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
77 setz %ah
78 addb %ah,%ah
79ENDPATCH(xen_save_fl_direct)
80 ret
81 ENDPROC(xen_save_fl_direct)
82 RELOC(xen_save_fl_direct, 0)
83
84
85/*
86 In principle the caller should be passing us a value return
87 from xen_save_fl_direct, but for robustness sake we test only
88 the X86_EFLAGS_IF flag rather than the whole byte. After
89 setting the interrupt mask state, it checks for unmasked
90 pending events and enters the hypervisor to get them delivered
91 if so.
92 */ 26 */
93ENTRY(xen_restore_fl_direct) 27check_events:
94 testb $X86_EFLAGS_IF>>8, %ah 28 push %eax
95 setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask 29 push %ecx
96 /* Preempt here doesn't matter because that will deal with 30 push %edx
97 any pending interrupts. The pending check may end up being 31 call xen_force_evtchn_callback
98 run on the wrong CPU, but that doesn't hurt. */ 32 pop %edx
99 33 pop %ecx
100 /* check for unmasked and pending */ 34 pop %eax
101 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
102 jz 1f
1032: call check_events
1041:
105ENDPATCH(xen_restore_fl_direct)
106 ret 35 ret
107 ENDPROC(xen_restore_fl_direct)
108 RELOC(xen_restore_fl_direct, 2b+1)
109 36
110/* 37/*
111 We can't use sysexit directly, because we're not running in ring0. 38 We can't use sysexit directly, because we're not running in ring0.
@@ -289,17 +216,3 @@ ENTRY(xen_iret_crit_fixup)
289 lea 4(%edi),%esp /* point esp to new frame */ 216 lea 4(%edi),%esp /* point esp to new frame */
2902: jmp xen_do_upcall 2172: jmp xen_do_upcall
291 218
292
293/*
294 Force an event check by making a hypercall,
295 but preserve regs before making the call.
296 */
297check_events:
298 push %eax
299 push %ecx
300 push %edx
301 call xen_force_evtchn_callback
302 pop %edx
303 pop %ecx
304 pop %eax
305 ret
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index d6fc51f4ce85..d205a283efe0 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -11,142 +11,14 @@
11 generally too large to inline anyway. 11 generally too large to inline anyway.
12 */ 12 */
13 13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h> 14#include <asm/errno.h>
19#include <asm/segment.h>
20#include <asm/percpu.h> 15#include <asm/percpu.h>
16#include <asm/processor-flags.h>
17#include <asm/segment.h>
21 18
22#include <xen/interface/xen.h> 19#include <xen/interface/xen.h>
23 20
24#define RELOC(x, v) .globl x##_reloc; x##_reloc=v 21#include "xen-asm.h"
25#define ENDPATCH(x) .globl x##_end; x##_end=.
26
27/* Pseudo-flag used for virtual NMI, which we don't implement yet */
28#define XEN_EFLAGS_NMI 0x80000000
29
30#if 1
31/*
32 FIXME: x86_64 now can support direct access to percpu variables
33 via a segment override. Update xen accordingly.
34 */
35#define BUG ud2a
36#endif
37
38/*
39 Enable events. This clears the event mask and tests the pending
40 event status with one and operation. If there are pending
41 events, then enter the hypervisor to get them handled.
42 */
43ENTRY(xen_irq_enable_direct)
44 BUG
45
46 /* Unmask events */
47 movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
48
49 /* Preempt here doesn't matter because that will deal with
50 any pending interrupts. The pending check may end up being
51 run on the wrong CPU, but that doesn't hurt. */
52
53 /* Test for pending */
54 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
55 jz 1f
56
572: call check_events
581:
59ENDPATCH(xen_irq_enable_direct)
60 ret
61 ENDPROC(xen_irq_enable_direct)
62 RELOC(xen_irq_enable_direct, 2b+1)
63
64/*
65 Disabling events is simply a matter of making the event mask
66 non-zero.
67 */
68ENTRY(xen_irq_disable_direct)
69 BUG
70
71 movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
72ENDPATCH(xen_irq_disable_direct)
73 ret
74 ENDPROC(xen_irq_disable_direct)
75 RELOC(xen_irq_disable_direct, 0)
76
77/*
78 (xen_)save_fl is used to get the current interrupt enable status.
79 Callers expect the status to be in X86_EFLAGS_IF, and other bits
80 may be set in the return value. We take advantage of this by
81 making sure that X86_EFLAGS_IF has the right value (and other bits
82 in that byte are 0), but other bits in the return value are
83 undefined. We need to toggle the state of the bit, because
84 Xen and x86 use opposite senses (mask vs enable).
85 */
86ENTRY(xen_save_fl_direct)
87 BUG
88
89 testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
90 setz %ah
91 addb %ah,%ah
92ENDPATCH(xen_save_fl_direct)
93 ret
94 ENDPROC(xen_save_fl_direct)
95 RELOC(xen_save_fl_direct, 0)
96
97/*
98 In principle the caller should be passing us a value return
99 from xen_save_fl_direct, but for robustness sake we test only
100 the X86_EFLAGS_IF flag rather than the whole byte. After
101 setting the interrupt mask state, it checks for unmasked
102 pending events and enters the hypervisor to get them delivered
103 if so.
104 */
105ENTRY(xen_restore_fl_direct)
106 BUG
107
108 testb $X86_EFLAGS_IF>>8, %ah
109 setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
110 /* Preempt here doesn't matter because that will deal with
111 any pending interrupts. The pending check may end up being
112 run on the wrong CPU, but that doesn't hurt. */
113
114 /* check for unmasked and pending */
115 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
116 jz 1f
1172: call check_events
1181:
119ENDPATCH(xen_restore_fl_direct)
120 ret
121 ENDPROC(xen_restore_fl_direct)
122 RELOC(xen_restore_fl_direct, 2b+1)
123
124
125/*
126 Force an event check by making a hypercall,
127 but preserve regs before making the call.
128 */
129check_events:
130 push %rax
131 push %rcx
132 push %rdx
133 push %rsi
134 push %rdi
135 push %r8
136 push %r9
137 push %r10
138 push %r11
139 call xen_force_evtchn_callback
140 pop %r11
141 pop %r10
142 pop %r9
143 pop %r8
144 pop %rdi
145 pop %rsi
146 pop %rdx
147 pop %rcx
148 pop %rax
149 ret
150 22
151ENTRY(xen_adjust_exception_frame) 23ENTRY(xen_adjust_exception_frame)
152 mov 8+0(%rsp),%rcx 24 mov 8+0(%rsp),%rcx
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index c1f8faf0a2c5..2f5ef2632ea2 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -10,9 +10,12 @@
10extern const char xen_hypervisor_callback[]; 10extern const char xen_hypervisor_callback[];
11extern const char xen_failsafe_callback[]; 11extern const char xen_failsafe_callback[];
12 12
13extern void *xen_initial_gdt;
14
13struct trap_info; 15struct trap_info;
14void xen_copy_trap_info(struct trap_info *traps); 16void xen_copy_trap_info(struct trap_info *traps);
15 17
18DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
16DECLARE_PER_CPU(unsigned long, xen_cr3); 19DECLARE_PER_CPU(unsigned long, xen_cr3);
17DECLARE_PER_CPU(unsigned long, xen_current_cr3); 20DECLARE_PER_CPU(unsigned long, xen_current_cr3);
18 21
@@ -22,6 +25,13 @@ extern struct shared_info *HYPERVISOR_shared_info;
22 25
23void xen_setup_mfn_list_list(void); 26void xen_setup_mfn_list_list(void);
24void xen_setup_shared_info(void); 27void xen_setup_shared_info(void);
28void xen_setup_machphys_mapping(void);
29pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
30void xen_ident_map_ISA(void);
31void xen_reserve_top(void);
32
33void xen_leave_lazy(void);
34void xen_post_allocator_init(void);
25 35
26char * __init xen_memory_setup(void); 36char * __init xen_memory_setup(void);
27void __init xen_arch_setup(void); 37void __init xen_arch_setup(void);
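Exporting the xen_vcpu_info declaration lets C code touch the same per-cpu structure the assembly reaches through PER_CPU_VAR. A minimal sketch, assuming the usual per-cpu accessors; the function body is illustrative, not this patch's implementation:

    /* Sketch: C-side equivalent of "movb $1, ...+XEN_vcpu_info_mask". */
    static void xen_irq_disable_sketch(void)
    {
            __get_cpu_var(xen_vcpu_info).evtchn_upcall_mask = 1;
    }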
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index f3180a85c66a..5406e70aba86 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -445,24 +445,22 @@
445 * section in the linker script will go there too. @phdr should have 445 * section in the linker script will go there too. @phdr should have
446 * a leading colon. 446 * a leading colon.
447 * 447 *
448 * This macro defines three symbols, __per_cpu_load, __per_cpu_start 448 * Note that this macro defines __per_cpu_load as an absolute symbol.
449 * and __per_cpu_end. The first one is the vaddr of loaded percpu 449 * If there is no need to put the percpu section at a predetermined
450 * init data. __per_cpu_start equals @vaddr and __per_cpu_end is the 450 * address, use PERCPU().
451 * end offset.
452 */ 451 */
453#define PERCPU_VADDR(vaddr, phdr) \ 452#define PERCPU_VADDR(vaddr, phdr) \
454 VMLINUX_SYMBOL(__per_cpu_load_abs) = .; \ 453 VMLINUX_SYMBOL(__per_cpu_load) = .; \
455 .data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load_abs) \ 454 .data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load) \
456 - LOAD_OFFSET) { \ 455 - LOAD_OFFSET) { \
457 VMLINUX_SYMBOL(__per_cpu_start) = .; \ 456 VMLINUX_SYMBOL(__per_cpu_start) = .; \
458 VMLINUX_SYMBOL(__per_cpu_load) = LOADADDR(.data.percpu) + LOAD_OFFSET;\
459 *(.data.percpu.first) \ 457 *(.data.percpu.first) \
460 *(.data.percpu.page_aligned) \ 458 *(.data.percpu.page_aligned) \
461 *(.data.percpu) \ 459 *(.data.percpu) \
462 *(.data.percpu.shared_aligned) \ 460 *(.data.percpu.shared_aligned) \
463 VMLINUX_SYMBOL(__per_cpu_end) = .; \ 461 VMLINUX_SYMBOL(__per_cpu_end) = .; \
464 } phdr \ 462 } phdr \
465 . = VMLINUX_SYMBOL(__per_cpu_load_abs) + SIZEOF(.data.percpu); 463 . = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu);
466 464
467/** 465/**
468 * PERCPU - define output section for percpu area, simple version 466 * PERCPU - define output section for percpu area, simple version
@@ -471,7 +469,20 @@
471 * Aligns to @align and outputs the output section for the percpu 469 * Aligns to @align and outputs the output section for the percpu
472 * area. This macro doesn't manipulate @vaddr or @phdr; __per_cpu_load and 470 * area. This macro doesn't manipulate @vaddr or @phdr; __per_cpu_load and
473 * __per_cpu_start will be identical. 471 * __per_cpu_start will be identical.
472 *
473 * This macro is equivalent to ALIGN(align); PERCPU_VADDR( , ) except
474 * that __per_cpu_load is defined as a relative symbol against
475 * .data.percpu, which is required for the relocatable x86_32
476 * configuration.
474 */ 477 */
475#define PERCPU(align) \ 478#define PERCPU(align) \
476 . = ALIGN(align); \ 479 . = ALIGN(align); \
477 PERCPU_VADDR( , ) 480 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \
481 VMLINUX_SYMBOL(__per_cpu_load) = .; \
482 VMLINUX_SYMBOL(__per_cpu_start) = .; \
483 *(.data.percpu.first) \
484 *(.data.percpu.page_aligned) \
485 *(.data.percpu) \
486 *(.data.percpu.shared_aligned) \
487 VMLINUX_SYMBOL(__per_cpu_end) = .; \
488 }
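As a usage sketch, the two macros would be invoked from an architecture's linker script (a cpp-processed vmlinux.lds.S) roughly as below; the fixed address and the :percpu program header name are assumptions for illustration, not taken from this diff:

    /* Simple case: percpu area linked in place, page aligned. */
    PERCPU(4096)

    /* Fixed-address case: a zero-based percpu segment in its own
     * program header; __per_cpu_load then marks the load address. */
    PERCPU_VADDR(0, :percpu)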