summaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-09-08 14:46:48 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2015-09-08 14:46:48 -0400
commit752240e74d650faa24425adc523f1308973ea51c (patch)
tree47657b7d468352424f844156883302653252f70e /arch
parentb8cb642af98216fe6eeca1525345b8a5c9d7c9a4 (diff)
parent626d7508664c4bc8e67f496da4387ecd0c410b8c (diff)
Merge tag 'for-linus-4.3-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
Pull xen updates from David Vrabel: "Xen features and fixes for 4.3: - Convert xen-blkfront to the multiqueue API - [arm] Support binding event channels to different VCPUs. - [x86] Support > 512 GiB in a PV guests (off by default as such a guest cannot be migrated with the current toolstack). - [x86] PMU support for PV dom0 (limited support for using perf with Xen and other guests)" * tag 'for-linus-4.3-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (33 commits) xen: switch extra memory accounting to use pfns xen: limit memory to architectural maximum xen: avoid another early crash of memory limited dom0 xen: avoid early crash of memory limited dom0 arm/xen: Remove helpers which are PV specific xen/x86: Don't try to set PCE bit in CR4 xen/PMU: PMU emulation code xen/PMU: Intercept PMU-related MSR and APIC accesses xen/PMU: Describe vendor-specific PMU registers xen/PMU: Initialization code for Xen PMU xen/PMU: Sysfs interface for setting Xen PMU mode xen: xensyms support xen: remove no longer needed p2m.h xen: allow more than 512 GB of RAM for 64 bit pv-domains xen: move p2m list if conflicting with e820 map xen: add explicit memblock_reserve() calls for special pages mm: provide early_memremap_ro to establish read-only mapping xen: check for initrd conflicting with e820 map xen: check pre-allocated page tables for conflict with memory map xen: check for kernel memory conflicting with memory layout ...
Diffstat (limited to 'arch')
-rw-r--r--arch/arm/include/asm/xen/events.h6
-rw-r--r--arch/arm/include/asm/xen/page.h16
-rw-r--r--arch/arm/xen/enlighten.c7
-rw-r--r--arch/arm64/include/asm/xen/events.h6
-rw-r--r--arch/x86/include/asm/xen/events.h11
-rw-r--r--arch/x86/include/asm/xen/hypercall.h6
-rw-r--r--arch/x86/include/asm/xen/interface.h219
-rw-r--r--arch/x86/include/asm/xen/page.h8
-rw-r--r--arch/x86/xen/Kconfig21
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/apic.c6
-rw-r--r--arch/x86/xen/enlighten.c20
-rw-r--r--arch/x86/xen/mmu.c399
-rw-r--r--arch/x86/xen/p2m.c43
-rw-r--r--arch/x86/xen/p2m.h15
-rw-r--r--arch/x86/xen/platform-pci-unplug.c2
-rw-r--r--arch/x86/xen/pmu.c570
-rw-r--r--arch/x86/xen/pmu.h15
-rw-r--r--arch/x86/xen/setup.c496
-rw-r--r--arch/x86/xen/smp.c29
-rw-r--r--arch/x86/xen/suspend.c23
-rw-r--r--arch/x86/xen/xen-head.S2
-rw-r--r--arch/x86/xen/xen-ops.h7
23 files changed, 1679 insertions, 250 deletions
diff --git a/arch/arm/include/asm/xen/events.h b/arch/arm/include/asm/xen/events.h
index 8b1f37bfeeec..71e473d05fcc 100644
--- a/arch/arm/include/asm/xen/events.h
+++ b/arch/arm/include/asm/xen/events.h
@@ -20,4 +20,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
20 atomic64_t, \ 20 atomic64_t, \
21 counter), (val)) 21 counter), (val))
22 22
23/* Rebind event channel is supported by default */
24static inline bool xen_support_evtchn_rebind(void)
25{
26 return true;
27}
28
23#endif /* _ASM_ARM_XEN_EVENTS_H */ 29#endif /* _ASM_ARM_XEN_EVENTS_H */
diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h
index 1bee8ca12494..98b1084f8282 100644
--- a/arch/arm/include/asm/xen/page.h
+++ b/arch/arm/include/asm/xen/page.h
@@ -54,26 +54,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
54 54
55#define mfn_to_local_pfn(mfn) mfn_to_pfn(mfn) 55#define mfn_to_local_pfn(mfn) mfn_to_pfn(mfn)
56 56
57static inline xmaddr_t phys_to_machine(xpaddr_t phys)
58{
59 unsigned offset = phys.paddr & ~PAGE_MASK;
60 return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
61}
62
63static inline xpaddr_t machine_to_phys(xmaddr_t machine)
64{
65 unsigned offset = machine.maddr & ~PAGE_MASK;
66 return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
67}
68/* VIRT <-> MACHINE conversion */ 57/* VIRT <-> MACHINE conversion */
69#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
70#define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) 58#define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v)))
71#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) 59#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
72 60
61/* Only used in PV code. But ARM guests are always HVM. */
73static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr) 62static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr)
74{ 63{
75 /* TODO: assuming it is mapped in the kernel 1:1 */ 64 BUG();
76 return virt_to_machine(vaddr);
77} 65}
78 66
79/* TODO: this shouldn't be here but it is because the frontend drivers 67/* TODO: this shouldn't be here but it is because the frontend drivers
diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c
index 6c09cc440a2b..c50c8d33f874 100644
--- a/arch/arm/xen/enlighten.c
+++ b/arch/arm/xen/enlighten.c
@@ -45,13 +45,6 @@ static struct vcpu_info __percpu *xen_vcpu_info;
45unsigned long xen_released_pages; 45unsigned long xen_released_pages;
46struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; 46struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
47 47
48/* TODO: to be removed */
49__read_mostly int xen_have_vector_callback;
50EXPORT_SYMBOL_GPL(xen_have_vector_callback);
51
52int xen_platform_pci_unplug = XEN_UNPLUG_ALL;
53EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
54
55static __read_mostly unsigned int xen_events_irq; 48static __read_mostly unsigned int xen_events_irq;
56 49
57static __initdata struct device_node *xen_node; 50static __initdata struct device_node *xen_node;
diff --git a/arch/arm64/include/asm/xen/events.h b/arch/arm64/include/asm/xen/events.h
index 86553213c132..4318866d053c 100644
--- a/arch/arm64/include/asm/xen/events.h
+++ b/arch/arm64/include/asm/xen/events.h
@@ -18,4 +18,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
18 18
19#define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) 19#define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
20 20
21/* Rebind event channel is supported by default */
22static inline bool xen_support_evtchn_rebind(void)
23{
24 return true;
25}
26
21#endif /* _ASM_ARM64_XEN_EVENTS_H */ 27#endif /* _ASM_ARM64_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index 608a79d5a466..e6911caf5bbf 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
20/* No need for a barrier -- XCHG is a barrier on x86. */ 20/* No need for a barrier -- XCHG is a barrier on x86. */
21#define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) 21#define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
22 22
23extern int xen_have_vector_callback;
24
25/*
26 * Events delivered via platform PCI interrupts are always
27 * routed to vcpu 0 and hence cannot be rebound.
28 */
29static inline bool xen_support_evtchn_rebind(void)
30{
31 return (!xen_hvm_domain() || xen_have_vector_callback);
32}
33
23#endif /* _ASM_X86_XEN_EVENTS_H */ 34#endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index ca08a27b90b3..83aea8055119 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -465,6 +465,12 @@ HYPERVISOR_tmem_op(
465 return _hypercall1(int, tmem_op, op); 465 return _hypercall1(int, tmem_op, op);
466} 466}
467 467
468static inline int
469HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
470{
471 return _hypercall2(int, xenpmu_op, op, arg);
472}
473
468static inline void 474static inline void
469MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) 475MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
470{ 476{
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 3400dbaec3c3..62ca03ef5c65 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -3,12 +3,38 @@
3 * 3 *
4 * Guest OS interface to x86 Xen. 4 * Guest OS interface to x86 Xen.
5 * 5 *
6 * Copyright (c) 2004, K A Fraser 6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 *
24 * Copyright (c) 2004-2006, K A Fraser
7 */ 25 */
8 26
9#ifndef _ASM_X86_XEN_INTERFACE_H 27#ifndef _ASM_X86_XEN_INTERFACE_H
10#define _ASM_X86_XEN_INTERFACE_H 28#define _ASM_X86_XEN_INTERFACE_H
11 29
30/*
31 * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field
32 * in a struct in memory.
33 * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an
34 * hypercall argument.
35 * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but
36 * they might not be on other architectures.
37 */
12#ifdef __XEN__ 38#ifdef __XEN__
13#define __DEFINE_GUEST_HANDLE(name, type) \ 39#define __DEFINE_GUEST_HANDLE(name, type) \
14 typedef struct { type *p; } __guest_handle_ ## name 40 typedef struct { type *p; } __guest_handle_ ## name
@@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t);
88 * start of the GDT because some stupid OSes export hard-coded selector values 114 * start of the GDT because some stupid OSes export hard-coded selector values
89 * in their ABI. These hard-coded values are always near the start of the GDT, 115 * in their ABI. These hard-coded values are always near the start of the GDT,
90 * so Xen places itself out of the way, at the far end of the GDT. 116 * so Xen places itself out of the way, at the far end of the GDT.
117 *
118 * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op
91 */ 119 */
92#define FIRST_RESERVED_GDT_PAGE 14 120#define FIRST_RESERVED_GDT_PAGE 14
93#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) 121#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
94#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) 122#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
95 123
96/* 124/*
97 * Send an array of these to HYPERVISOR_set_trap_table() 125 * Send an array of these to HYPERVISOR_set_trap_table().
126 * Terminate the array with a sentinel entry, with traps[].address==0.
98 * The privilege level specifies which modes may enter a trap via a software 127 * The privilege level specifies which modes may enter a trap via a software
99 * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate 128 * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
100 * privilege levels as follows: 129 * privilege levels as follows:
@@ -118,10 +147,41 @@ struct trap_info {
118DEFINE_GUEST_HANDLE_STRUCT(trap_info); 147DEFINE_GUEST_HANDLE_STRUCT(trap_info);
119 148
120struct arch_shared_info { 149struct arch_shared_info {
121 unsigned long max_pfn; /* max pfn that appears in table */ 150 /*
122 /* Frame containing list of mfns containing list of mfns containing p2m. */ 151 * Number of valid entries in the p2m table(s) anchored at
123 unsigned long pfn_to_mfn_frame_list_list; 152 * pfn_to_mfn_frame_list_list and/or p2m_vaddr.
124 unsigned long nmi_reason; 153 */
154 unsigned long max_pfn;
155 /*
156 * Frame containing list of mfns containing list of mfns containing p2m.
157 * A value of 0 indicates it has not yet been set up, ~0 indicates it
158 * has been set to invalid e.g. due to the p2m being too large for the
159 * 3-level p2m tree. In this case the linear mapper p2m list anchored
160 * at p2m_vaddr is to be used.
161 */
162 xen_pfn_t pfn_to_mfn_frame_list_list;
163 unsigned long nmi_reason;
164 /*
165 * Following three fields are valid if p2m_cr3 contains a value
166 * different from 0.
167 * p2m_cr3 is the root of the address space where p2m_vaddr is valid.
168 * p2m_cr3 is in the same format as a cr3 value in the vcpu register
169 * state and holds the folded machine frame number (via xen_pfn_to_cr3)
170 * of a L3 or L4 page table.
171 * p2m_vaddr holds the virtual address of the linear p2m list. All
172 * entries in the range [0...max_pfn[ are accessible via this pointer.
173 * p2m_generation will be incremented by the guest before and after each
174 * change of the mappings of the p2m list. p2m_generation starts at 0
175 * and a value with the least significant bit set indicates that a
176 * mapping update is in progress. This allows guest external software
177 * (e.g. in Dom0) to verify that read mappings are consistent and
178 * whether they have changed since the last check.
179 * Modifying a p2m element in the linear p2m list is allowed via an
180 * atomic write only.
181 */
182 unsigned long p2m_cr3; /* cr3 value of the p2m address space */
183 unsigned long p2m_vaddr; /* virtual address of the p2m list */
184 unsigned long p2m_generation; /* generation count of p2m mapping */
125}; 185};
126#endif /* !__ASSEMBLY__ */ 186#endif /* !__ASSEMBLY__ */
127 187
@@ -137,13 +197,31 @@ struct arch_shared_info {
137/* 197/*
138 * The following is all CPU context. Note that the fpu_ctxt block is filled 198 * The following is all CPU context. Note that the fpu_ctxt block is filled
139 * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. 199 * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
200 *
201 * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise
202 * for HVM and PVH guests, not all information in this structure is updated:
203 *
204 * - For HVM guests, the structures read include: fpu_ctxt (if
205 * VGCT_I387_VALID is set), flags, user_regs, debugreg[*]
206 *
207 * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to
208 * set cr3. All other fields not used should be set to 0.
140 */ 209 */
141struct vcpu_guest_context { 210struct vcpu_guest_context {
142 /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ 211 /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
143 struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */ 212 struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
144#define VGCF_I387_VALID (1<<0) 213#define VGCF_I387_VALID (1<<0)
145#define VGCF_HVM_GUEST (1<<1) 214#define VGCF_IN_KERNEL (1<<2)
146#define VGCF_IN_KERNEL (1<<2) 215#define _VGCF_i387_valid 0
216#define VGCF_i387_valid (1<<_VGCF_i387_valid)
217#define _VGCF_in_kernel 2
218#define VGCF_in_kernel (1<<_VGCF_in_kernel)
219#define _VGCF_failsafe_disables_events 3
220#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events)
221#define _VGCF_syscall_disables_events 4
222#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events)
223#define _VGCF_online 5
224#define VGCF_online (1<<_VGCF_online)
147 unsigned long flags; /* VGCF_* flags */ 225 unsigned long flags; /* VGCF_* flags */
148 struct cpu_user_regs user_regs; /* User-level CPU registers */ 226 struct cpu_user_regs user_regs; /* User-level CPU registers */
149 struct trap_info trap_ctxt[256]; /* Virtual IDT */ 227 struct trap_info trap_ctxt[256]; /* Virtual IDT */
@@ -172,6 +250,129 @@ struct vcpu_guest_context {
172#endif 250#endif
173}; 251};
174DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context); 252DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
253
254/* AMD PMU registers and structures */
255struct xen_pmu_amd_ctxt {
256 /*
257 * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd).
258 * For PV(H) guests these fields are RO.
259 */
260 uint32_t counters;
261 uint32_t ctrls;
262
263 /* Counter MSRs */
264#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
265 uint64_t regs[];
266#elif defined(__GNUC__)
267 uint64_t regs[0];
268#endif
269};
270
271/* Intel PMU registers and structures */
272struct xen_pmu_cntr_pair {
273 uint64_t counter;
274 uint64_t control;
275};
276
277struct xen_pmu_intel_ctxt {
278 /*
279 * Offsets to fixed and architectural counter MSRs (relative to
280 * xen_pmu_arch.c.intel).
281 * For PV(H) guests these fields are RO.
282 */
283 uint32_t fixed_counters;
284 uint32_t arch_counters;
285
286 /* PMU registers */
287 uint64_t global_ctrl;
288 uint64_t global_ovf_ctrl;
289 uint64_t global_status;
290 uint64_t fixed_ctrl;
291 uint64_t ds_area;
292 uint64_t pebs_enable;
293 uint64_t debugctl;
294
295 /* Fixed and architectural counter MSRs */
296#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
297 uint64_t regs[];
298#elif defined(__GNUC__)
299 uint64_t regs[0];
300#endif
301};
302
303/* Sampled domain's registers */
304struct xen_pmu_regs {
305 uint64_t ip;
306 uint64_t sp;
307 uint64_t flags;
308 uint16_t cs;
309 uint16_t ss;
310 uint8_t cpl;
311 uint8_t pad[3];
312};
313
314/* PMU flags */
315#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */
316#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */
317#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */
318#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */
319
320/*
321 * Architecture-specific information describing state of the processor at
322 * the time of PMU interrupt.
323 * Fields of this structure marked as RW for guest should only be written by
324 * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the
325 * hypervisor during PMU interrupt). Hypervisor will read updated data in
326 * XENPMU_flush hypercall and clear PMU_CACHED bit.
327 */
328struct xen_pmu_arch {
329 union {
330 /*
331 * Processor's registers at the time of interrupt.
332 * WO for hypervisor, RO for guests.
333 */
334 struct xen_pmu_regs regs;
335 /*
336 * Padding for adding new registers to xen_pmu_regs in
337 * the future
338 */
339#define XENPMU_REGS_PAD_SZ 64
340 uint8_t pad[XENPMU_REGS_PAD_SZ];
341 } r;
342
343 /* WO for hypervisor, RO for guest */
344 uint64_t pmu_flags;
345
346 /*
347 * APIC LVTPC register.
348 * RW for both hypervisor and guest.
349 * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware
350 * during XENPMU_flush or XENPMU_lvtpc_set.
351 */
352 union {
353 uint32_t lapic_lvtpc;
354 uint64_t pad;
355 } l;
356
357 /*
358 * Vendor-specific PMU registers.
359 * RW for both hypervisor and guest (see exceptions above).
360 * Guest's updates to this field are verified and then loaded by the
361 * hypervisor into hardware during XENPMU_flush
362 */
363 union {
364 struct xen_pmu_amd_ctxt amd;
365 struct xen_pmu_intel_ctxt intel;
366
367 /*
368 * Padding for contexts (fixed parts only, does not include
369 * MSR banks that are specified by offsets)
370 */
371#define XENPMU_CTXT_PAD_SZ 128
372 uint8_t pad[XENPMU_CTXT_PAD_SZ];
373 } c;
374};
375
175#endif /* !__ASSEMBLY__ */ 376#endif /* !__ASSEMBLY__ */
176 377
177/* 378/*
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c44a5d53e464..a3804fbe1f36 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -35,9 +35,7 @@ typedef struct xpaddr {
35#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) 35#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
36#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) 36#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT)
37 37
38/* Maximum amount of memory we can handle in a domain in pages */ 38#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
39#define MAX_DOMAIN_PAGES \
40 ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
41 39
42extern unsigned long *machine_to_phys_mapping; 40extern unsigned long *machine_to_phys_mapping;
43extern unsigned long machine_to_phys_nr; 41extern unsigned long machine_to_phys_nr;
@@ -48,8 +46,8 @@ extern unsigned long xen_max_p2m_pfn;
48extern unsigned long get_phys_to_machine(unsigned long pfn); 46extern unsigned long get_phys_to_machine(unsigned long pfn);
49extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); 47extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
50extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); 48extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
51extern unsigned long set_phys_range_identity(unsigned long pfn_s, 49extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
52 unsigned long pfn_e); 50 unsigned long pfn_e);
53 51
54extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, 52extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
55 struct gnttab_map_grant_ref *kmap_ops, 53 struct gnttab_map_grant_ref *kmap_ops,
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 484145368a24..c7b15f3e2cf3 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -7,6 +7,7 @@ config XEN
7 depends on PARAVIRT 7 depends on PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 select XEN_HAVE_PVMMU 9 select XEN_HAVE_PVMMU
10 select XEN_HAVE_VPMU
10 depends on X86_64 || (X86_32 && X86_PAE) 11 depends on X86_64 || (X86_32 && X86_PAE)
11 depends on X86_LOCAL_APIC && X86_TSC 12 depends on X86_LOCAL_APIC && X86_TSC
12 help 13 help
@@ -23,14 +24,18 @@ config XEN_PVHVM
23 def_bool y 24 def_bool y
24 depends on XEN && PCI && X86_LOCAL_APIC 25 depends on XEN && PCI && X86_LOCAL_APIC
25 26
26config XEN_MAX_DOMAIN_MEMORY 27config XEN_512GB
27 int 28 bool "Limit Xen pv-domain memory to 512GB"
28 default 500 if X86_64 29 depends on XEN && X86_64
29 default 64 if X86_32 30 default y
30 depends on XEN 31 help
31 help 32 Limit paravirtualized user domains to 512GB of RAM.
32 This only affects the sizing of some bss arrays, the unused 33
33 portions of which are freed. 34 The Xen tools and crash dump analysis tools might not support
35 pv-domains with more than 512 GB of RAM. This option controls the
36 default setting of the kernel to use only up to 512 GB or more.
37 It is always possible to change the default via specifying the
38 boot parameter "xen_512gb_limit".
34 39
35config XEN_SAVE_RESTORE 40config XEN_SAVE_RESTORE
36 bool 41 bool
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 4b6e29ac0968..e47e52787d32 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -13,7 +13,7 @@ CFLAGS_mmu.o := $(nostackp)
13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
14 time.o xen-asm.o xen-asm_$(BITS).o \ 14 time.o xen-asm.o xen-asm_$(BITS).o \
15 grant-table.o suspend.o platform-pci-unplug.o \ 15 grant-table.o suspend.o platform-pci-unplug.o \
16 p2m.o apic.o 16 p2m.o apic.o pmu.o
17 17
18obj-$(CONFIG_EVENT_TRACING) += trace.o 18obj-$(CONFIG_EVENT_TRACING) += trace.o
19 19
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 70e060ad879a..acda713ab5be 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -7,6 +7,7 @@
7#include <xen/xen.h> 7#include <xen/xen.h>
8#include <xen/interface/physdev.h> 8#include <xen/interface/physdev.h>
9#include "xen-ops.h" 9#include "xen-ops.h"
10#include "pmu.h"
10#include "smp.h" 11#include "smp.h"
11 12
12static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) 13static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
@@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg)
72 73
73static void xen_apic_write(u32 reg, u32 val) 74static void xen_apic_write(u32 reg, u32 val)
74{ 75{
76 if (reg == APIC_LVTPC) {
77 (void)pmu_apic_update(reg);
78 return;
79 }
80
75 /* Warn to see if there's any stray references */ 81 /* Warn to see if there's any stray references */
76 WARN(1,"register: %x, value: %x\n", reg, val); 82 WARN(1,"register: %x, value: %x\n", reg, val);
77} 83}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index d9cfa452da9d..30d12afe52ed 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -84,6 +84,7 @@
84#include "mmu.h" 84#include "mmu.h"
85#include "smp.h" 85#include "smp.h"
86#include "multicalls.h" 86#include "multicalls.h"
87#include "pmu.h"
87 88
88EXPORT_SYMBOL_GPL(hypercall_page); 89EXPORT_SYMBOL_GPL(hypercall_page);
89 90
@@ -1010,8 +1011,7 @@ static void xen_write_cr0(unsigned long cr0)
1010 1011
1011static void xen_write_cr4(unsigned long cr4) 1012static void xen_write_cr4(unsigned long cr4)
1012{ 1013{
1013 cr4 &= ~X86_CR4_PGE; 1014 cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
1014 cr4 &= ~X86_CR4_PSE;
1015 1015
1016 native_write_cr4(cr4); 1016 native_write_cr4(cr4);
1017} 1017}
@@ -1030,6 +1030,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
1030{ 1030{
1031 u64 val; 1031 u64 val;
1032 1032
1033 if (pmu_msr_read(msr, &val, err))
1034 return val;
1035
1033 val = native_read_msr_safe(msr, err); 1036 val = native_read_msr_safe(msr, err);
1034 switch (msr) { 1037 switch (msr) {
1035 case MSR_IA32_APICBASE: 1038 case MSR_IA32_APICBASE:
@@ -1076,7 +1079,8 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
1076 Xen console noise. */ 1079 Xen console noise. */
1077 1080
1078 default: 1081 default:
1079 ret = native_write_msr_safe(msr, low, high); 1082 if (!pmu_msr_write(msr, low, high, &ret))
1083 ret = native_write_msr_safe(msr, low, high);
1080 } 1084 }
1081 1085
1082 return ret; 1086 return ret;
@@ -1215,7 +1219,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
1215 .read_msr = xen_read_msr_safe, 1219 .read_msr = xen_read_msr_safe,
1216 .write_msr = xen_write_msr_safe, 1220 .write_msr = xen_write_msr_safe,
1217 1221
1218 .read_pmc = native_read_pmc, 1222 .read_pmc = xen_read_pmc,
1219 1223
1220 .iret = xen_iret, 1224 .iret = xen_iret,
1221#ifdef CONFIG_X86_64 1225#ifdef CONFIG_X86_64
@@ -1264,6 +1268,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = {
1264static void xen_reboot(int reason) 1268static void xen_reboot(int reason)
1265{ 1269{
1266 struct sched_shutdown r = { .reason = reason }; 1270 struct sched_shutdown r = { .reason = reason };
1271 int cpu;
1272
1273 for_each_online_cpu(cpu)
1274 xen_pmu_finish(cpu);
1267 1275
1268 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) 1276 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
1269 BUG(); 1277 BUG();
@@ -1607,7 +1615,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
1607 early_boot_irqs_disabled = true; 1615 early_boot_irqs_disabled = true;
1608 1616
1609 xen_raw_console_write("mapping kernel into physical memory\n"); 1617 xen_raw_console_write("mapping kernel into physical memory\n");
1610 xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); 1618 xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
1619 xen_start_info->nr_pages);
1620 xen_reserve_special_pages();
1611 1621
1612 /* 1622 /*
1613 * Modify the cache mode translation tables to match Xen's PAT 1623 * Modify the cache mode translation tables to match Xen's PAT
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index dd151b2045b0..2c50b445884e 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
116DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ 116DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
117DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ 117DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
118 118
119static phys_addr_t xen_pt_base, xen_pt_size __initdata;
119 120
120/* 121/*
121 * Just beyond the highest usermode address. STACK_TOP_MAX has a 122 * Just beyond the highest usermode address. STACK_TOP_MAX has a
@@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm)
1093 1094
1094static void xen_post_allocator_init(void); 1095static void xen_post_allocator_init(void);
1095 1096
1097static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1098{
1099 struct mmuext_op op;
1100
1101 op.cmd = cmd;
1102 op.arg1.mfn = pfn_to_mfn(pfn);
1103 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1104 BUG();
1105}
1106
1096#ifdef CONFIG_X86_64 1107#ifdef CONFIG_X86_64
1097static void __init xen_cleanhighmap(unsigned long vaddr, 1108static void __init xen_cleanhighmap(unsigned long vaddr,
1098 unsigned long vaddr_end) 1109 unsigned long vaddr_end)
@@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
1114 xen_mc_flush(); 1125 xen_mc_flush();
1115} 1126}
1116 1127
1128/*
1129 * Make a page range writeable and free it.
1130 */
1131static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
1132{
1133 void *vaddr = __va(paddr);
1134 void *vaddr_end = vaddr + size;
1135
1136 for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
1137 make_lowmem_page_readwrite(vaddr);
1138
1139 memblock_free(paddr, size);
1140}
1141
1142static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
1143{
1144 unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
1145
1146 if (unpin)
1147 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
1148 ClearPagePinned(virt_to_page(__va(pa)));
1149 xen_free_ro_pages(pa, PAGE_SIZE);
1150}
1151
1152/*
1153 * Since it is well isolated we can (and since it is perhaps large we should)
1154 * also free the page tables mapping the initial P->M table.
1155 */
1156static void __init xen_cleanmfnmap(unsigned long vaddr)
1157{
1158 unsigned long va = vaddr & PMD_MASK;
1159 unsigned long pa;
1160 pgd_t *pgd = pgd_offset_k(va);
1161 pud_t *pud_page = pud_offset(pgd, 0);
1162 pud_t *pud;
1163 pmd_t *pmd;
1164 pte_t *pte;
1165 unsigned int i;
1166 bool unpin;
1167
1168 unpin = (vaddr == 2 * PGDIR_SIZE);
1169 set_pgd(pgd, __pgd(0));
1170 do {
1171 pud = pud_page + pud_index(va);
1172 if (pud_none(*pud)) {
1173 va += PUD_SIZE;
1174 } else if (pud_large(*pud)) {
1175 pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
1176 xen_free_ro_pages(pa, PUD_SIZE);
1177 va += PUD_SIZE;
1178 } else {
1179 pmd = pmd_offset(pud, va);
1180 if (pmd_large(*pmd)) {
1181 pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
1182 xen_free_ro_pages(pa, PMD_SIZE);
1183 } else if (!pmd_none(*pmd)) {
1184 pte = pte_offset_kernel(pmd, va);
1185 set_pmd(pmd, __pmd(0));
1186 for (i = 0; i < PTRS_PER_PTE; ++i) {
1187 if (pte_none(pte[i]))
1188 break;
1189 pa = pte_pfn(pte[i]) << PAGE_SHIFT;
1190 xen_free_ro_pages(pa, PAGE_SIZE);
1191 }
1192 xen_cleanmfnmap_free_pgtbl(pte, unpin);
1193 }
1194 va += PMD_SIZE;
1195 if (pmd_index(va))
1196 continue;
1197 set_pud(pud, __pud(0));
1198 xen_cleanmfnmap_free_pgtbl(pmd, unpin);
1199 }
1200
1201 } while (pud_index(va) || pmd_index(va));
1202 xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
1203}
1204
1117static void __init xen_pagetable_p2m_free(void) 1205static void __init xen_pagetable_p2m_free(void)
1118{ 1206{
1119 unsigned long size; 1207 unsigned long size;
@@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void)
1128 /* using __ka address and sticking INVALID_P2M_ENTRY! */ 1216 /* using __ka address and sticking INVALID_P2M_ENTRY! */
1129 memset((void *)xen_start_info->mfn_list, 0xff, size); 1217 memset((void *)xen_start_info->mfn_list, 0xff, size);
1130 1218
1131 /* We should be in __ka space. */
1132 BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
1133 addr = xen_start_info->mfn_list; 1219 addr = xen_start_info->mfn_list;
1134 /* We roundup to the PMD, which means that if anybody at this stage is 1220 /*
1135 * using the __ka address of xen_start_info or xen_start_info->shared_info 1221 * We could be in __ka space.
1136 * they are in going to crash. Fortunatly we have already revectored 1222 * We roundup to the PMD, which means that if anybody at this stage is
1137 * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ 1223 * using the __ka address of xen_start_info or
1224 * xen_start_info->shared_info they are in going to crash. Fortunatly
1225 * we have already revectored in xen_setup_kernel_pagetable and in
1226 * xen_setup_shared_info.
1227 */
1138 size = roundup(size, PMD_SIZE); 1228 size = roundup(size, PMD_SIZE);
1139 xen_cleanhighmap(addr, addr + size);
1140 1229
1141 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 1230 if (addr >= __START_KERNEL_map) {
1142 memblock_free(__pa(xen_start_info->mfn_list), size); 1231 xen_cleanhighmap(addr, addr + size);
1232 size = PAGE_ALIGN(xen_start_info->nr_pages *
1233 sizeof(unsigned long));
1234 memblock_free(__pa(addr), size);
1235 } else {
1236 xen_cleanmfnmap(addr);
1237 }
1238}
1239
1240static void __init xen_pagetable_cleanhighmap(void)
1241{
1242 unsigned long size;
1243 unsigned long addr;
1143 1244
1144 /* At this stage, cleanup_highmap has already cleaned __ka space 1245 /* At this stage, cleanup_highmap has already cleaned __ka space
1145 * from _brk_limit way up to the max_pfn_mapped (which is the end of 1246 * from _brk_limit way up to the max_pfn_mapped (which is the end of
@@ -1172,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void)
1172 1273
1173#ifdef CONFIG_X86_64 1274#ifdef CONFIG_X86_64
1174 xen_pagetable_p2m_free(); 1275 xen_pagetable_p2m_free();
1276
1277 xen_pagetable_cleanhighmap();
1175#endif 1278#endif
1176 /* And revector! Bye bye old array */ 1279 /* And revector! Bye bye old array */
1177 xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; 1280 xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
@@ -1461,6 +1564,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1461#else /* CONFIG_X86_64 */ 1564#else /* CONFIG_X86_64 */
1462static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) 1565static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1463{ 1566{
1567 unsigned long pfn;
1568
1569 if (xen_feature(XENFEAT_writable_page_tables) ||
1570 xen_feature(XENFEAT_auto_translated_physmap) ||
1571 xen_start_info->mfn_list >= __START_KERNEL_map)
1572 return pte;
1573
1574 /*
1575 * Pages belonging to the initial p2m list mapped outside the default
1576 * address range must be mapped read-only. This region contains the
1577 * page tables for mapping the p2m list, too, and page tables MUST be
1578 * mapped read-only.
1579 */
1580 pfn = pte_pfn(pte);
1581 if (pfn >= xen_start_info->first_p2m_pfn &&
1582 pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
1583 pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
1584
1464 return pte; 1585 return pte;
1465} 1586}
1466#endif /* CONFIG_X86_64 */ 1587#endif /* CONFIG_X86_64 */
@@ -1489,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1489 native_set_pte(ptep, pte); 1610 native_set_pte(ptep, pte);
1490} 1611}
1491 1612
1492static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1493{
1494 struct mmuext_op op;
1495 op.cmd = cmd;
1496 op.arg1.mfn = pfn_to_mfn(pfn);
1497 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1498 BUG();
1499}
1500
1501/* Early in boot, while setting up the initial pagetable, assume 1613/* Early in boot, while setting up the initial pagetable, assume
1502 everything is pinned. */ 1614 everything is pinned. */
1503static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) 1615static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
@@ -1815,7 +1927,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1815 * mappings. Considering that on Xen after the kernel mappings we 1927 * mappings. Considering that on Xen after the kernel mappings we
1816 * have the mappings of some pages that don't exist in pfn space, we 1928 * have the mappings of some pages that don't exist in pfn space, we
1817 * set max_pfn_mapped to the last real pfn mapped. */ 1929 * set max_pfn_mapped to the last real pfn mapped. */
1818 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); 1930 if (xen_start_info->mfn_list < __START_KERNEL_map)
1931 max_pfn_mapped = xen_start_info->first_p2m_pfn;
1932 else
1933 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1819 1934
1820 pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); 1935 pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
1821 pt_end = pt_base + xen_start_info->nr_pt_frames; 1936 pt_end = pt_base + xen_start_info->nr_pt_frames;
@@ -1855,6 +1970,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1855 /* Graft it onto L4[511][510] */ 1970 /* Graft it onto L4[511][510] */
1856 copy_page(level2_kernel_pgt, l2); 1971 copy_page(level2_kernel_pgt, l2);
1857 1972
1973 /* Copy the initial P->M table mappings if necessary. */
1974 i = pgd_index(xen_start_info->mfn_list);
1975 if (i && i < pgd_index(__START_KERNEL_map))
1976 init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
1977
1858 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 1978 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1859 /* Make pagetable pieces RO */ 1979 /* Make pagetable pieces RO */
1860 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); 1980 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
@@ -1894,10 +2014,192 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1894 check_pt_base(&pt_base, &pt_end, addr[i]); 2014 check_pt_base(&pt_base, &pt_end, addr[i]);
1895 2015
1896 /* Our (by three pages) smaller Xen pagetable that we are using */ 2016 /* Our (by three pages) smaller Xen pagetable that we are using */
1897 memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); 2017 xen_pt_base = PFN_PHYS(pt_base);
2018 xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
2019 memblock_reserve(xen_pt_base, xen_pt_size);
2020
1898 /* Revector the xen_start_info */ 2021 /* Revector the xen_start_info */
1899 xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); 2022 xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
1900} 2023}
2024
2025/*
2026 * Read a value from a physical address.
2027 */
2028static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
2029{
2030 unsigned long *vaddr;
2031 unsigned long val;
2032
2033 vaddr = early_memremap_ro(addr, sizeof(val));
2034 val = *vaddr;
2035 early_memunmap(vaddr, sizeof(val));
2036 return val;
2037}
2038
2039/*
2040 * Translate a virtual address to a physical one without relying on mapped
2041 * page tables.
2042 */
2043static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
2044{
2045 phys_addr_t pa;
2046 pgd_t pgd;
2047 pud_t pud;
2048 pmd_t pmd;
2049 pte_t pte;
2050
2051 pa = read_cr3();
2052 pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
2053 sizeof(pgd)));
2054 if (!pgd_present(pgd))
2055 return 0;
2056
2057 pa = pgd_val(pgd) & PTE_PFN_MASK;
2058 pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
2059 sizeof(pud)));
2060 if (!pud_present(pud))
2061 return 0;
2062 pa = pud_pfn(pud) << PAGE_SHIFT;
2063 if (pud_large(pud))
2064 return pa + (vaddr & ~PUD_MASK);
2065
2066 pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
2067 sizeof(pmd)));
2068 if (!pmd_present(pmd))
2069 return 0;
2070 pa = pmd_pfn(pmd) << PAGE_SHIFT;
2071 if (pmd_large(pmd))
2072 return pa + (vaddr & ~PMD_MASK);
2073
2074 pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
2075 sizeof(pte)));
2076 if (!pte_present(pte))
2077 return 0;
2078 pa = pte_pfn(pte) << PAGE_SHIFT;
2079
2080 return pa | (vaddr & ~PAGE_MASK);
2081}
2082
2083/*
2084 * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
2085 * this area.
2086 */
2087void __init xen_relocate_p2m(void)
2088{
2089 phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
2090 unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
2091 int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
2092 pte_t *pt;
2093 pmd_t *pmd;
2094 pud_t *pud;
2095 pgd_t *pgd;
2096 unsigned long *new_p2m;
2097
2098 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
2099 n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
2100 n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
2101 n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
2102 n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
2103 n_frames = n_pte + n_pt + n_pmd + n_pud;
2104
2105 new_area = xen_find_free_area(PFN_PHYS(n_frames));
2106 if (!new_area) {
2107 xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
2108 BUG();
2109 }
2110
2111 /*
2112 * Setup the page tables for addressing the new p2m list.
2113 * We have asked the hypervisor to map the p2m list at the user address
2114 * PUD_SIZE. It may have done so, or it may have used a kernel space
2115 * address depending on the Xen version.
2116 * To avoid any possible virtual address collision, just use
2117 * 2 * PUD_SIZE for the new area.
2118 */
2119 pud_phys = new_area;
2120 pmd_phys = pud_phys + PFN_PHYS(n_pud);
2121 pt_phys = pmd_phys + PFN_PHYS(n_pmd);
2122 p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
2123
2124 pgd = __va(read_cr3());
2125 new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
2126 for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
2127 pud = early_memremap(pud_phys, PAGE_SIZE);
2128 clear_page(pud);
2129 for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
2130 idx_pmd++) {
2131 pmd = early_memremap(pmd_phys, PAGE_SIZE);
2132 clear_page(pmd);
2133 for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
2134 idx_pt++) {
2135 pt = early_memremap(pt_phys, PAGE_SIZE);
2136 clear_page(pt);
2137 for (idx_pte = 0;
2138 idx_pte < min(n_pte, PTRS_PER_PTE);
2139 idx_pte++) {
2140 set_pte(pt + idx_pte,
2141 pfn_pte(p2m_pfn, PAGE_KERNEL));
2142 p2m_pfn++;
2143 }
2144 n_pte -= PTRS_PER_PTE;
2145 early_memunmap(pt, PAGE_SIZE);
2146 make_lowmem_page_readonly(__va(pt_phys));
2147 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
2148 PFN_DOWN(pt_phys));
2149 set_pmd(pmd + idx_pt,
2150 __pmd(_PAGE_TABLE | pt_phys));
2151 pt_phys += PAGE_SIZE;
2152 }
2153 n_pt -= PTRS_PER_PMD;
2154 early_memunmap(pmd, PAGE_SIZE);
2155 make_lowmem_page_readonly(__va(pmd_phys));
2156 pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
2157 PFN_DOWN(pmd_phys));
2158 set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
2159 pmd_phys += PAGE_SIZE;
2160 }
2161 n_pmd -= PTRS_PER_PUD;
2162 early_memunmap(pud, PAGE_SIZE);
2163 make_lowmem_page_readonly(__va(pud_phys));
2164 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
2165 set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
2166 pud_phys += PAGE_SIZE;
2167 }
2168
2169 /* Now copy the old p2m info to the new area. */
2170 memcpy(new_p2m, xen_p2m_addr, size);
2171 xen_p2m_addr = new_p2m;
2172
2173 /* Release the old p2m list and set new list info. */
2174 p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
2175 BUG_ON(!p2m_pfn);
2176 p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
2177
2178 if (xen_start_info->mfn_list < __START_KERNEL_map) {
2179 pfn = xen_start_info->first_p2m_pfn;
2180 pfn_end = xen_start_info->first_p2m_pfn +
2181 xen_start_info->nr_p2m_frames;
2182 set_pgd(pgd + 1, __pgd(0));
2183 } else {
2184 pfn = p2m_pfn;
2185 pfn_end = p2m_pfn_end;
2186 }
2187
2188 memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
2189 while (pfn < pfn_end) {
2190 if (pfn == p2m_pfn) {
2191 pfn = p2m_pfn_end;
2192 continue;
2193 }
2194 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
2195 pfn++;
2196 }
2197
2198 xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
2199 xen_start_info->first_p2m_pfn = PFN_DOWN(new_area);
2200 xen_start_info->nr_p2m_frames = n_frames;
2201}
2202
1901#else /* !CONFIG_X86_64 */ 2203#else /* !CONFIG_X86_64 */
1902static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); 2204static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1903static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); 2205static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
@@ -1938,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3)
1938 pv_mmu_ops.write_cr3 = &xen_write_cr3; 2240 pv_mmu_ops.write_cr3 = &xen_write_cr3;
1939} 2241}
1940 2242
2243/*
2244 * For 32 bit domains xen_start_info->pt_base is the pgd address which might be
2245 * not the first page table in the page table pool.
2246 * Iterate through the initial page tables to find the real page table base.
2247 */
2248static phys_addr_t xen_find_pt_base(pmd_t *pmd)
2249{
2250 phys_addr_t pt_base, paddr;
2251 unsigned pmdidx;
2252
2253 pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
2254
2255 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
2256 if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
2257 paddr = m2p(pmd[pmdidx].pmd);
2258 pt_base = min(pt_base, paddr);
2259 }
2260
2261 return pt_base;
2262}
2263
1941void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 2264void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1942{ 2265{
1943 pmd_t *kernel_pmd; 2266 pmd_t *kernel_pmd;
1944 2267
2268 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
2269
2270 xen_pt_base = xen_find_pt_base(kernel_pmd);
2271 xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
2272
1945 initial_kernel_pmd = 2273 initial_kernel_pmd =
1946 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); 2274 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1947 2275
1948 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + 2276 max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
1949 xen_start_info->nr_pt_frames * PAGE_SIZE +
1950 512*1024);
1951 2277
1952 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1953 copy_page(initial_kernel_pmd, kernel_pmd); 2278 copy_page(initial_kernel_pmd, kernel_pmd);
1954 2279
1955 xen_map_identity_early(initial_kernel_pmd, max_pfn); 2280 xen_map_identity_early(initial_kernel_pmd, max_pfn);
@@ -1968,11 +2293,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1968 PFN_DOWN(__pa(initial_page_table))); 2293 PFN_DOWN(__pa(initial_page_table)));
1969 xen_write_cr3(__pa(initial_page_table)); 2294 xen_write_cr3(__pa(initial_page_table));
1970 2295
1971 memblock_reserve(__pa(xen_start_info->pt_base), 2296 memblock_reserve(xen_pt_base, xen_pt_size);
1972 xen_start_info->nr_pt_frames * PAGE_SIZE);
1973} 2297}
1974#endif /* CONFIG_X86_64 */ 2298#endif /* CONFIG_X86_64 */
1975 2299
2300void __init xen_reserve_special_pages(void)
2301{
2302 phys_addr_t paddr;
2303
2304 memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
2305 if (xen_start_info->store_mfn) {
2306 paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
2307 memblock_reserve(paddr, PAGE_SIZE);
2308 }
2309 if (!xen_initial_domain()) {
2310 paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
2311 memblock_reserve(paddr, PAGE_SIZE);
2312 }
2313}
2314
2315void __init xen_pt_check_e820(void)
2316{
2317 if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
2318 xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
2319 BUG();
2320 }
2321}
2322
1976static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; 2323static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1977 2324
1978static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) 2325static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 8b7f18e200aa..bfc08b13044b 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -79,10 +79,14 @@
79#include <xen/balloon.h> 79#include <xen/balloon.h>
80#include <xen/grant_table.h> 80#include <xen/grant_table.h>
81 81
82#include "p2m.h"
83#include "multicalls.h" 82#include "multicalls.h"
84#include "xen-ops.h" 83#include "xen-ops.h"
85 84
85#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
86#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
87
88#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
89
86#define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE) 90#define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE)
87 91
88unsigned long *xen_p2m_addr __read_mostly; 92unsigned long *xen_p2m_addr __read_mostly;
@@ -199,7 +203,8 @@ void __ref xen_build_mfn_list_list(void)
199 unsigned int level, topidx, mididx; 203 unsigned int level, topidx, mididx;
200 unsigned long *mid_mfn_p; 204 unsigned long *mid_mfn_p;
201 205
202 if (xen_feature(XENFEAT_auto_translated_physmap)) 206 if (xen_feature(XENFEAT_auto_translated_physmap) ||
207 xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
203 return; 208 return;
204 209
205 /* Pre-initialize p2m_top_mfn to be completely missing */ 210 /* Pre-initialize p2m_top_mfn to be completely missing */
@@ -260,9 +265,16 @@ void xen_setup_mfn_list_list(void)
260 265
261 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); 266 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
262 267
263 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = 268 if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
264 virt_to_mfn(p2m_top_mfn); 269 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL;
270 else
271 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
272 virt_to_mfn(p2m_top_mfn);
265 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; 273 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
274 HYPERVISOR_shared_info->arch.p2m_generation = 0;
275 HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr;
276 HYPERVISOR_shared_info->arch.p2m_cr3 =
277 xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
266} 278}
267 279
268/* Set up p2m_top to point to the domain-builder provided p2m pages */ 280/* Set up p2m_top to point to the domain-builder provided p2m pages */
@@ -478,8 +490,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
478 490
479 ptechk = lookup_address(vaddr, &level); 491 ptechk = lookup_address(vaddr, &level);
480 if (ptechk == pte_pg) { 492 if (ptechk == pte_pg) {
493 HYPERVISOR_shared_info->arch.p2m_generation++;
494 wmb(); /* Tools are synchronizing via p2m_generation. */
481 set_pmd(pmdp, 495 set_pmd(pmdp,
482 __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE)); 496 __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
497 wmb(); /* Tools are synchronizing via p2m_generation. */
498 HYPERVISOR_shared_info->arch.p2m_generation++;
483 pte_newpg[i] = NULL; 499 pte_newpg[i] = NULL;
484 } 500 }
485 501
@@ -505,7 +521,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
505 */ 521 */
506static bool alloc_p2m(unsigned long pfn) 522static bool alloc_p2m(unsigned long pfn)
507{ 523{
508 unsigned topidx, mididx; 524 unsigned topidx;
509 unsigned long *top_mfn_p, *mid_mfn; 525 unsigned long *top_mfn_p, *mid_mfn;
510 pte_t *ptep, *pte_pg; 526 pte_t *ptep, *pte_pg;
511 unsigned int level; 527 unsigned int level;
@@ -513,9 +529,6 @@ static bool alloc_p2m(unsigned long pfn)
513 unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); 529 unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
514 unsigned long p2m_pfn; 530 unsigned long p2m_pfn;
515 531
516 topidx = p2m_top_index(pfn);
517 mididx = p2m_mid_index(pfn);
518
519 ptep = lookup_address(addr, &level); 532 ptep = lookup_address(addr, &level);
520 BUG_ON(!ptep || level != PG_LEVEL_4K); 533 BUG_ON(!ptep || level != PG_LEVEL_4K);
521 pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); 534 pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
@@ -527,7 +540,8 @@ static bool alloc_p2m(unsigned long pfn)
527 return false; 540 return false;
528 } 541 }
529 542
530 if (p2m_top_mfn) { 543 if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
544 topidx = p2m_top_index(pfn);
531 top_mfn_p = &p2m_top_mfn[topidx]; 545 top_mfn_p = &p2m_top_mfn[topidx];
532 mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); 546 mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
533 547
@@ -577,10 +591,14 @@ static bool alloc_p2m(unsigned long pfn)
577 spin_lock_irqsave(&p2m_update_lock, flags); 591 spin_lock_irqsave(&p2m_update_lock, flags);
578 592
579 if (pte_pfn(*ptep) == p2m_pfn) { 593 if (pte_pfn(*ptep) == p2m_pfn) {
594 HYPERVISOR_shared_info->arch.p2m_generation++;
595 wmb(); /* Tools are synchronizing via p2m_generation. */
580 set_pte(ptep, 596 set_pte(ptep,
581 pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL)); 597 pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
598 wmb(); /* Tools are synchronizing via p2m_generation. */
599 HYPERVISOR_shared_info->arch.p2m_generation++;
582 if (mid_mfn) 600 if (mid_mfn)
583 mid_mfn[mididx] = virt_to_mfn(p2m); 601 mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m);
584 p2m = NULL; 602 p2m = NULL;
585 } 603 }
586 604
@@ -630,6 +648,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
630 return true; 648 return true;
631 } 649 }
632 650
651 /*
652 * The interface requires atomic updates on p2m elements.
653 * xen_safe_write_ulong() is using __put_user which does an atomic
654 * store via asm().
655 */
633 if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn))) 656 if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
634 return true; 657 return true;
635 658
diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h
deleted file mode 100644
index ad8aee24ab72..000000000000
--- a/arch/x86/xen/p2m.h
+++ /dev/null
@@ -1,15 +0,0 @@
1#ifndef _XEN_P2M_H
2#define _XEN_P2M_H
3
4#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
5#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
6#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
7
8#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
9
10#define MAX_REMAP_RANGES 10
11
12extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
13 unsigned long pfn_e);
14
15#endif /* _XEN_P2M_H */
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index a8261716d58d..9586ff32810c 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -68,7 +68,7 @@ static int check_platform_magic(void)
68 return 0; 68 return 0;
69} 69}
70 70
71bool xen_has_pv_devices() 71bool xen_has_pv_devices(void)
72{ 72{
73 if (!xen_domain()) 73 if (!xen_domain())
74 return false; 74 return false;
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
new file mode 100644
index 000000000000..724a08740a04
--- /dev/null
+++ b/arch/x86/xen/pmu.c
@@ -0,0 +1,570 @@
1#include <linux/types.h>
2#include <linux/interrupt.h>
3
4#include <asm/xen/hypercall.h>
5#include <xen/page.h>
6#include <xen/interface/xen.h>
7#include <xen/interface/vcpu.h>
8#include <xen/interface/xenpmu.h>
9
10#include "xen-ops.h"
11#include "pmu.h"
12
13/* x86_pmu.handle_irq definition */
14#include "../kernel/cpu/perf_event.h"
15
16#define XENPMU_IRQ_PROCESSING 1
17struct xenpmu {
18 /* Shared page between hypervisor and domain */
19 struct xen_pmu_data *xenpmu_data;
20
21 uint8_t flags;
22};
23static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared);
24#define get_xenpmu_data() (this_cpu_ptr(&xenpmu_shared)->xenpmu_data)
25#define get_xenpmu_flags() (this_cpu_ptr(&xenpmu_shared)->flags)
26
27/* Macro for computing address of a PMU MSR bank */
28#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \
29 (uintptr_t)ctxt->field))
30
31/* AMD PMU */
32#define F15H_NUM_COUNTERS 6
33#define F10H_NUM_COUNTERS 4
34
35static __read_mostly uint32_t amd_counters_base;
36static __read_mostly uint32_t amd_ctrls_base;
37static __read_mostly int amd_msr_step;
38static __read_mostly int k7_counters_mirrored;
39static __read_mostly int amd_num_counters;
40
41/* Intel PMU */
42#define MSR_TYPE_COUNTER 0
43#define MSR_TYPE_CTRL 1
44#define MSR_TYPE_GLOBAL 2
45#define MSR_TYPE_ARCH_COUNTER 3
46#define MSR_TYPE_ARCH_CTRL 4
47
48/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */
49#define PMU_GENERAL_NR_SHIFT 8
50#define PMU_GENERAL_NR_BITS 8
51#define PMU_GENERAL_NR_MASK (((1 << PMU_GENERAL_NR_BITS) - 1) \
52 << PMU_GENERAL_NR_SHIFT)
53
54/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */
55#define PMU_FIXED_NR_SHIFT 0
56#define PMU_FIXED_NR_BITS 5
57#define PMU_FIXED_NR_MASK (((1 << PMU_FIXED_NR_BITS) - 1) \
58 << PMU_FIXED_NR_SHIFT)
59
60/* Alias registers (0x4c1) for full-width writes to PMCs */
61#define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0))
62
63#define INTEL_PMC_TYPE_SHIFT 30
64
65static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters;
66
67
68static void xen_pmu_arch_init(void)
69{
70 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
71
72 switch (boot_cpu_data.x86) {
73 case 0x15:
74 amd_num_counters = F15H_NUM_COUNTERS;
75 amd_counters_base = MSR_F15H_PERF_CTR;
76 amd_ctrls_base = MSR_F15H_PERF_CTL;
77 amd_msr_step = 2;
78 k7_counters_mirrored = 1;
79 break;
80 case 0x10:
81 case 0x12:
82 case 0x14:
83 case 0x16:
84 default:
85 amd_num_counters = F10H_NUM_COUNTERS;
86 amd_counters_base = MSR_K7_PERFCTR0;
87 amd_ctrls_base = MSR_K7_EVNTSEL0;
88 amd_msr_step = 1;
89 k7_counters_mirrored = 0;
90 break;
91 }
92 } else {
93 uint32_t eax, ebx, ecx, edx;
94
95 cpuid(0xa, &eax, &ebx, &ecx, &edx);
96
97 intel_num_arch_counters = (eax & PMU_GENERAL_NR_MASK) >>
98 PMU_GENERAL_NR_SHIFT;
99 intel_num_fixed_counters = (edx & PMU_FIXED_NR_MASK) >>
100 PMU_FIXED_NR_SHIFT;
101 }
102}
103
104static inline uint32_t get_fam15h_addr(u32 addr)
105{
106 switch (addr) {
107 case MSR_K7_PERFCTR0:
108 case MSR_K7_PERFCTR1:
109 case MSR_K7_PERFCTR2:
110 case MSR_K7_PERFCTR3:
111 return MSR_F15H_PERF_CTR + (addr - MSR_K7_PERFCTR0);
112 case MSR_K7_EVNTSEL0:
113 case MSR_K7_EVNTSEL1:
114 case MSR_K7_EVNTSEL2:
115 case MSR_K7_EVNTSEL3:
116 return MSR_F15H_PERF_CTL + (addr - MSR_K7_EVNTSEL0);
117 default:
118 break;
119 }
120
121 return addr;
122}
123
124static inline bool is_amd_pmu_msr(unsigned int msr)
125{
126 if ((msr >= MSR_F15H_PERF_CTL &&
127 msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) ||
128 (msr >= MSR_K7_EVNTSEL0 &&
129 msr < MSR_K7_PERFCTR0 + amd_num_counters))
130 return true;
131
132 return false;
133}
134
135static int is_intel_pmu_msr(u32 msr_index, int *type, int *index)
136{
137 u32 msr_index_pmc;
138
139 switch (msr_index) {
140 case MSR_CORE_PERF_FIXED_CTR_CTRL:
141 case MSR_IA32_DS_AREA:
142 case MSR_IA32_PEBS_ENABLE:
143 *type = MSR_TYPE_CTRL;
144 return true;
145
146 case MSR_CORE_PERF_GLOBAL_CTRL:
147 case MSR_CORE_PERF_GLOBAL_STATUS:
148 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
149 *type = MSR_TYPE_GLOBAL;
150 return true;
151
152 default:
153
154 if ((msr_index >= MSR_CORE_PERF_FIXED_CTR0) &&
155 (msr_index < MSR_CORE_PERF_FIXED_CTR0 +
156 intel_num_fixed_counters)) {
157 *index = msr_index - MSR_CORE_PERF_FIXED_CTR0;
158 *type = MSR_TYPE_COUNTER;
159 return true;
160 }
161
162 if ((msr_index >= MSR_P6_EVNTSEL0) &&
163 (msr_index < MSR_P6_EVNTSEL0 + intel_num_arch_counters)) {
164 *index = msr_index - MSR_P6_EVNTSEL0;
165 *type = MSR_TYPE_ARCH_CTRL;
166 return true;
167 }
168
169 msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK;
170 if ((msr_index_pmc >= MSR_IA32_PERFCTR0) &&
171 (msr_index_pmc < MSR_IA32_PERFCTR0 +
172 intel_num_arch_counters)) {
173 *type = MSR_TYPE_ARCH_COUNTER;
174 *index = msr_index_pmc - MSR_IA32_PERFCTR0;
175 return true;
176 }
177 return false;
178 }
179}
180
181static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type,
182 int index, bool is_read)
183{
184 uint64_t *reg = NULL;
185 struct xen_pmu_intel_ctxt *ctxt;
186 uint64_t *fix_counters;
187 struct xen_pmu_cntr_pair *arch_cntr_pair;
188 struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
189 uint8_t xenpmu_flags = get_xenpmu_flags();
190
191
192 if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
193 return false;
194
195 ctxt = &xenpmu_data->pmu.c.intel;
196
197 switch (msr) {
198 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
199 reg = &ctxt->global_ovf_ctrl;
200 break;
201 case MSR_CORE_PERF_GLOBAL_STATUS:
202 reg = &ctxt->global_status;
203 break;
204 case MSR_CORE_PERF_GLOBAL_CTRL:
205 reg = &ctxt->global_ctrl;
206 break;
207 case MSR_CORE_PERF_FIXED_CTR_CTRL:
208 reg = &ctxt->fixed_ctrl;
209 break;
210 default:
211 switch (type) {
212 case MSR_TYPE_COUNTER:
213 fix_counters = field_offset(ctxt, fixed_counters);
214 reg = &fix_counters[index];
215 break;
216 case MSR_TYPE_ARCH_COUNTER:
217 arch_cntr_pair = field_offset(ctxt, arch_counters);
218 reg = &arch_cntr_pair[index].counter;
219 break;
220 case MSR_TYPE_ARCH_CTRL:
221 arch_cntr_pair = field_offset(ctxt, arch_counters);
222 reg = &arch_cntr_pair[index].control;
223 break;
224 default:
225 return false;
226 }
227 }
228
229 if (reg) {
230 if (is_read)
231 *val = *reg;
232 else {
233 *reg = *val;
234
235 if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL)
236 ctxt->global_status &= (~(*val));
237 }
238 return true;
239 }
240
241 return false;
242}
243
244static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
245{
246 uint64_t *reg = NULL;
247 int i, off = 0;
248 struct xen_pmu_amd_ctxt *ctxt;
249 uint64_t *counter_regs, *ctrl_regs;
250 struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
251 uint8_t xenpmu_flags = get_xenpmu_flags();
252
253 if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
254 return false;
255
256 if (k7_counters_mirrored &&
257 ((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3)))
258 msr = get_fam15h_addr(msr);
259
260 ctxt = &xenpmu_data->pmu.c.amd;
261 for (i = 0; i < amd_num_counters; i++) {
262 if (msr == amd_ctrls_base + off) {
263 ctrl_regs = field_offset(ctxt, ctrls);
264 reg = &ctrl_regs[i];
265 break;
266 } else if (msr == amd_counters_base + off) {
267 counter_regs = field_offset(ctxt, counters);
268 reg = &counter_regs[i];
269 break;
270 }
271 off += amd_msr_step;
272 }
273
274 if (reg) {
275 if (is_read)
276 *val = *reg;
277 else
278 *reg = *val;
279
280 return true;
281 }
282 return false;
283}
284
285bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
286{
287 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
288 if (is_amd_pmu_msr(msr)) {
289 if (!xen_amd_pmu_emulate(msr, val, 1))
290 *val = native_read_msr_safe(msr, err);
291 return true;
292 }
293 } else {
294 int type, index;
295
296 if (is_intel_pmu_msr(msr, &type, &index)) {
297 if (!xen_intel_pmu_emulate(msr, val, type, index, 1))
298 *val = native_read_msr_safe(msr, err);
299 return true;
300 }
301 }
302
303 return false;
304}
305
306bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
307{
308 uint64_t val = ((uint64_t)high << 32) | low;
309
310 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
311 if (is_amd_pmu_msr(msr)) {
312 if (!xen_amd_pmu_emulate(msr, &val, 0))
313 *err = native_write_msr_safe(msr, low, high);
314 return true;
315 }
316 } else {
317 int type, index;
318
319 if (is_intel_pmu_msr(msr, &type, &index)) {
320 if (!xen_intel_pmu_emulate(msr, &val, type, index, 0))
321 *err = native_write_msr_safe(msr, low, high);
322 return true;
323 }
324 }
325
326 return false;
327}
328
329static unsigned long long xen_amd_read_pmc(int counter)
330{
331 struct xen_pmu_amd_ctxt *ctxt;
332 uint64_t *counter_regs;
333 struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
334 uint8_t xenpmu_flags = get_xenpmu_flags();
335
336 if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
337 uint32_t msr;
338 int err;
339
340 msr = amd_counters_base + (counter * amd_msr_step);
341 return native_read_msr_safe(msr, &err);
342 }
343
344 ctxt = &xenpmu_data->pmu.c.amd;
345 counter_regs = field_offset(ctxt, counters);
346 return counter_regs[counter];
347}
348
349static unsigned long long xen_intel_read_pmc(int counter)
350{
351 struct xen_pmu_intel_ctxt *ctxt;
352 uint64_t *fixed_counters;
353 struct xen_pmu_cntr_pair *arch_cntr_pair;
354 struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
355 uint8_t xenpmu_flags = get_xenpmu_flags();
356
357 if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
358 uint32_t msr;
359 int err;
360
361 if (counter & (1 << INTEL_PMC_TYPE_SHIFT))
362 msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff);
363 else
364 msr = MSR_IA32_PERFCTR0 + counter;
365
366 return native_read_msr_safe(msr, &err);
367 }
368
369 ctxt = &xenpmu_data->pmu.c.intel;
370 if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) {
371 fixed_counters = field_offset(ctxt, fixed_counters);
372 return fixed_counters[counter & 0xffff];
373 }
374
375 arch_cntr_pair = field_offset(ctxt, arch_counters);
376 return arch_cntr_pair[counter].counter;
377}
378
379unsigned long long xen_read_pmc(int counter)
380{
381 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
382 return xen_amd_read_pmc(counter);
383 else
384 return xen_intel_read_pmc(counter);
385}
386
387int pmu_apic_update(uint32_t val)
388{
389 int ret;
390 struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
391
392 if (!xenpmu_data) {
393 pr_warn_once("%s: pmudata not initialized\n", __func__);
394 return -EINVAL;
395 }
396
397 xenpmu_data->pmu.l.lapic_lvtpc = val;
398
399 if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING)
400 return 0;
401
402 ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL);
403
404 return ret;
405}
406
407/* perf callbacks */
408static int xen_is_in_guest(void)
409{
410 const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
411
412 if (!xenpmu_data) {
413 pr_warn_once("%s: pmudata not initialized\n", __func__);
414 return 0;
415 }
416
417 if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF))
418 return 0;
419
420 return 1;
421}
422
423static int xen_is_user_mode(void)
424{
425 const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
426
427 if (!xenpmu_data) {
428 pr_warn_once("%s: pmudata not initialized\n", __func__);
429 return 0;
430 }
431
432 if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV)
433 return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER);
434 else
435 return !!(xenpmu_data->pmu.r.regs.cpl & 3);
436}
437
438static unsigned long xen_get_guest_ip(void)
439{
440 const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
441
442 if (!xenpmu_data) {
443 pr_warn_once("%s: pmudata not initialized\n", __func__);
444 return 0;
445 }
446
447 return xenpmu_data->pmu.r.regs.ip;
448}
449
450static struct perf_guest_info_callbacks xen_guest_cbs = {
451 .is_in_guest = xen_is_in_guest,
452 .is_user_mode = xen_is_user_mode,
453 .get_guest_ip = xen_get_guest_ip,
454};
455
456/* Convert registers from Xen's format to Linux' */
457static void xen_convert_regs(const struct xen_pmu_regs *xen_regs,
458 struct pt_regs *regs, uint64_t pmu_flags)
459{
460 regs->ip = xen_regs->ip;
461 regs->cs = xen_regs->cs;
462 regs->sp = xen_regs->sp;
463
464 if (pmu_flags & PMU_SAMPLE_PV) {
465 if (pmu_flags & PMU_SAMPLE_USER)
466 regs->cs |= 3;
467 else
468 regs->cs &= ~3;
469 } else {
470 if (xen_regs->cpl)
471 regs->cs |= 3;
472 else
473 regs->cs &= ~3;
474 }
475}
476
477irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
478{
479 int err, ret = IRQ_NONE;
480 struct pt_regs regs;
481 const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
482 uint8_t xenpmu_flags = get_xenpmu_flags();
483
484 if (!xenpmu_data) {
485 pr_warn_once("%s: pmudata not initialized\n", __func__);
486 return ret;
487 }
488
489 this_cpu_ptr(&xenpmu_shared)->flags =
490 xenpmu_flags | XENPMU_IRQ_PROCESSING;
491 xen_convert_regs(&xenpmu_data->pmu.r.regs, &regs,
492 xenpmu_data->pmu.pmu_flags);
493 if (x86_pmu.handle_irq(&regs))
494 ret = IRQ_HANDLED;
495
496 /* Write out cached context to HW */
497 err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL);
498 this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags;
499 if (err) {
500 pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err);
501 return IRQ_NONE;
502 }
503
504 return ret;
505}
506
507bool is_xen_pmu(int cpu)
508{
509 return (get_xenpmu_data() != NULL);
510}
511
512void xen_pmu_init(int cpu)
513{
514 int err;
515 struct xen_pmu_params xp;
516 unsigned long pfn;
517 struct xen_pmu_data *xenpmu_data;
518
519 BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE);
520
521 if (xen_hvm_domain())
522 return;
523
524 xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL);
525 if (!xenpmu_data) {
526 pr_err("VPMU init: No memory\n");
527 return;
528 }
529 pfn = virt_to_pfn(xenpmu_data);
530
531 xp.val = pfn_to_mfn(pfn);
532 xp.vcpu = cpu;
533 xp.version.maj = XENPMU_VER_MAJ;
534 xp.version.min = XENPMU_VER_MIN;
535 err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp);
536 if (err)
537 goto fail;
538
539 per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data;
540 per_cpu(xenpmu_shared, cpu).flags = 0;
541
542 if (cpu == 0) {
543 perf_register_guest_info_callbacks(&xen_guest_cbs);
544 xen_pmu_arch_init();
545 }
546
547 return;
548
549fail:
550 pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n",
551 cpu, err);
552 free_pages((unsigned long)xenpmu_data, 0);
553}
554
555void xen_pmu_finish(int cpu)
556{
557 struct xen_pmu_params xp;
558
559 if (xen_hvm_domain())
560 return;
561
562 xp.vcpu = cpu;
563 xp.version.maj = XENPMU_VER_MAJ;
564 xp.version.min = XENPMU_VER_MIN;
565
566 (void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp);
567
568 free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0);
569 per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL;
570}
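
pmu_msr_read() and pmu_msr_write() above return true only when the accessed MSR is recognized as a PMU register; in that case they either serve it from the shared VPMU page (while a PMU interrupt is being processed) or fall back internally to the native *_safe accessor. The wiring into the pvops MSR path lives in enlighten.c and is not part of this hunk, so the following is only a hedged sketch of a caller; xen_rdmsr_sketch() is a hypothetical name.

static uint64_t xen_rdmsr_sketch(unsigned int msr, int *err)
{
	uint64_t val;

	/* PMU MSRs are handled (emulated or read natively) right here. */
	if (pmu_msr_read(msr, &val, err))
		return val;

	/* Anything else goes through the regular MSR path. */
	return native_read_msr_safe(msr, err);
}
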
diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h
new file mode 100644
index 000000000000..af5f0ad94078
--- /dev/null
+++ b/arch/x86/xen/pmu.h
@@ -0,0 +1,15 @@
1#ifndef __XEN_PMU_H
2#define __XEN_PMU_H
3
4#include <xen/interface/xenpmu.h>
5
6irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
7void xen_pmu_init(int cpu);
8void xen_pmu_finish(int cpu);
9bool is_xen_pmu(int cpu);
10bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
11bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
12int pmu_apic_update(uint32_t reg);
13unsigned long long xen_read_pmc(int counter);
14
15#endif /* __XEN_PMU_H */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 55f388ef481a..f5ef6746d47a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -27,17 +27,23 @@
27#include <xen/interface/memory.h> 27#include <xen/interface/memory.h>
28#include <xen/interface/physdev.h> 28#include <xen/interface/physdev.h>
29#include <xen/features.h> 29#include <xen/features.h>
30#include <xen/hvc-console.h>
30#include "xen-ops.h" 31#include "xen-ops.h"
31#include "vdso.h" 32#include "vdso.h"
32#include "p2m.h"
33#include "mmu.h" 33#include "mmu.h"
34 34
35#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
36
35/* Amount of extra memory space we add to the e820 ranges */ 37/* Amount of extra memory space we add to the e820 ranges */
36struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; 38struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
37 39
38/* Number of pages released from the initial allocation. */ 40/* Number of pages released from the initial allocation. */
39unsigned long xen_released_pages; 41unsigned long xen_released_pages;
40 42
43/* E820 map used during setting up memory. */
44static struct e820entry xen_e820_map[E820MAX] __initdata;
45static u32 xen_e820_map_entries __initdata;
46
41/* 47/*
42 * Buffer used to remap identity mapped pages. We only need the virtual space. 48 * Buffer used to remap identity mapped pages. We only need the virtual space.
43 * The physical page behind this address is remapped as needed to different 49 * The physical page behind this address is remapped as needed to different
@@ -64,62 +70,89 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
64 */ 70 */
65#define EXTRA_MEM_RATIO (10) 71#define EXTRA_MEM_RATIO (10)
66 72
67static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size) 73static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
74
75static void __init xen_parse_512gb(void)
76{
77 bool val = false;
78 char *arg;
79
80 arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit");
81 if (!arg)
82 return;
83
84 arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
85 if (!arg)
86 val = true;
87 else if (strtobool(arg + strlen("xen_512gb_limit="), &val))
88 return;
89
90 xen_512gb_limit = val;
91}
92
93static void __init xen_add_extra_mem(unsigned long start_pfn,
94 unsigned long n_pfns)
68{ 95{
69 int i; 96 int i;
70 97
98 /*
 99 * No need to check for zero size: that should happen rarely and will only
 100 * write a new entry that is regarded as unused because of its zero size.
101 */
71 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { 102 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
72 /* Add new region. */ 103 /* Add new region. */
73 if (xen_extra_mem[i].size == 0) { 104 if (xen_extra_mem[i].n_pfns == 0) {
74 xen_extra_mem[i].start = start; 105 xen_extra_mem[i].start_pfn = start_pfn;
75 xen_extra_mem[i].size = size; 106 xen_extra_mem[i].n_pfns = n_pfns;
76 break; 107 break;
77 } 108 }
78 /* Append to existing region. */ 109 /* Append to existing region. */
79 if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { 110 if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
80 xen_extra_mem[i].size += size; 111 start_pfn) {
112 xen_extra_mem[i].n_pfns += n_pfns;
81 break; 113 break;
82 } 114 }
83 } 115 }
84 if (i == XEN_EXTRA_MEM_MAX_REGIONS) 116 if (i == XEN_EXTRA_MEM_MAX_REGIONS)
85 printk(KERN_WARNING "Warning: not enough extra memory regions\n"); 117 printk(KERN_WARNING "Warning: not enough extra memory regions\n");
86 118
87 memblock_reserve(start, size); 119 memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
88} 120}
89 121
90static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) 122static void __init xen_del_extra_mem(unsigned long start_pfn,
123 unsigned long n_pfns)
91{ 124{
92 int i; 125 int i;
93 phys_addr_t start_r, size_r; 126 unsigned long start_r, size_r;
94 127
95 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { 128 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
96 start_r = xen_extra_mem[i].start; 129 start_r = xen_extra_mem[i].start_pfn;
97 size_r = xen_extra_mem[i].size; 130 size_r = xen_extra_mem[i].n_pfns;
98 131
99 /* Start of region. */ 132 /* Start of region. */
100 if (start_r == start) { 133 if (start_r == start_pfn) {
101 BUG_ON(size > size_r); 134 BUG_ON(n_pfns > size_r);
102 xen_extra_mem[i].start += size; 135 xen_extra_mem[i].start_pfn += n_pfns;
103 xen_extra_mem[i].size -= size; 136 xen_extra_mem[i].n_pfns -= n_pfns;
104 break; 137 break;
105 } 138 }
106 /* End of region. */ 139 /* End of region. */
107 if (start_r + size_r == start + size) { 140 if (start_r + size_r == start_pfn + n_pfns) {
108 BUG_ON(size > size_r); 141 BUG_ON(n_pfns > size_r);
109 xen_extra_mem[i].size -= size; 142 xen_extra_mem[i].n_pfns -= n_pfns;
110 break; 143 break;
111 } 144 }
112 /* Mid of region. */ 145 /* Mid of region. */
113 if (start > start_r && start < start_r + size_r) { 146 if (start_pfn > start_r && start_pfn < start_r + size_r) {
114 BUG_ON(start + size > start_r + size_r); 147 BUG_ON(start_pfn + n_pfns > start_r + size_r);
115 xen_extra_mem[i].size = start - start_r; 148 xen_extra_mem[i].n_pfns = start_pfn - start_r;
116 /* Calling memblock_reserve() again is okay. */ 149 /* Calling memblock_reserve() again is okay. */
117 xen_add_extra_mem(start + size, start_r + size_r - 150 xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
118 (start + size)); 151 (start_pfn + n_pfns));
119 break; 152 break;
120 } 153 }
121 } 154 }
122 memblock_free(start, size); 155 memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
123} 156}
124 157
125/* 158/*
@@ -130,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
130unsigned long __ref xen_chk_extra_mem(unsigned long pfn) 163unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
131{ 164{
132 int i; 165 int i;
133 phys_addr_t addr = PFN_PHYS(pfn);
134 166
135 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { 167 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
136 if (addr >= xen_extra_mem[i].start && 168 if (pfn >= xen_extra_mem[i].start_pfn &&
137 addr < xen_extra_mem[i].start + xen_extra_mem[i].size) 169 pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns)
138 return INVALID_P2M_ENTRY; 170 return INVALID_P2M_ENTRY;
139 } 171 }
140 172
@@ -150,10 +182,10 @@ void __init xen_inv_extra_mem(void)
150 int i; 182 int i;
151 183
152 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { 184 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
153 if (!xen_extra_mem[i].size) 185 if (!xen_extra_mem[i].n_pfns)
154 continue; 186 continue;
155 pfn_s = PFN_DOWN(xen_extra_mem[i].start); 187 pfn_s = xen_extra_mem[i].start_pfn;
156 pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size); 188 pfn_e = pfn_s + xen_extra_mem[i].n_pfns;
157 for (pfn = pfn_s; pfn < pfn_e; pfn++) 189 for (pfn = pfn_s; pfn < pfn_e; pfn++)
158 set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 190 set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
159 } 191 }
@@ -164,15 +196,13 @@ void __init xen_inv_extra_mem(void)
164 * This function updates min_pfn with the pfn found and returns 196 * This function updates min_pfn with the pfn found and returns
165 * the size of that range or zero if not found. 197 * the size of that range or zero if not found.
166 */ 198 */
167static unsigned long __init xen_find_pfn_range( 199static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
168 const struct e820entry *list, size_t map_size,
169 unsigned long *min_pfn)
170{ 200{
171 const struct e820entry *entry; 201 const struct e820entry *entry = xen_e820_map;
172 unsigned int i; 202 unsigned int i;
173 unsigned long done = 0; 203 unsigned long done = 0;
174 204
175 for (i = 0, entry = list; i < map_size; i++, entry++) { 205 for (i = 0; i < xen_e820_map_entries; i++, entry++) {
176 unsigned long s_pfn; 206 unsigned long s_pfn;
177 unsigned long e_pfn; 207 unsigned long e_pfn;
178 208
@@ -221,7 +251,7 @@ static int __init xen_free_mfn(unsigned long mfn)
221 * as a fallback if the remapping fails. 251 * as a fallback if the remapping fails.
222 */ 252 */
223static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, 253static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
224 unsigned long end_pfn, unsigned long nr_pages, unsigned long *released) 254 unsigned long end_pfn, unsigned long nr_pages)
225{ 255{
226 unsigned long pfn, end; 256 unsigned long pfn, end;
227 int ret; 257 int ret;
@@ -241,7 +271,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
241 WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); 271 WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
242 272
243 if (ret == 1) { 273 if (ret == 1) {
244 (*released)++; 274 xen_released_pages++;
245 if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY)) 275 if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
246 break; 276 break;
247 } else 277 } else
@@ -356,9 +386,8 @@ static void __init xen_do_set_identity_and_remap_chunk(
356 * to Xen and not remapped. 386 * to Xen and not remapped.
357 */ 387 */
358static unsigned long __init xen_set_identity_and_remap_chunk( 388static unsigned long __init xen_set_identity_and_remap_chunk(
359 const struct e820entry *list, size_t map_size, unsigned long start_pfn, 389 unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
360 unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, 390 unsigned long remap_pfn)
361 unsigned long *released, unsigned long *remapped)
362{ 391{
363 unsigned long pfn; 392 unsigned long pfn;
364 unsigned long i = 0; 393 unsigned long i = 0;
@@ -379,12 +408,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
379 if (cur_pfn + size > nr_pages) 408 if (cur_pfn + size > nr_pages)
380 size = nr_pages - cur_pfn; 409 size = nr_pages - cur_pfn;
381 410
382 remap_range_size = xen_find_pfn_range(list, map_size, 411 remap_range_size = xen_find_pfn_range(&remap_pfn);
383 &remap_pfn);
384 if (!remap_range_size) { 412 if (!remap_range_size) {
385 pr_warning("Unable to find available pfn range, not remapping identity pages\n"); 413 pr_warning("Unable to find available pfn range, not remapping identity pages\n");
386 xen_set_identity_and_release_chunk(cur_pfn, 414 xen_set_identity_and_release_chunk(cur_pfn,
387 cur_pfn + left, nr_pages, released); 415 cur_pfn + left, nr_pages);
388 break; 416 break;
389 } 417 }
390 /* Adjust size to fit in current e820 RAM region */ 418 /* Adjust size to fit in current e820 RAM region */
@@ -396,7 +424,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
396 /* Update variables to reflect new mappings. */ 424 /* Update variables to reflect new mappings. */
397 i += size; 425 i += size;
398 remap_pfn += size; 426 remap_pfn += size;
399 *remapped += size;
400 } 427 }
401 428
402 /* 429 /*
@@ -411,15 +438,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
411 return remap_pfn; 438 return remap_pfn;
412} 439}
413 440
414static void __init xen_set_identity_and_remap( 441static void __init xen_set_identity_and_remap(unsigned long nr_pages)
415 const struct e820entry *list, size_t map_size, unsigned long nr_pages,
416 unsigned long *released, unsigned long *remapped)
417{ 442{
418 phys_addr_t start = 0; 443 phys_addr_t start = 0;
419 unsigned long last_pfn = nr_pages; 444 unsigned long last_pfn = nr_pages;
420 const struct e820entry *entry; 445 const struct e820entry *entry = xen_e820_map;
421 unsigned long num_released = 0;
422 unsigned long num_remapped = 0;
423 int i; 446 int i;
424 447
425 /* 448 /*
@@ -433,9 +456,9 @@ static void __init xen_set_identity_and_remap(
433 * example) the DMI tables in a reserved region that begins on 456 * example) the DMI tables in a reserved region that begins on
434 * a non-page boundary. 457 * a non-page boundary.
435 */ 458 */
436 for (i = 0, entry = list; i < map_size; i++, entry++) { 459 for (i = 0; i < xen_e820_map_entries; i++, entry++) {
437 phys_addr_t end = entry->addr + entry->size; 460 phys_addr_t end = entry->addr + entry->size;
438 if (entry->type == E820_RAM || i == map_size - 1) { 461 if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) {
439 unsigned long start_pfn = PFN_DOWN(start); 462 unsigned long start_pfn = PFN_DOWN(start);
440 unsigned long end_pfn = PFN_UP(end); 463 unsigned long end_pfn = PFN_UP(end);
441 464
@@ -444,17 +467,13 @@ static void __init xen_set_identity_and_remap(
444 467
445 if (start_pfn < end_pfn) 468 if (start_pfn < end_pfn)
446 last_pfn = xen_set_identity_and_remap_chunk( 469 last_pfn = xen_set_identity_and_remap_chunk(
447 list, map_size, start_pfn, 470 start_pfn, end_pfn, nr_pages,
448 end_pfn, nr_pages, last_pfn, 471 last_pfn);
449 &num_released, &num_remapped);
450 start = end; 472 start = end;
451 } 473 }
452 } 474 }
453 475
454 *released = num_released; 476 pr_info("Released %ld page(s)\n", xen_released_pages);
455 *remapped = num_remapped;
456
457 pr_info("Released %ld page(s)\n", num_released);
458} 477}
459 478
460/* 479/*
@@ -494,7 +513,7 @@ void __init xen_remap_memory(void)
494 } else if (pfn_s + len == xen_remap_buf.target_pfn) { 513 } else if (pfn_s + len == xen_remap_buf.target_pfn) {
495 len += xen_remap_buf.size; 514 len += xen_remap_buf.size;
496 } else { 515 } else {
497 xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); 516 xen_del_extra_mem(pfn_s, len);
498 pfn_s = xen_remap_buf.target_pfn; 517 pfn_s = xen_remap_buf.target_pfn;
499 len = xen_remap_buf.size; 518 len = xen_remap_buf.size;
500 } 519 }
@@ -504,19 +523,36 @@ void __init xen_remap_memory(void)
504 } 523 }
505 524
506 if (pfn_s != ~0UL && len) 525 if (pfn_s != ~0UL && len)
507 xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); 526 xen_del_extra_mem(pfn_s, len);
508 527
509 set_pte_mfn(buf, mfn_save, PAGE_KERNEL); 528 set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
510 529
511 pr_info("Remapped %ld page(s)\n", remapped); 530 pr_info("Remapped %ld page(s)\n", remapped);
512} 531}
513 532
533static unsigned long __init xen_get_pages_limit(void)
534{
535 unsigned long limit;
536
537#ifdef CONFIG_X86_32
538 limit = GB(64) / PAGE_SIZE;
539#else
540 limit = MAXMEM / PAGE_SIZE;
541 if (!xen_initial_domain() && xen_512gb_limit)
542 limit = GB(512) / PAGE_SIZE;
543#endif
544 return limit;
545}
546
514static unsigned long __init xen_get_max_pages(void) 547static unsigned long __init xen_get_max_pages(void)
515{ 548{
516 unsigned long max_pages = MAX_DOMAIN_PAGES; 549 unsigned long max_pages, limit;
517 domid_t domid = DOMID_SELF; 550 domid_t domid = DOMID_SELF;
518 int ret; 551 int ret;
519 552
553 limit = xen_get_pages_limit();
554 max_pages = limit;
555
520 /* 556 /*
521 * For the initial domain we use the maximum reservation as 557 * For the initial domain we use the maximum reservation as
522 * the maximum page. 558 * the maximum page.
@@ -532,7 +568,7 @@ static unsigned long __init xen_get_max_pages(void)
532 max_pages = ret; 568 max_pages = ret;
533 } 569 }
534 570
535 return min(max_pages, MAX_DOMAIN_PAGES); 571 return min(max_pages, limit);
536} 572}
537 573
538static void __init xen_align_and_add_e820_region(phys_addr_t start, 574static void __init xen_align_and_add_e820_region(phys_addr_t start,
@@ -549,39 +585,188 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start,
549 e820_add_region(start, end - start, type); 585 e820_add_region(start, end - start, type);
550} 586}
551 587
552static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size) 588static void __init xen_ignore_unusable(void)
553{ 589{
554 struct e820entry *entry; 590 struct e820entry *entry = xen_e820_map;
555 unsigned int i; 591 unsigned int i;
556 592
557 for (i = 0, entry = list; i < map_size; i++, entry++) { 593 for (i = 0; i < xen_e820_map_entries; i++, entry++) {
558 if (entry->type == E820_UNUSABLE) 594 if (entry->type == E820_UNUSABLE)
559 entry->type = E820_RAM; 595 entry->type = E820_RAM;
560 } 596 }
561} 597}
562 598
599static unsigned long __init xen_count_remap_pages(unsigned long max_pfn)
600{
601 unsigned long extra = 0;
602 unsigned long start_pfn, end_pfn;
603 const struct e820entry *entry = xen_e820_map;
604 int i;
605
606 end_pfn = 0;
607 for (i = 0; i < xen_e820_map_entries; i++, entry++) {
608 start_pfn = PFN_DOWN(entry->addr);
 609 /* Handle adjacent regions on non-page boundaries. */
610 end_pfn = min(end_pfn, start_pfn);
611
612 if (start_pfn >= max_pfn)
613 return extra + max_pfn - end_pfn;
614
615 /* Add any holes in map to result. */
616 extra += start_pfn - end_pfn;
617
618 end_pfn = PFN_UP(entry->addr + entry->size);
619 end_pfn = min(end_pfn, max_pfn);
620
621 if (entry->type != E820_RAM)
622 extra += end_pfn - start_pfn;
623 }
624
625 return extra;
626}
627
628bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
629{
630 struct e820entry *entry;
631 unsigned mapcnt;
632 phys_addr_t end;
633
634 if (!size)
635 return false;
636
637 end = start + size;
638 entry = xen_e820_map;
639
640 for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) {
641 if (entry->type == E820_RAM && entry->addr <= start &&
642 (entry->addr + entry->size) >= end)
643 return false;
644
645 entry++;
646 }
647
648 return true;
649}
650
651/*
 652 * Find a free area in physical memory that is not yet reserved and is
 653 * compliant with the E820 map.
 654 * Used to relocate pre-allocated areas like the initrd or the p2m list
 655 * which conflict with the E820 map that is about to be used.
 656 * In case no area is found, return 0. Otherwise return the physical address
 657 * of the area, which is already reserved for the caller's convenience.
658 */
659phys_addr_t __init xen_find_free_area(phys_addr_t size)
660{
661 unsigned mapcnt;
662 phys_addr_t addr, start;
663 struct e820entry *entry = xen_e820_map;
664
665 for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) {
666 if (entry->type != E820_RAM || entry->size < size)
667 continue;
668 start = entry->addr;
669 for (addr = start; addr < start + size; addr += PAGE_SIZE) {
670 if (!memblock_is_reserved(addr))
671 continue;
672 start = addr + PAGE_SIZE;
673 if (start + size > entry->addr + entry->size)
674 break;
675 }
676 if (addr >= start + size) {
677 memblock_reserve(start, size);
678 return start;
679 }
680 }
681
682 return 0;
683}
684
685/*
686 * Like memcpy, but with physical addresses for dest and src.
687 */
688static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
689 phys_addr_t n)
690{
691 phys_addr_t dest_off, src_off, dest_len, src_len, len;
692 void *from, *to;
693
694 while (n) {
695 dest_off = dest & ~PAGE_MASK;
696 src_off = src & ~PAGE_MASK;
697 dest_len = n;
698 if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
699 dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
700 src_len = n;
701 if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
702 src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
703 len = min(dest_len, src_len);
704 to = early_memremap(dest - dest_off, dest_len + dest_off);
705 from = early_memremap(src - src_off, src_len + src_off);
706 memcpy(to, from, len);
707 early_memunmap(to, dest_len + dest_off);
708 early_memunmap(from, src_len + src_off);
709 n -= len;
710 dest += len;
711 src += len;
712 }
713}
714
715/*
716 * Reserve Xen mfn_list.
717 */
718static void __init xen_reserve_xen_mfnlist(void)
719{
720 phys_addr_t start, size;
721
722 if (xen_start_info->mfn_list >= __START_KERNEL_map) {
723 start = __pa(xen_start_info->mfn_list);
724 size = PFN_ALIGN(xen_start_info->nr_pages *
725 sizeof(unsigned long));
726 } else {
727 start = PFN_PHYS(xen_start_info->first_p2m_pfn);
728 size = PFN_PHYS(xen_start_info->nr_p2m_frames);
729 }
730
731 if (!xen_is_e820_reserved(start, size)) {
732 memblock_reserve(start, size);
733 return;
734 }
735
736#ifdef CONFIG_X86_32
737 /*
 738 * Relocating the p2m on a 32-bit system to an arbitrary virtual address
739 * is not supported, so just give up.
740 */
741 xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n");
742 BUG();
743#else
744 xen_relocate_p2m();
745#endif
746}
747
563/** 748/**
564 * machine_specific_memory_setup - Hook for machine specific memory setup. 749 * machine_specific_memory_setup - Hook for machine specific memory setup.
565 **/ 750 **/
566char * __init xen_memory_setup(void) 751char * __init xen_memory_setup(void)
567{ 752{
568 static struct e820entry map[E820MAX] __initdata; 753 unsigned long max_pfn, pfn_s, n_pfns;
569 754 phys_addr_t mem_end, addr, size, chunk_size;
570 unsigned long max_pfn = xen_start_info->nr_pages; 755 u32 type;
571 phys_addr_t mem_end;
572 int rc; 756 int rc;
573 struct xen_memory_map memmap; 757 struct xen_memory_map memmap;
574 unsigned long max_pages; 758 unsigned long max_pages;
575 unsigned long extra_pages = 0; 759 unsigned long extra_pages = 0;
576 unsigned long remapped_pages;
577 int i; 760 int i;
578 int op; 761 int op;
579 762
580 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); 763 xen_parse_512gb();
764 max_pfn = xen_get_pages_limit();
765 max_pfn = min(max_pfn, xen_start_info->nr_pages);
581 mem_end = PFN_PHYS(max_pfn); 766 mem_end = PFN_PHYS(max_pfn);
582 767
583 memmap.nr_entries = E820MAX; 768 memmap.nr_entries = E820MAX;
584 set_xen_guest_handle(memmap.buffer, map); 769 set_xen_guest_handle(memmap.buffer, xen_e820_map);
585 770
586 op = xen_initial_domain() ? 771 op = xen_initial_domain() ?
587 XENMEM_machine_memory_map : 772 XENMEM_machine_memory_map :
@@ -590,15 +775,16 @@ char * __init xen_memory_setup(void)
590 if (rc == -ENOSYS) { 775 if (rc == -ENOSYS) {
591 BUG_ON(xen_initial_domain()); 776 BUG_ON(xen_initial_domain());
592 memmap.nr_entries = 1; 777 memmap.nr_entries = 1;
593 map[0].addr = 0ULL; 778 xen_e820_map[0].addr = 0ULL;
594 map[0].size = mem_end; 779 xen_e820_map[0].size = mem_end;
595 /* 8MB slack (to balance backend allocations). */ 780 /* 8MB slack (to balance backend allocations). */
596 map[0].size += 8ULL << 20; 781 xen_e820_map[0].size += 8ULL << 20;
597 map[0].type = E820_RAM; 782 xen_e820_map[0].type = E820_RAM;
598 rc = 0; 783 rc = 0;
599 } 784 }
600 BUG_ON(rc); 785 BUG_ON(rc);
601 BUG_ON(memmap.nr_entries == 0); 786 BUG_ON(memmap.nr_entries == 0);
787 xen_e820_map_entries = memmap.nr_entries;
602 788
603 /* 789 /*
604 * Xen won't allow a 1:1 mapping to be created to UNUSABLE 790 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
@@ -609,24 +795,19 @@ char * __init xen_memory_setup(void)
609 * a patch in the future. 795 * a patch in the future.
610 */ 796 */
611 if (xen_initial_domain()) 797 if (xen_initial_domain())
612 xen_ignore_unusable(map, memmap.nr_entries); 798 xen_ignore_unusable();
613 799
614 /* Make sure the Xen-supplied memory map is well-ordered. */ 800 /* Make sure the Xen-supplied memory map is well-ordered. */
615 sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); 801 sanitize_e820_map(xen_e820_map, xen_e820_map_entries,
802 &xen_e820_map_entries);
616 803
617 max_pages = xen_get_max_pages(); 804 max_pages = xen_get_max_pages();
618 if (max_pages > max_pfn)
619 extra_pages += max_pages - max_pfn;
620 805
621 /* 806 /* How many extra pages do we need due to remapping? */
622 * Set identity map on non-RAM pages and prepare remapping the 807 max_pages += xen_count_remap_pages(max_pfn);
623 * underlying RAM.
624 */
625 xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
626 &xen_released_pages, &remapped_pages);
627 808
628 extra_pages += xen_released_pages; 809 if (max_pages > max_pfn)
629 extra_pages += remapped_pages; 810 extra_pages += max_pages - max_pfn;
630 811
631 /* 812 /*
632 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO 813 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
@@ -635,46 +816,54 @@ char * __init xen_memory_setup(void)
635 * is limited to the max size of lowmem, so that it doesn't 816 * is limited to the max size of lowmem, so that it doesn't
636 * get completely filled. 817 * get completely filled.
637 * 818 *
819 * Make sure we have no memory above max_pages, as this area
820 * isn't handled by the p2m management.
821 *
638 * In principle there could be a problem in lowmem systems if 822 * In principle there could be a problem in lowmem systems if
639 * the initial memory is also very large with respect to 823 * the initial memory is also very large with respect to
640 * lowmem, but we won't try to deal with that here. 824 * lowmem, but we won't try to deal with that here.
641 */ 825 */
642 extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), 826 extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
643 extra_pages); 827 extra_pages, max_pages - max_pfn);
644 i = 0; 828 i = 0;
645 while (i < memmap.nr_entries) { 829 addr = xen_e820_map[0].addr;
646 phys_addr_t addr = map[i].addr; 830 size = xen_e820_map[0].size;
647 phys_addr_t size = map[i].size; 831 while (i < xen_e820_map_entries) {
648 u32 type = map[i].type; 832 chunk_size = size;
833 type = xen_e820_map[i].type;
649 834
650 if (type == E820_RAM) { 835 if (type == E820_RAM) {
651 if (addr < mem_end) { 836 if (addr < mem_end) {
652 size = min(size, mem_end - addr); 837 chunk_size = min(size, mem_end - addr);
653 } else if (extra_pages) { 838 } else if (extra_pages) {
654 size = min(size, PFN_PHYS(extra_pages)); 839 chunk_size = min(size, PFN_PHYS(extra_pages));
655 extra_pages -= PFN_DOWN(size); 840 pfn_s = PFN_UP(addr);
656 xen_add_extra_mem(addr, size); 841 n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
657 xen_max_p2m_pfn = PFN_DOWN(addr + size); 842 extra_pages -= n_pfns;
843 xen_add_extra_mem(pfn_s, n_pfns);
844 xen_max_p2m_pfn = pfn_s + n_pfns;
658 } else 845 } else
659 type = E820_UNUSABLE; 846 type = E820_UNUSABLE;
660 } 847 }
661 848
662 xen_align_and_add_e820_region(addr, size, type); 849 xen_align_and_add_e820_region(addr, chunk_size, type);
663 850
664 map[i].addr += size; 851 addr += chunk_size;
665 map[i].size -= size; 852 size -= chunk_size;
666 if (map[i].size == 0) 853 if (size == 0) {
667 i++; 854 i++;
855 if (i < xen_e820_map_entries) {
856 addr = xen_e820_map[i].addr;
857 size = xen_e820_map[i].size;
858 }
859 }
668 } 860 }
669 861
670 /* 862 /*
671 * Set the rest as identity mapped, in case PCI BARs are 863 * Set the rest as identity mapped, in case PCI BARs are
672 * located here. 864 * located here.
673 *
674 * PFNs above MAX_P2M_PFN are considered identity mapped as
675 * well.
676 */ 865 */
677 set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); 866 set_phys_range_identity(addr / PAGE_SIZE, ~0ul);
678 867
679 /* 868 /*
680 * In domU, the ISA region is normal, usable memory, but we 869 * In domU, the ISA region is normal, usable memory, but we
@@ -684,34 +873,53 @@ char * __init xen_memory_setup(void)
684 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, 873 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
685 E820_RESERVED); 874 E820_RESERVED);
686 875
876 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
877
687 /* 878 /*
688 * Reserve Xen bits: 879 * Check whether the kernel itself conflicts with the target E820 map.
689 * - mfn_list 880 * Failing now is better than running into weird problems later due
690 * - xen_start_info 881 * to relocating (and even reusing) pages with kernel text or data.
691 * See comment above "struct start_info" in <xen/interface/xen.h>
692 * We tried to make the the memblock_reserve more selective so
693 * that it would be clear what region is reserved. Sadly we ran
694 * in the problem wherein on a 64-bit hypervisor with a 32-bit
695 * initial domain, the pt_base has the cr3 value which is not
696 * neccessarily where the pagetable starts! As Jan put it: "
697 * Actually, the adjustment turns out to be correct: The page
698 * tables for a 32-on-64 dom0 get allocated in the order "first L1",
699 * "first L2", "first L3", so the offset to the page table base is
700 * indeed 2. When reading xen/include/public/xen.h's comment
701 * very strictly, this is not a violation (since there nothing is said
702 * that the first thing in the page table space is pointed to by
703 * pt_base; I admit that this seems to be implied though, namely
704 * do I think that it is implied that the page table space is the
705 * range [pt_base, pt_base + nt_pt_frames), whereas that
706 * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
707 * which - without a priori knowledge - the kernel would have
708 * difficulty to figure out)." - so lets just fall back to the
709 * easy way and reserve the whole region.
710 */ 882 */
711 memblock_reserve(__pa(xen_start_info->mfn_list), 883 if (xen_is_e820_reserved(__pa_symbol(_text),
712 xen_start_info->pt_base - xen_start_info->mfn_list); 884 __pa_symbol(__bss_stop) - __pa_symbol(_text))) {
885 xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
886 BUG();
887 }
713 888
714 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 889 /*
890 * Check for a conflict of the hypervisor supplied page tables with
891 * the target E820 map.
892 */
893 xen_pt_check_e820();
894
895 xen_reserve_xen_mfnlist();
896
897 /* Check for a conflict of the initrd with the target E820 map. */
898 if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
899 boot_params.hdr.ramdisk_size)) {
900 phys_addr_t new_area, start, size;
901
902 new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
903 if (!new_area) {
904 xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
905 BUG();
906 }
907
908 start = boot_params.hdr.ramdisk_image;
909 size = boot_params.hdr.ramdisk_size;
910 xen_phys_memcpy(new_area, start, size);
911 pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
912 start, start + size, new_area, new_area + size);
913 memblock_free(start, size);
914 boot_params.hdr.ramdisk_image = new_area;
915 boot_params.ext_ramdisk_image = new_area >> 32;
916 }
917
918 /*
919 * Set identity map on non-RAM pages and prepare remapping the
920 * underlying RAM.
921 */
922 xen_set_identity_and_remap(max_pfn);
715 923
716 return "Xen"; 924 return "Xen";
717} 925}
@@ -721,26 +929,30 @@ char * __init xen_memory_setup(void)
721 */ 929 */
722char * __init xen_auto_xlated_memory_setup(void) 930char * __init xen_auto_xlated_memory_setup(void)
723{ 931{
724 static struct e820entry map[E820MAX] __initdata;
725
726 struct xen_memory_map memmap; 932 struct xen_memory_map memmap;
727 int i; 933 int i;
728 int rc; 934 int rc;
729 935
730 memmap.nr_entries = E820MAX; 936 memmap.nr_entries = E820MAX;
731 set_xen_guest_handle(memmap.buffer, map); 937 set_xen_guest_handle(memmap.buffer, xen_e820_map);
732 938
733 rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); 939 rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
734 if (rc < 0) 940 if (rc < 0)
735 panic("No memory map (%d)\n", rc); 941 panic("No memory map (%d)\n", rc);
736 942
737 sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries); 943 xen_e820_map_entries = memmap.nr_entries;
944
945 sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
946 &xen_e820_map_entries);
738 947
739 for (i = 0; i < memmap.nr_entries; i++) 948 for (i = 0; i < xen_e820_map_entries; i++)
740 e820_add_region(map[i].addr, map[i].size, map[i].type); 949 e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size,
950 xen_e820_map[i].type);
741 951
742 memblock_reserve(__pa(xen_start_info->mfn_list), 952 /* Remove p2m info, it is not needed. */
743 xen_start_info->pt_base - xen_start_info->mfn_list); 953 xen_start_info->mfn_list = 0;
954 xen_start_info->first_p2m_pfn = 0;
955 xen_start_info->nr_p2m_frames = 0;
744 956
745 return "Xen"; 957 return "Xen";
746} 958}
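
Among the setup.c additions, xen_find_free_area() is the least obvious: it slides a window of the requested size across each RAM region of the E820 map, restarting just past any page that memblock has already reserved, and only succeeds once a whole window is free (the kernel version also reserves the range before returning it). The following is a simplified user-space model of that search under assumed types; page_reserved() is a stand-in for memblock_is_reserved().

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096UL

struct region { uint64_t addr, size; };	/* assumed E820-like RAM entry */

static bool page_reserved(uint64_t addr)
{
	(void)addr;
	return false;	/* stand-in for memblock_is_reserved() */
}

static uint64_t find_free_area(const struct region *map, size_t n, uint64_t size)
{
	for (size_t i = 0; i < n; i++) {
		uint64_t start, addr;

		if (map[i].size < size)
			continue;
		start = map[i].addr;
		for (addr = start; addr < start + size; addr += PAGE_SIZE) {
			if (!page_reserved(addr))
				continue;
			/* Reserved page hit: restart the window right after it. */
			start = addr + PAGE_SIZE;
			if (start + size > map[i].addr + map[i].size)
				break;	/* window no longer fits this region */
		}
		if (addr >= start + size)
			return start;	/* whole window was free */
	}
	return 0;
}
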
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 86484384492e..2a9ff7342791 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -26,6 +26,7 @@
26 26
27#include <xen/interface/xen.h> 27#include <xen/interface/xen.h>
28#include <xen/interface/vcpu.h> 28#include <xen/interface/vcpu.h>
29#include <xen/interface/xenpmu.h>
29 30
30#include <asm/xen/interface.h> 31#include <asm/xen/interface.h>
31#include <asm/xen/hypercall.h> 32#include <asm/xen/hypercall.h>
@@ -38,6 +39,7 @@
38#include "xen-ops.h" 39#include "xen-ops.h"
39#include "mmu.h" 40#include "mmu.h"
40#include "smp.h" 41#include "smp.h"
42#include "pmu.h"
41 43
42cpumask_var_t xen_cpu_initialized_map; 44cpumask_var_t xen_cpu_initialized_map;
43 45
@@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
50static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 }; 52static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 };
51static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; 53static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
52static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 }; 54static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 };
55static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };
53 56
54static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); 57static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
55static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); 58static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
@@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu)
148 kfree(per_cpu(xen_irq_work, cpu).name); 151 kfree(per_cpu(xen_irq_work, cpu).name);
149 per_cpu(xen_irq_work, cpu).name = NULL; 152 per_cpu(xen_irq_work, cpu).name = NULL;
150 } 153 }
154
155 if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
156 unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
157 per_cpu(xen_pmu_irq, cpu).irq = -1;
158 kfree(per_cpu(xen_pmu_irq, cpu).name);
159 per_cpu(xen_pmu_irq, cpu).name = NULL;
160 }
151}; 161};
152static int xen_smp_intr_init(unsigned int cpu) 162static int xen_smp_intr_init(unsigned int cpu)
153{ 163{
154 int rc; 164 int rc;
155 char *resched_name, *callfunc_name, *debug_name; 165 char *resched_name, *callfunc_name, *debug_name, *pmu_name;
156 166
157 resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); 167 resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
158 rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, 168 rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
@@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu)
218 per_cpu(xen_irq_work, cpu).irq = rc; 228 per_cpu(xen_irq_work, cpu).irq = rc;
219 per_cpu(xen_irq_work, cpu).name = callfunc_name; 229 per_cpu(xen_irq_work, cpu).name = callfunc_name;
220 230
231 if (is_xen_pmu(cpu)) {
232 pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
233 rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
234 xen_pmu_irq_handler,
235 IRQF_PERCPU|IRQF_NOBALANCING,
236 pmu_name, NULL);
237 if (rc < 0)
238 goto fail;
239 per_cpu(xen_pmu_irq, cpu).irq = rc;
240 per_cpu(xen_pmu_irq, cpu).name = pmu_name;
241 }
242
221 return 0; 243 return 0;
222 244
223 fail: 245 fail:
@@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
335 } 357 }
336 set_cpu_sibling_map(0); 358 set_cpu_sibling_map(0);
337 359
360 xen_pmu_init(0);
361
338 if (xen_smp_intr_init(0)) 362 if (xen_smp_intr_init(0))
339 BUG(); 363 BUG();
340 364
@@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
462 if (rc) 486 if (rc)
463 return rc; 487 return rc;
464 488
489 xen_pmu_init(cpu);
490
465 rc = xen_smp_intr_init(cpu); 491 rc = xen_smp_intr_init(cpu);
466 if (rc) 492 if (rc)
467 return rc; 493 return rc;
@@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu)
503 xen_smp_intr_free(cpu); 529 xen_smp_intr_free(cpu);
504 xen_uninit_lock_cpu(cpu); 530 xen_uninit_lock_cpu(cpu);
505 xen_teardown_timer(cpu); 531 xen_teardown_timer(cpu);
532 xen_pmu_finish(cpu);
506 } 533 }
507} 534}
508 535
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 53b4c0811f4f..feddabdab448 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -11,6 +11,7 @@
11 11
12#include "xen-ops.h" 12#include "xen-ops.h"
13#include "mmu.h" 13#include "mmu.h"
14#include "pmu.h"
14 15
15static void xen_pv_pre_suspend(void) 16static void xen_pv_pre_suspend(void)
16{ 17{
@@ -67,16 +68,26 @@ static void xen_pv_post_suspend(int suspend_cancelled)
67 68
68void xen_arch_pre_suspend(void) 69void xen_arch_pre_suspend(void)
69{ 70{
70 if (xen_pv_domain()) 71 int cpu;
71 xen_pv_pre_suspend(); 72
73 for_each_online_cpu(cpu)
74 xen_pmu_finish(cpu);
75
76 if (xen_pv_domain())
77 xen_pv_pre_suspend();
72} 78}
73 79
74void xen_arch_post_suspend(int cancelled) 80void xen_arch_post_suspend(int cancelled)
75{ 81{
76 if (xen_pv_domain()) 82 int cpu;
77 xen_pv_post_suspend(cancelled); 83
78 else 84 if (xen_pv_domain())
79 xen_hvm_post_suspend(cancelled); 85 xen_pv_post_suspend(cancelled);
86 else
87 xen_hvm_post_suspend(cancelled);
88
89 for_each_online_cpu(cpu)
90 xen_pmu_init(cpu);
80} 91}
81 92
82static void xen_vcpu_notify_restore(void *data) 93static void xen_vcpu_notify_restore(void *data)
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 8afdfccf6086..b65f59a358a2 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -104,6 +104,8 @@ ENTRY(hypercall_page)
104 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) 104 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
105#else 105#else
106 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) 106 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
107 /* Map the p2m table to a 512GB-aligned user address. */
108 ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE)
107#endif 109#endif
108 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) 110 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
109 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) 111 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
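
The new XEN_ELFNOTE_INIT_P2M note asks the hypervisor to place the initial p2m list at virtual address PGDIR_SIZE. With 4-level paging on x86-64, PGDIR_SHIFT is 39, so PGDIR_SIZE is 512 GiB and the p2m list lands in its own 512GB-aligned top-level page-table slot, as the comment says. A quick arithmetic check:

#include <stdio.h>

int main(void)
{
	unsigned long pgdir_size = 1UL << 39;	/* PGDIR_SIZE with 4-level paging */

	printf("PGDIR_SIZE = %lu GiB\n", pgdir_size >> 30);	/* prints 512 */
	return 0;
}
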
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 2292721b1d10..1399423f3418 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -35,13 +35,20 @@ void xen_build_mfn_list_list(void);
35void xen_setup_machphys_mapping(void); 35void xen_setup_machphys_mapping(void);
36void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); 36void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
37void xen_reserve_top(void); 37void xen_reserve_top(void);
38void __init xen_reserve_special_pages(void);
39void __init xen_pt_check_e820(void);
38 40
39void xen_mm_pin_all(void); 41void xen_mm_pin_all(void);
40void xen_mm_unpin_all(void); 42void xen_mm_unpin_all(void);
43#ifdef CONFIG_X86_64
44void __init xen_relocate_p2m(void);
45#endif
41 46
47bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size);
42unsigned long __ref xen_chk_extra_mem(unsigned long pfn); 48unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
43void __init xen_inv_extra_mem(void); 49void __init xen_inv_extra_mem(void);
44void __init xen_remap_memory(void); 50void __init xen_remap_memory(void);
51phys_addr_t __init xen_find_free_area(phys_addr_t size);
45char * __init xen_memory_setup(void); 52char * __init xen_memory_setup(void);
46char * xen_auto_xlated_memory_setup(void); 53char * xen_auto_xlated_memory_setup(void);
47void __init xen_arch_setup(void); 54void __init xen_arch_setup(void);