author    Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>    2010-11-16 14:06:22 -0500
committer Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>    2010-11-16 14:06:22 -0500
commit    20b4755e4fbb226eb42951bd40b53fcbce9ef944 (patch)
tree      43da70e0b32ee423d3643ecd422821383411ab72 /arch/x86/xen
parent    744f9f104ea262de1dc3e29265870c649f0d9473 (diff)
parent    e53beacd23d9cb47590da6a7a7f6d417b941a994 (diff)
Merge commit 'v2.6.37-rc2' into upstream/xenfs
* commit 'v2.6.37-rc2': (10093 commits)
  Linux 2.6.37-rc2
  capabilities/syslog: open code cap_syslog logic to fix build failure
  i2c: Sanity checks on adapter registration
  i2c: Mark i2c_adapter.id as deprecated
  i2c: Drivers shouldn't include <linux/i2c-id.h>
  i2c: Delete unused adapter IDs
  i2c: Remove obsolete cleanup for clientdata
  include/linux/kernel.h: Move logging bits to include/linux/printk.h
  Fix gcc 4.5.1 miscompiling drivers/char/i8k.c (again)
  hwmon: (w83795) Check for BEEP pin availability
  hwmon: (w83795) Clear intrusion alarm immediately
  hwmon: (w83795) Read the intrusion state properly
  hwmon: (w83795) Print the actual temperature channels as sources
  hwmon: (w83795) List all usable temperature sources
  hwmon: (w83795) Expose fan control method
  hwmon: (w83795) Fix fan control mode attributes
  hwmon: (lm95241) Check validity of input values
  hwmon: Change mail address of Hans J. Koch
  PCI: sysfs: fix printk warnings
  GFS2: Fix inode deallocation race
  ...
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--   arch/x86/xen/Kconfig              21
-rw-r--r--   arch/x86/xen/debugfs.c             1
-rw-r--r--   arch/x86/xen/enlighten.c          27
-rw-r--r--   arch/x86/xen/mmu.c               504
-rw-r--r--   arch/x86/xen/mmu.h                 1
-rw-r--r--   arch/x86/xen/pci-swiotlb-xen.c     9
-rw-r--r--   arch/x86/xen/setup.c             134
-rw-r--r--   arch/x86/xen/smp.c                32
-rw-r--r--   arch/x86/xen/spinlock.c            2
-rw-r--r--   arch/x86/xen/xen-ops.h             3
10 files changed, 613 insertions(+), 121 deletions(-)
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 68128a1b401..5b54892e4bc 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -13,21 +13,28 @@ config XEN
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
14 Xen hypervisor. 14 Xen hypervisor.
15 15
16config XEN_DOM0
17 def_bool y
18 depends on XEN && PCI_XEN && SWIOTLB_XEN
19 depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
20
21# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST
22# name in tools.
23config XEN_PRIVILEGED_GUEST
24 def_bool XEN_DOM0
25
16config XEN_PVHVM 26config XEN_PVHVM
17 def_bool y 27 def_bool y
18 depends on XEN 28 depends on XEN
19 depends on X86_LOCAL_APIC 29 depends on X86_LOCAL_APIC
20 30
21config XEN_MAX_DOMAIN_MEMORY 31config XEN_MAX_DOMAIN_MEMORY
22 int "Maximum allowed size of a domain in gigabytes" 32 int
23 default 8 if X86_32 33 default 128
24 default 32 if X86_64
25 depends on XEN 34 depends on XEN
26 help 35 help
27 The pseudo-physical to machine address array is sized 36 This only affects the sizing of some bss arrays, the unused
28 according to the maximum possible memory size of a Xen 37 portions of which are freed.
29 domain. This array uses 1 page per gigabyte, so there's no
30 need to be too stingy here.
31 38
32config XEN_SAVE_RESTORE 39config XEN_SAVE_RESTORE
33 bool 40 bool
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index 1304bcec8ee..7c0fedd98ea 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -106,6 +106,7 @@ static const struct file_operations u32_array_fops = {
106 .open = u32_array_open, 106 .open = u32_array_open,
107 .release= xen_array_release, 107 .release= xen_array_release,
108 .read = u32_array_read, 108 .read = u32_array_read,
109 .llseek = no_llseek,
109}; 110};
110 111
111struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, 112struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 7d46c844141..235c0f4d386 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -30,6 +30,7 @@
30#include <linux/console.h> 30#include <linux/console.h>
31#include <linux/pci.h> 31#include <linux/pci.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/memblock.h>
33 34
34#include <xen/xen.h> 35#include <xen/xen.h>
35#include <xen/interface/xen.h> 36#include <xen/interface/xen.h>
@@ -45,6 +46,7 @@
45#include <asm/paravirt.h> 46#include <asm/paravirt.h>
46#include <asm/apic.h> 47#include <asm/apic.h>
47#include <asm/page.h> 48#include <asm/page.h>
49#include <asm/xen/pci.h>
48#include <asm/xen/hypercall.h> 50#include <asm/xen/hypercall.h>
49#include <asm/xen/hypervisor.h> 51#include <asm/xen/hypervisor.h>
50#include <asm/fixmap.h> 52#include <asm/fixmap.h>
@@ -58,7 +60,6 @@
58#include <asm/pgtable.h> 60#include <asm/pgtable.h>
59#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
60#include <asm/reboot.h> 62#include <asm/reboot.h>
61#include <asm/setup.h>
62#include <asm/stackprotector.h> 63#include <asm/stackprotector.h>
63#include <asm/hypervisor.h> 64#include <asm/hypervisor.h>
64 65
@@ -135,9 +136,6 @@ static void xen_vcpu_setup(int cpu)
135 info.mfn = arbitrary_virt_to_mfn(vcpup); 136 info.mfn = arbitrary_virt_to_mfn(vcpup);
136 info.offset = offset_in_page(vcpup); 137 info.offset = offset_in_page(vcpup);
137 138
138 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
139 cpu, vcpup, info.mfn, info.offset);
140
141 /* Check to see if the hypervisor will put the vcpu_info 139 /* Check to see if the hypervisor will put the vcpu_info
142 structure where we want it, which allows direct access via 140 structure where we want it, which allows direct access via
143 a percpu-variable. */ 141 a percpu-variable. */
@@ -151,9 +149,6 @@ static void xen_vcpu_setup(int cpu)
151 /* This cpu is using the registered vcpu info, even if 149 /* This cpu is using the registered vcpu info, even if
152 later ones fail to. */ 150 later ones fail to. */
153 per_cpu(xen_vcpu, cpu) = vcpup; 151 per_cpu(xen_vcpu, cpu) = vcpup;
154
155 printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
156 cpu, vcpup);
157 } 152 }
158} 153}
159 154
@@ -242,6 +237,7 @@ static __init void xen_init_cpuid_mask(void)
242 cpuid_leaf1_edx_mask = 237 cpuid_leaf1_edx_mask =
243 ~((1 << X86_FEATURE_MCE) | /* disable MCE */ 238 ~((1 << X86_FEATURE_MCE) | /* disable MCE */
244 (1 << X86_FEATURE_MCA) | /* disable MCA */ 239 (1 << X86_FEATURE_MCA) | /* disable MCA */
240 (1 << X86_FEATURE_MTRR) | /* disable MTRR */
245 (1 << X86_FEATURE_ACC)); /* thermal monitoring */ 241 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
246 242
247 if (!xen_initial_domain()) 243 if (!xen_initial_domain())
@@ -835,6 +831,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
835 Xen console noise. */ 831 Xen console noise. */
836 break; 832 break;
837 833
834 case MSR_IA32_CR_PAT:
835 if (smp_processor_id() == 0)
836 xen_set_pat(((u64)high << 32) | low);
837 break;
838
838 default: 839 default:
839 ret = native_write_msr_safe(msr, low, high); 840 ret = native_write_msr_safe(msr, low, high);
840 } 841 }
@@ -873,8 +874,6 @@ void xen_setup_vcpu_info_placement(void)
873 /* xen_vcpu_setup managed to place the vcpu_info within the 874 /* xen_vcpu_setup managed to place the vcpu_info within the
874 percpu area for all cpus, so make use of it */ 875 percpu area for all cpus, so make use of it */
875 if (have_vcpu_info_placement) { 876 if (have_vcpu_info_placement) {
876 printk(KERN_INFO "Xen: using vcpu_info placement\n");
877
878 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); 877 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
879 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); 878 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
880 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); 879 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
@@ -1018,7 +1017,7 @@ static void xen_reboot(int reason)
1018 struct sched_shutdown r = { .reason = reason }; 1017 struct sched_shutdown r = { .reason = reason };
1019 1018
1020#ifdef CONFIG_SMP 1019#ifdef CONFIG_SMP
1021 smp_send_stop(); 1020 stop_other_cpus();
1022#endif 1021#endif
1023 1022
1024 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) 1023 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
@@ -1183,8 +1182,14 @@ asmlinkage void __init xen_start_kernel(void)
1183 local_irq_disable(); 1182 local_irq_disable();
1184 early_boot_irqs_off(); 1183 early_boot_irqs_off();
1185 1184
1185 memblock_init();
1186
1186 xen_raw_console_write("mapping kernel into physical memory\n"); 1187 xen_raw_console_write("mapping kernel into physical memory\n");
1187 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); 1188 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1189 xen_ident_map_ISA();
1190
1191 /* Allocate and initialize top and mid mfn levels for p2m structure */
1192 xen_build_mfn_list_list();
1188 1193
1189 init_mm.pgd = pgd; 1194 init_mm.pgd = pgd;
1190 1195
@@ -1220,6 +1225,8 @@ asmlinkage void __init xen_start_kernel(void)
1220 add_preferred_console("xenboot", 0, NULL); 1225 add_preferred_console("xenboot", 0, NULL);
1221 add_preferred_console("tty", 0, NULL); 1226 add_preferred_console("tty", 0, NULL);
1222 add_preferred_console("hvc", 0, NULL); 1227 add_preferred_console("hvc", 0, NULL);
1228 if (pci_xen)
1229 x86_init.pci.arch_init = pci_xen_init;
1223 } else { 1230 } else {
1224 /* Make sure ACS will be enabled */ 1231 /* Make sure ACS will be enabled */
1225 pci_request_acs(); 1232 pci_request_acs();
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 792de4349c7..276c67bba5a 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -45,6 +45,7 @@
45#include <linux/vmalloc.h> 45#include <linux/vmalloc.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/gfp.h> 47#include <linux/gfp.h>
48#include <linux/memblock.h>
48 49
49#include <asm/pgtable.h> 50#include <asm/pgtable.h>
50#include <asm/tlbflush.h> 51#include <asm/tlbflush.h>
@@ -55,6 +56,8 @@
55#include <asm/e820.h> 56#include <asm/e820.h>
56#include <asm/linkage.h> 57#include <asm/linkage.h>
57#include <asm/page.h> 58#include <asm/page.h>
59#include <asm/init.h>
60#include <asm/pat.h>
58 61
59#include <asm/xen/hypercall.h> 62#include <asm/xen/hypercall.h>
60#include <asm/xen/hypervisor.h> 63#include <asm/xen/hypervisor.h>
@@ -138,7 +141,8 @@ static inline void check_zero(void)
138 * large enough to allocate page table pages to allocate the rest. 141 * large enough to allocate page table pages to allocate the rest.
139 * Each page can map 2MB. 142 * Each page can map 2MB.
140 */ 143 */
141static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; 144#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
145static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
142 146
143#ifdef CONFIG_X86_64 147#ifdef CONFIG_X86_64
144/* l3 pud for userspace vsyscall mapping */ 148/* l3 pud for userspace vsyscall mapping */
@@ -169,49 +173,182 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
169 */ 173 */
170#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) 174#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
171 175
176/*
177 * Xen leaves the responsibility for maintaining p2m mappings to the
178 * guests themselves, but it must also access and update the p2m array
179 * during suspend/resume when all the pages are reallocated.
180 *
181 * The p2m table is logically a flat array, but we implement it as a
182 * three-level tree to allow the address space to be sparse.
183 *
184 * Xen
185 * |
186 * p2m_top p2m_top_mfn
187 * / \ / \
188 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
189 * / \ / \ / /
190 * p2m p2m p2m p2m p2m p2m p2m ...
191 *
192 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
193 *
194 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
195 * maximum representable pseudo-physical address space is:
196 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
197 *
198 * P2M_PER_PAGE depends on the architecture, as a mfn is always
199 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
200 * 512 and 1024 entries respectively.
201 */
202
203unsigned long xen_max_p2m_pfn __read_mostly;
172 204
173#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) 205#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
174#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) 206#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
207#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
175 208
176/* Placeholder for holes in the address space */ 209#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
177static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
178 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
179 210
180 /* Array of pointers to pages containing p2m entries */ 211/* Placeholders for holes in the address space */
181static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data = 212static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
182 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; 213static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
214static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
183 215
184/* Arrays of p2m arrays expressed in mfns used for save/restore */ 216static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
185static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss; 217static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
218static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
186 219
187static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] 220RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
188 __page_aligned_bss; 221RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
189 222
190static inline unsigned p2m_top_index(unsigned long pfn) 223static inline unsigned p2m_top_index(unsigned long pfn)
191{ 224{
192 BUG_ON(pfn >= MAX_DOMAIN_PAGES); 225 BUG_ON(pfn >= MAX_P2M_PFN);
193 return pfn / P2M_ENTRIES_PER_PAGE; 226 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
227}
228
229static inline unsigned p2m_mid_index(unsigned long pfn)
230{
231 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
194} 232}
195 233
196static inline unsigned p2m_index(unsigned long pfn) 234static inline unsigned p2m_index(unsigned long pfn)
197{ 235{
198 return pfn % P2M_ENTRIES_PER_PAGE; 236 return pfn % P2M_PER_PAGE;
237}
238
239static void p2m_top_init(unsigned long ***top)
240{
241 unsigned i;
242
243 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
244 top[i] = p2m_mid_missing;
245}
246
247static void p2m_top_mfn_init(unsigned long *top)
248{
249 unsigned i;
250
251 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
252 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
253}
254
255static void p2m_top_mfn_p_init(unsigned long **top)
256{
257 unsigned i;
258
259 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
260 top[i] = p2m_mid_missing_mfn;
199} 261}
200 262
201/* Build the parallel p2m_top_mfn structures */ 263static void p2m_mid_init(unsigned long **mid)
264{
265 unsigned i;
266
267 for (i = 0; i < P2M_MID_PER_PAGE; i++)
268 mid[i] = p2m_missing;
269}
270
271static void p2m_mid_mfn_init(unsigned long *mid)
272{
273 unsigned i;
274
275 for (i = 0; i < P2M_MID_PER_PAGE; i++)
276 mid[i] = virt_to_mfn(p2m_missing);
277}
278
279static void p2m_init(unsigned long *p2m)
280{
281 unsigned i;
282
283 for (i = 0; i < P2M_MID_PER_PAGE; i++)
284 p2m[i] = INVALID_P2M_ENTRY;
285}
286
287/*
288 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
289 *
290 * This is called both at boot time, and after resuming from suspend:
291 * - At boot time we're called very early, and must use extend_brk()
292 * to allocate memory.
293 *
294 * - After resume we're called from within stop_machine, but the mfn
295 * tree should alreay be completely allocated.
296 */
202void xen_build_mfn_list_list(void) 297void xen_build_mfn_list_list(void)
203{ 298{
204 unsigned pfn, idx; 299 unsigned long pfn;
205 300
206 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { 301 /* Pre-initialize p2m_top_mfn to be completely missing */
207 unsigned topidx = p2m_top_index(pfn); 302 if (p2m_top_mfn == NULL) {
303 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
304 p2m_mid_mfn_init(p2m_mid_missing_mfn);
305
306 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
307 p2m_top_mfn_p_init(p2m_top_mfn_p);
208 308
209 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); 309 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
310 p2m_top_mfn_init(p2m_top_mfn);
311 } else {
312 /* Reinitialise, mfn's all change after migration */
313 p2m_mid_mfn_init(p2m_mid_missing_mfn);
210 } 314 }
211 315
212 for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { 316 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
213 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; 317 unsigned topidx = p2m_top_index(pfn);
214 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); 318 unsigned mididx = p2m_mid_index(pfn);
319 unsigned long **mid;
320 unsigned long *mid_mfn_p;
321
322 mid = p2m_top[topidx];
323 mid_mfn_p = p2m_top_mfn_p[topidx];
324
325 /* Don't bother allocating any mfn mid levels if
326 * they're just missing, just update the stored mfn,
327 * since all could have changed over a migrate.
328 */
329 if (mid == p2m_mid_missing) {
330 BUG_ON(mididx);
331 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
332 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
333 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
334 continue;
335 }
336
337 if (mid_mfn_p == p2m_mid_missing_mfn) {
338 /*
339 * XXX boot-time only! We should never find
340 * missing parts of the mfn tree after
341 * runtime. extend_brk() will BUG if we call
342 * it too late.
343 */
344 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
345 p2m_mid_mfn_init(mid_mfn_p);
346
347 p2m_top_mfn_p[topidx] = mid_mfn_p;
348 }
349
350 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
351 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
215 } 352 }
216} 353}
217 354
@@ -220,8 +357,8 @@ void xen_setup_mfn_list_list(void)
220 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); 357 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
221 358
222 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = 359 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
223 virt_to_mfn(p2m_top_mfn_list); 360 virt_to_mfn(p2m_top_mfn);
224 HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages; 361 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
225} 362}
226 363
227/* Set up p2m_top to point to the domain-builder provided p2m pages */ 364/* Set up p2m_top to point to the domain-builder provided p2m pages */
@@ -229,98 +366,176 @@ void __init xen_build_dynamic_phys_to_machine(void)
229{ 366{
230 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; 367 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
231 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); 368 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
232 unsigned pfn; 369 unsigned long pfn;
370
371 xen_max_p2m_pfn = max_pfn;
372
373 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
374 p2m_init(p2m_missing);
375
376 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
377 p2m_mid_init(p2m_mid_missing);
378
379 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
380 p2m_top_init(p2m_top);
233 381
234 for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { 382 /*
383 * The domain builder gives us a pre-constructed p2m array in
384 * mfn_list for all the pages initially given to us, so we just
385 * need to graft that into our tree structure.
386 */
387 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
235 unsigned topidx = p2m_top_index(pfn); 388 unsigned topidx = p2m_top_index(pfn);
389 unsigned mididx = p2m_mid_index(pfn);
236 390
237 p2m_top[topidx] = &mfn_list[pfn]; 391 if (p2m_top[topidx] == p2m_mid_missing) {
238 } 392 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
393 p2m_mid_init(mid);
239 394
240 xen_build_mfn_list_list(); 395 p2m_top[topidx] = mid;
396 }
397
398 p2m_top[topidx][mididx] = &mfn_list[pfn];
399 }
241} 400}
242 401
243unsigned long get_phys_to_machine(unsigned long pfn) 402unsigned long get_phys_to_machine(unsigned long pfn)
244{ 403{
245 unsigned topidx, idx; 404 unsigned topidx, mididx, idx;
246 405
247 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) 406 if (unlikely(pfn >= MAX_P2M_PFN))
248 return INVALID_P2M_ENTRY; 407 return INVALID_P2M_ENTRY;
249 408
250 topidx = p2m_top_index(pfn); 409 topidx = p2m_top_index(pfn);
410 mididx = p2m_mid_index(pfn);
251 idx = p2m_index(pfn); 411 idx = p2m_index(pfn);
252 return p2m_top[topidx][idx]; 412
413 return p2m_top[topidx][mididx][idx];
253} 414}
254EXPORT_SYMBOL_GPL(get_phys_to_machine); 415EXPORT_SYMBOL_GPL(get_phys_to_machine);
255 416
256/* install a new p2m_top page */ 417static void *alloc_p2m_page(void)
257bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
258{ 418{
259 unsigned topidx = p2m_top_index(pfn); 419 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
260 unsigned long **pfnp, *mfnp; 420}
261 unsigned i;
262 421
263 pfnp = &p2m_top[topidx]; 422static void free_p2m_page(void *p)
264 mfnp = &p2m_top_mfn[topidx]; 423{
424 free_page((unsigned long)p);
425}
265 426
266 for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) 427/*
267 p[i] = INVALID_P2M_ENTRY; 428 * Fully allocate the p2m structure for a given pfn. We need to check
429 * that both the top and mid levels are allocated, and make sure the
430 * parallel mfn tree is kept in sync. We may race with other cpus, so
431 * the new pages are installed with cmpxchg; if we lose the race then
432 * simply free the page we allocated and use the one that's there.
433 */
434static bool alloc_p2m(unsigned long pfn)
435{
436 unsigned topidx, mididx;
437 unsigned long ***top_p, **mid;
438 unsigned long *top_mfn_p, *mid_mfn;
268 439
269 if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) { 440 topidx = p2m_top_index(pfn);
270 *mfnp = virt_to_mfn(p); 441 mididx = p2m_mid_index(pfn);
271 return true; 442
443 top_p = &p2m_top[topidx];
444 mid = *top_p;
445
446 if (mid == p2m_mid_missing) {
447 /* Mid level is missing, allocate a new one */
448 mid = alloc_p2m_page();
449 if (!mid)
450 return false;
451
452 p2m_mid_init(mid);
453
454 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
455 free_p2m_page(mid);
272 } 456 }
273 457
274 return false; 458 top_mfn_p = &p2m_top_mfn[topidx];
275} 459 mid_mfn = p2m_top_mfn_p[topidx];
276 460
277static void alloc_p2m(unsigned long pfn) 461 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
278{
279 unsigned long *p;
280 462
281 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); 463 if (mid_mfn == p2m_mid_missing_mfn) {
282 BUG_ON(p == NULL); 464 /* Separately check the mid mfn level */
465 unsigned long missing_mfn;
466 unsigned long mid_mfn_mfn;
283 467
284 if (!install_p2mtop_page(pfn, p)) 468 mid_mfn = alloc_p2m_page();
285 free_page((unsigned long)p); 469 if (!mid_mfn)
470 return false;
471
472 p2m_mid_mfn_init(mid_mfn);
473
474 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
475 mid_mfn_mfn = virt_to_mfn(mid_mfn);
476 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
477 free_p2m_page(mid_mfn);
478 else
479 p2m_top_mfn_p[topidx] = mid_mfn;
480 }
481
482 if (p2m_top[topidx][mididx] == p2m_missing) {
483 /* p2m leaf page is missing */
484 unsigned long *p2m;
485
486 p2m = alloc_p2m_page();
487 if (!p2m)
488 return false;
489
490 p2m_init(p2m);
491
492 if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
493 free_p2m_page(p2m);
494 else
495 mid_mfn[mididx] = virt_to_mfn(p2m);
496 }
497
498 return true;
286} 499}
287 500
288/* Try to install p2m mapping; fail if intermediate bits missing */ 501/* Try to install p2m mapping; fail if intermediate bits missing */
289bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) 502bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
290{ 503{
291 unsigned topidx, idx; 504 unsigned topidx, mididx, idx;
292 505
293 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { 506 if (unlikely(pfn >= MAX_P2M_PFN)) {
294 BUG_ON(mfn != INVALID_P2M_ENTRY); 507 BUG_ON(mfn != INVALID_P2M_ENTRY);
295 return true; 508 return true;
296 } 509 }
297 510
298 topidx = p2m_top_index(pfn); 511 topidx = p2m_top_index(pfn);
299 if (p2m_top[topidx] == p2m_missing) { 512 mididx = p2m_mid_index(pfn);
300 if (mfn == INVALID_P2M_ENTRY)
301 return true;
302 return false;
303 }
304
305 idx = p2m_index(pfn); 513 idx = p2m_index(pfn);
306 p2m_top[topidx][idx] = mfn; 514
515 if (p2m_top[topidx][mididx] == p2m_missing)
516 return mfn == INVALID_P2M_ENTRY;
517
518 p2m_top[topidx][mididx][idx] = mfn;
307 519
308 return true; 520 return true;
309} 521}
310 522
311void set_phys_to_machine(unsigned long pfn, unsigned long mfn) 523bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
312{ 524{
313 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { 525 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
314 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); 526 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
315 return; 527 return true;
316 } 528 }
317 529
318 if (unlikely(!__set_phys_to_machine(pfn, mfn))) { 530 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
319 alloc_p2m(pfn); 531 if (!alloc_p2m(pfn))
532 return false;
320 533
321 if (!__set_phys_to_machine(pfn, mfn)) 534 if (!__set_phys_to_machine(pfn, mfn))
322 BUG(); 535 return false;
323 } 536 }
537
538 return true;
324} 539}
325 540
326unsigned long arbitrary_virt_to_mfn(void *vaddr) 541unsigned long arbitrary_virt_to_mfn(void *vaddr)
@@ -359,7 +574,8 @@ void make_lowmem_page_readonly(void *vaddr)
359 unsigned int level; 574 unsigned int level;
360 575
361 pte = lookup_address(address, &level); 576 pte = lookup_address(address, &level);
362 BUG_ON(pte == NULL); 577 if (pte == NULL)
578 return; /* vaddr missing */
363 579
364 ptev = pte_wrprotect(*pte); 580 ptev = pte_wrprotect(*pte);
365 581
@@ -374,7 +590,8 @@ void make_lowmem_page_readwrite(void *vaddr)
374 unsigned int level; 590 unsigned int level;
375 591
376 pte = lookup_address(address, &level); 592 pte = lookup_address(address, &level);
377 BUG_ON(pte == NULL); 593 if (pte == NULL)
594 return; /* vaddr missing */
378 595
379 ptev = pte_mkwrite(*pte); 596 ptev = pte_mkwrite(*pte);
380 597
@@ -563,7 +780,20 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
563 if (val & _PAGE_PRESENT) { 780 if (val & _PAGE_PRESENT) {
564 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 781 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
565 pteval_t flags = val & PTE_FLAGS_MASK; 782 pteval_t flags = val & PTE_FLAGS_MASK;
566 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; 783 unsigned long mfn = pfn_to_mfn(pfn);
784
785 /*
786 * If there's no mfn for the pfn, then just create an
787 * empty non-present pte. Unfortunately this loses
788 * information about the original pfn, so
789 * pte_mfn_to_pfn is asymmetric.
790 */
791 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
792 mfn = 0;
793 flags = 0;
794 }
795
796 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
567 } 797 }
568 798
569 return val; 799 return val;
@@ -585,10 +815,18 @@ static pteval_t iomap_pte(pteval_t val)
585 815
586pteval_t xen_pte_val(pte_t pte) 816pteval_t xen_pte_val(pte_t pte)
587{ 817{
588 if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) 818 pteval_t pteval = pte.pte;
589 return pte.pte; 819
820 /* If this is a WC pte, convert back from Xen WC to Linux WC */
821 if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
822 WARN_ON(!pat_enabled);
823 pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
824 }
825
826 if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
827 return pteval;
590 828
591 return pte_mfn_to_pfn(pte.pte); 829 return pte_mfn_to_pfn(pteval);
592} 830}
593PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); 831PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
594 832
@@ -598,10 +836,48 @@ pgdval_t xen_pgd_val(pgd_t pgd)
598} 836}
599PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); 837PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
600 838
839/*
840 * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
841 * are reserved for now, to correspond to the Intel-reserved PAT
842 * types.
843 *
844 * We expect Linux's PAT set as follows:
845 *
846 * Idx PTE flags Linux Xen Default
847 * 0 WB WB WB
848 * 1 PWT WC WT WT
849 * 2 PCD UC- UC- UC-
850 * 3 PCD PWT UC UC UC
851 * 4 PAT WB WC WB
852 * 5 PAT PWT WC WP WT
853 * 6 PAT PCD UC- UC UC-
854 * 7 PAT PCD PWT UC UC UC
855 */
856
857void xen_set_pat(u64 pat)
858{
859 /* We expect Linux to use a PAT setting of
860 * UC UC- WC WB (ignoring the PAT flag) */
861 WARN_ON(pat != 0x0007010600070106ull);
862}
863
601pte_t xen_make_pte(pteval_t pte) 864pte_t xen_make_pte(pteval_t pte)
602{ 865{
603 phys_addr_t addr = (pte & PTE_PFN_MASK); 866 phys_addr_t addr = (pte & PTE_PFN_MASK);
604 867
868 /* If Linux is trying to set a WC pte, then map to the Xen WC.
869 * If _PAGE_PAT is set, then it probably means it is really
870 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
871 * things work out OK...
872 *
873 * (We should never see kernel mappings with _PAGE_PSE set,
874 * but we could see hugetlbfs mappings, I think.).
875 */
876 if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
877 if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
878 pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
879 }
880
605 /* 881 /*
606 * Unprivileged domains are allowed to do IOMAPpings for 882 * Unprivileged domains are allowed to do IOMAPpings for
607 * PCI passthrough, but not map ISA space. The ISA 883 * PCI passthrough, but not map ISA space. The ISA
@@ -1514,13 +1790,25 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1514#endif 1790#endif
1515} 1791}
1516 1792
1517#ifdef CONFIG_X86_32
1518static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 1793static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1519{ 1794{
1795 unsigned long pfn = pte_pfn(pte);
1796
1797#ifdef CONFIG_X86_32
1520 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 1798 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1521 if (pte_val_ma(*ptep) & _PAGE_PRESENT) 1799 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1522 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & 1800 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1523 pte_val_ma(pte)); 1801 pte_val_ma(pte));
1802#endif
1803
1804 /*
1805 * If the new pfn is within the range of the newly allocated
1806 * kernel pagetable, and it isn't being mapped into an
1807 * early_ioremap fixmap slot, make sure it is RO.
1808 */
1809 if (!is_early_ioremap_ptep(ptep) &&
1810 pfn >= e820_table_start && pfn < e820_table_end)
1811 pte = pte_wrprotect(pte);
1524 1812
1525 return pte; 1813 return pte;
1526} 1814}
@@ -1533,7 +1821,6 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1533 1821
1534 xen_set_pte(ptep, pte); 1822 xen_set_pte(ptep, pte);
1535} 1823}
1536#endif
1537 1824
1538static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) 1825static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1539{ 1826{
@@ -1688,6 +1975,7 @@ static void *m2v(phys_addr_t maddr)
1688 return __ka(m2p(maddr)); 1975 return __ka(m2p(maddr));
1689} 1976}
1690 1977
1978/* Set the page permissions on an identity-mapped pages */
1691static void set_page_prot(void *addr, pgprot_t prot) 1979static void set_page_prot(void *addr, pgprot_t prot)
1692{ 1980{
1693 unsigned long pfn = __pa(addr) >> PAGE_SHIFT; 1981 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
@@ -1703,6 +1991,9 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1703 unsigned ident_pte; 1991 unsigned ident_pte;
1704 unsigned long pfn; 1992 unsigned long pfn;
1705 1993
1994 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1995 PAGE_SIZE);
1996
1706 ident_pte = 0; 1997 ident_pte = 0;
1707 pfn = 0; 1998 pfn = 0;
1708 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { 1999 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
@@ -1713,7 +2004,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1713 pte_page = m2v(pmd[pmdidx].pmd); 2004 pte_page = m2v(pmd[pmdidx].pmd);
1714 else { 2005 else {
1715 /* Check for free pte pages */ 2006 /* Check for free pte pages */
1716 if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) 2007 if (ident_pte == LEVEL1_IDENT_ENTRIES)
1717 break; 2008 break;
1718 2009
1719 pte_page = &level1_ident_pgt[ident_pte]; 2010 pte_page = &level1_ident_pgt[ident_pte];
@@ -1820,7 +2111,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1820 __xen_write_cr3(true, __pa(pgd)); 2111 __xen_write_cr3(true, __pa(pgd));
1821 xen_mc_issue(PARAVIRT_LAZY_CPU); 2112 xen_mc_issue(PARAVIRT_LAZY_CPU);
1822 2113
1823 reserve_early(__pa(xen_start_info->pt_base), 2114 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1824 __pa(xen_start_info->pt_base + 2115 __pa(xen_start_info->pt_base +
1825 xen_start_info->nr_pt_frames * PAGE_SIZE), 2116 xen_start_info->nr_pt_frames * PAGE_SIZE),
1826 "XEN PAGETABLES"); 2117 "XEN PAGETABLES");
@@ -1828,13 +2119,15 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1828 return pgd; 2119 return pgd;
1829} 2120}
1830#else /* !CONFIG_X86_64 */ 2121#else /* !CONFIG_X86_64 */
1831static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; 2122static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD);
1832 2123
1833__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, 2124__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1834 unsigned long max_pfn) 2125 unsigned long max_pfn)
1835{ 2126{
1836 pmd_t *kernel_pmd; 2127 pmd_t *kernel_pmd;
1837 2128
2129 level2_kernel_pgt = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2130
1838 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + 2131 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1839 xen_start_info->nr_pt_frames * PAGE_SIZE + 2132 xen_start_info->nr_pt_frames * PAGE_SIZE +
1840 512*1024); 2133 512*1024);
@@ -1858,7 +2151,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1858 2151
1859 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); 2152 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1860 2153
1861 reserve_early(__pa(xen_start_info->pt_base), 2154 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1862 __pa(xen_start_info->pt_base + 2155 __pa(xen_start_info->pt_base +
1863 xen_start_info->nr_pt_frames * PAGE_SIZE), 2156 xen_start_info->nr_pt_frames * PAGE_SIZE),
1864 "XEN PAGETABLES"); 2157 "XEN PAGETABLES");
@@ -1867,6 +2160,8 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1867} 2160}
1868#endif /* CONFIG_X86_64 */ 2161#endif /* CONFIG_X86_64 */
1869 2162
2163static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
2164
1870static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) 2165static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1871{ 2166{
1872 pte_t pte; 2167 pte_t pte;
@@ -1887,15 +2182,28 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1887#else 2182#else
1888 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: 2183 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1889#endif 2184#endif
1890#ifdef CONFIG_X86_LOCAL_APIC
1891 case FIX_APIC_BASE: /* maps dummy local APIC */
1892#endif
1893 case FIX_TEXT_POKE0: 2185 case FIX_TEXT_POKE0:
1894 case FIX_TEXT_POKE1: 2186 case FIX_TEXT_POKE1:
1895 /* All local page mappings */ 2187 /* All local page mappings */
1896 pte = pfn_pte(phys, prot); 2188 pte = pfn_pte(phys, prot);
1897 break; 2189 break;
1898 2190
2191#ifdef CONFIG_X86_LOCAL_APIC
2192 case FIX_APIC_BASE: /* maps dummy local APIC */
2193 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2194 break;
2195#endif
2196
2197#ifdef CONFIG_X86_IO_APIC
2198 case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
2199 /*
2200 * We just don't map the IO APIC - all access is via
2201 * hypercalls. Keep the address in the pte for reference.
2202 */
2203 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2204 break;
2205#endif
2206
1899 case FIX_PARAVIRT_BOOTMAP: 2207 case FIX_PARAVIRT_BOOTMAP:
1900 /* This is an MFN, but it isn't an IO mapping from the 2208 /* This is an MFN, but it isn't an IO mapping from the
1901 IO domain */ 2209 IO domain */
@@ -1920,6 +2228,29 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1920#endif 2228#endif
1921} 2229}
1922 2230
2231__init void xen_ident_map_ISA(void)
2232{
2233 unsigned long pa;
2234
2235 /*
2236 * If we're dom0, then linear map the ISA machine addresses into
2237 * the kernel's address space.
2238 */
2239 if (!xen_initial_domain())
2240 return;
2241
2242 xen_raw_printk("Xen: setup ISA identity maps\n");
2243
2244 for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
2245 pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
2246
2247 if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
2248 BUG();
2249 }
2250
2251 xen_flush_tlb();
2252}
2253
1923static __init void xen_post_allocator_init(void) 2254static __init void xen_post_allocator_init(void)
1924{ 2255{
1925 pv_mmu_ops.set_pte = xen_set_pte; 2256 pv_mmu_ops.set_pte = xen_set_pte;
@@ -1975,14 +2306,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1975 .alloc_pte = xen_alloc_pte_init, 2306 .alloc_pte = xen_alloc_pte_init,
1976 .release_pte = xen_release_pte_init, 2307 .release_pte = xen_release_pte_init,
1977 .alloc_pmd = xen_alloc_pmd_init, 2308 .alloc_pmd = xen_alloc_pmd_init,
1978 .alloc_pmd_clone = paravirt_nop,
1979 .release_pmd = xen_release_pmd_init, 2309 .release_pmd = xen_release_pmd_init,
1980 2310
1981#ifdef CONFIG_X86_64
1982 .set_pte = xen_set_pte,
1983#else
1984 .set_pte = xen_set_pte_init, 2311 .set_pte = xen_set_pte_init,
1985#endif
1986 .set_pte_at = xen_set_pte_at, 2312 .set_pte_at = xen_set_pte_at,
1987 .set_pmd = xen_set_pmd_hyper, 2313 .set_pmd = xen_set_pmd_hyper,
1988 2314
@@ -2033,6 +2359,8 @@ void __init xen_init_mmu_ops(void)
2033 pv_mmu_ops = xen_mmu_ops; 2359 pv_mmu_ops = xen_mmu_ops;
2034 2360
2035 vmap_lazy_unmap = false; 2361 vmap_lazy_unmap = false;
2362
2363 memset(dummy_mapping, 0xff, PAGE_SIZE);
2036} 2364}
2037 2365
2038/* Protected by xen_reservation_lock. */ 2366/* Protected by xen_reservation_lock. */
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index fa938c4aa2f..537bb9aab77 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -12,7 +12,6 @@ enum pt_level {
12 12
13 13
14bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); 14bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
15bool install_p2mtop_page(unsigned long pfn, unsigned long *p);
16 15
17void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 16void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
18 17
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index a013ec9d0c5..bfd0632fe65 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -1,10 +1,12 @@
1/* Glue code to lib/swiotlb-xen.c */ 1/* Glue code to lib/swiotlb-xen.c */
2 2
3#include <linux/dma-mapping.h> 3#include <linux/dma-mapping.h>
4#include <linux/pci.h>
4#include <xen/swiotlb-xen.h> 5#include <xen/swiotlb-xen.h>
5 6
6#include <asm/xen/hypervisor.h> 7#include <asm/xen/hypervisor.h>
7#include <xen/xen.h> 8#include <xen/xen.h>
9#include <asm/iommu_table.h>
8 10
9int xen_swiotlb __read_mostly; 11int xen_swiotlb __read_mostly;
10 12
@@ -54,5 +56,12 @@ void __init pci_xen_swiotlb_init(void)
54 if (xen_swiotlb) { 56 if (xen_swiotlb) {
55 xen_swiotlb_init(1); 57 xen_swiotlb_init(1);
56 dma_ops = &xen_swiotlb_dma_ops; 58 dma_ops = &xen_swiotlb_dma_ops;
59
60 /* Make sure ACS will be enabled */
61 pci_request_acs();
57 } 62 }
58} 63}
64IOMMU_INIT_FINISH(pci_xen_swiotlb_detect,
65 0,
66 pci_xen_swiotlb_init,
67 0);
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 328b0030542..769c4b01fa3 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -8,6 +8,7 @@
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/pm.h> 10#include <linux/pm.h>
11#include <linux/memblock.h>
11 12
12#include <asm/elf.h> 13#include <asm/elf.h>
13#include <asm/vdso.h> 14#include <asm/vdso.h>
@@ -17,8 +18,10 @@
17#include <asm/xen/hypervisor.h> 18#include <asm/xen/hypervisor.h>
18#include <asm/xen/hypercall.h> 19#include <asm/xen/hypercall.h>
19 20
21#include <xen/xen.h>
20#include <xen/page.h> 22#include <xen/page.h>
21#include <xen/interface/callback.h> 23#include <xen/interface/callback.h>
24#include <xen/interface/memory.h>
22#include <xen/interface/physdev.h> 25#include <xen/interface/physdev.h>
23#include <xen/interface/memory.h> 26#include <xen/interface/memory.h>
24#include <xen/features.h> 27#include <xen/features.h>
@@ -33,6 +36,39 @@ extern void xen_sysenter_target(void);
33extern void xen_syscall_target(void); 36extern void xen_syscall_target(void);
34extern void xen_syscall32_target(void); 37extern void xen_syscall32_target(void);
35 38
39/* Amount of extra memory space we add to the e820 ranges */
40phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
41
42/*
43 * The maximum amount of extra memory compared to the base size. The
44 * main scaling factor is the size of struct page. At extreme ratios
45 * of base:extra, all the base memory can be filled with page
46 * structures for the extra memory, leaving no space for anything
47 * else.
48 *
49 * 10x seems like a reasonable balance between scaling flexibility and
50 * leaving a practically usable system.
51 */
52#define EXTRA_MEM_RATIO (10)
53
54static __init void xen_add_extra_mem(unsigned long pages)
55{
56 u64 size = (u64)pages * PAGE_SIZE;
57 u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
58
59 if (!pages)
60 return;
61
62 e820_add_region(extra_start, size, E820_RAM);
63 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
64
65 memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA");
66
67 xen_extra_mem_size += size;
68
69 xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
70}
71
36static unsigned long __init xen_release_chunk(phys_addr_t start_addr, 72static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
37 phys_addr_t end_addr) 73 phys_addr_t end_addr)
38{ 74{
@@ -82,16 +118,18 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
82 const struct e820map *e820) 118 const struct e820map *e820)
83{ 119{
84 phys_addr_t max_addr = PFN_PHYS(max_pfn); 120 phys_addr_t max_addr = PFN_PHYS(max_pfn);
85 phys_addr_t last_end = 0; 121 phys_addr_t last_end = ISA_END_ADDRESS;
86 unsigned long released = 0; 122 unsigned long released = 0;
87 int i; 123 int i;
88 124
125 /* Free any unused memory above the low 1Mbyte. */
89 for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { 126 for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
90 phys_addr_t end = e820->map[i].addr; 127 phys_addr_t end = e820->map[i].addr;
91 end = min(max_addr, end); 128 end = min(max_addr, end);
92 129
93 released += xen_release_chunk(last_end, end); 130 if (last_end < end)
94 last_end = e820->map[i].addr + e820->map[i].size; 131 released += xen_release_chunk(last_end, end);
132 last_end = max(last_end, e820->map[i].addr + e820->map[i].size);
95 } 133 }
96 134
97 if (last_end < max_addr) 135 if (last_end < max_addr)
@@ -104,21 +142,75 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
104/** 142/**
105 * machine_specific_memory_setup - Hook for machine specific memory setup. 143 * machine_specific_memory_setup - Hook for machine specific memory setup.
106 **/ 144 **/
107
108char * __init xen_memory_setup(void) 145char * __init xen_memory_setup(void)
109{ 146{
147 static struct e820entry map[E820MAX] __initdata;
148
110 unsigned long max_pfn = xen_start_info->nr_pages; 149 unsigned long max_pfn = xen_start_info->nr_pages;
150 unsigned long long mem_end;
151 int rc;
152 struct xen_memory_map memmap;
153 unsigned long extra_pages = 0;
154 unsigned long extra_limit;
155 int i;
156 int op;
111 157
112 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); 158 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
159 mem_end = PFN_PHYS(max_pfn);
160
161 memmap.nr_entries = E820MAX;
162 set_xen_guest_handle(memmap.buffer, map);
163
164 op = xen_initial_domain() ?
165 XENMEM_machine_memory_map :
166 XENMEM_memory_map;
167 rc = HYPERVISOR_memory_op(op, &memmap);
168 if (rc == -ENOSYS) {
169 BUG_ON(xen_initial_domain());
170 memmap.nr_entries = 1;
171 map[0].addr = 0ULL;
172 map[0].size = mem_end;
173 /* 8MB slack (to balance backend allocations). */
174 map[0].size += 8ULL << 20;
175 map[0].type = E820_RAM;
176 rc = 0;
177 }
178 BUG_ON(rc);
113 179
114 e820.nr_map = 0; 180 e820.nr_map = 0;
181 xen_extra_mem_start = mem_end;
182 for (i = 0; i < memmap.nr_entries; i++) {
183 unsigned long long end = map[i].addr + map[i].size;
184
185 if (map[i].type == E820_RAM) {
186 if (map[i].addr < mem_end && end > mem_end) {
187 /* Truncate region to max_mem. */
188 u64 delta = end - mem_end;
115 189
116 e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); 190 map[i].size -= delta;
191 extra_pages += PFN_DOWN(delta);
192
193 end = mem_end;
194 }
195 }
196
197 if (end > xen_extra_mem_start)
198 xen_extra_mem_start = end;
199
200 /* If region is non-RAM or below mem_end, add what remains */
201 if ((map[i].type != E820_RAM || map[i].addr < mem_end) &&
202 map[i].size > 0)
203 e820_add_region(map[i].addr, map[i].size, map[i].type);
204 }
117 205
118 /* 206 /*
119 * Even though this is normal, usable memory under Xen, reserve 207 * In domU, the ISA region is normal, usable memory, but we
120 * ISA memory anyway because too many things think they can poke 208 * reserve ISA memory anyway because too many things poke
121 * about in there. 209 * about in there.
210 *
211 * In Dom0, the host E820 information can leave gaps in the
212 * ISA range, which would cause us to release those pages. To
213 * avoid this, we unconditionally reserve them here.
122 */ 214 */
123 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, 215 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
124 E820_RESERVED); 216 E820_RESERVED);
@@ -129,13 +221,35 @@ char * __init xen_memory_setup(void)
129 * - xen_start_info 221 * - xen_start_info
130 * See comment above "struct start_info" in <xen/interface/xen.h> 222 * See comment above "struct start_info" in <xen/interface/xen.h>
131 */ 223 */
132 reserve_early(__pa(xen_start_info->mfn_list), 224 memblock_x86_reserve_range(__pa(xen_start_info->mfn_list),
133 __pa(xen_start_info->pt_base), 225 __pa(xen_start_info->pt_base),
134 "XEN START INFO"); 226 "XEN START INFO");
135 227
136 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 228 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
137 229
138 xen_return_unused_memory(xen_start_info->nr_pages, &e820); 230 extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);
231
232 /*
233 * Clamp the amount of extra memory to a EXTRA_MEM_RATIO
234 * factor the base size. On non-highmem systems, the base
235 * size is the full initial memory allocation; on highmem it
236 * is limited to the max size of lowmem, so that it doesn't
237 * get completely filled.
238 *
239 * In principle there could be a problem in lowmem systems if
240 * the initial memory is also very large with respect to
241 * lowmem, but we won't try to deal with that here.
242 */
243 extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
244 max_pfn + extra_pages);
245
246 if (extra_limit >= max_pfn)
247 extra_pages = extra_limit - max_pfn;
248 else
249 extra_pages = 0;
250
251 if (!xen_initial_domain())
252 xen_add_extra_mem(extra_pages);
139 253
140 return "Xen"; 254 return "Xen";
141} 255}
@@ -260,7 +374,5 @@ void __init xen_arch_setup(void)
260 374
261 pm_idle = xen_idle; 375 pm_idle = xen_idle;
262 376
263 paravirt_disable_iospace();
264
265 fiddle_vdso(); 377 fiddle_vdso();
266} 378}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 25f232b18a8..72a4c795904 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -28,6 +28,7 @@
28#include <asm/xen/interface.h> 28#include <asm/xen/interface.h>
29#include <asm/xen/hypercall.h> 29#include <asm/xen/hypercall.h>
30 30
31#include <xen/xen.h>
31#include <xen/page.h> 32#include <xen/page.h>
32#include <xen/events.h> 33#include <xen/events.h>
33 34
@@ -156,11 +157,35 @@ static void __init xen_fill_possible_map(void)
156{ 157{
157 int i, rc; 158 int i, rc;
158 159
160 if (xen_initial_domain())
161 return;
162
163 for (i = 0; i < nr_cpu_ids; i++) {
164 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
165 if (rc >= 0) {
166 num_processors++;
167 set_cpu_possible(i, true);
168 }
169 }
170}
171
172static void __init xen_filter_cpu_maps(void)
173{
174 int i, rc;
175
176 if (!xen_initial_domain())
177 return;
178
179 num_processors = 0;
180 disabled_cpus = 0;
159 for (i = 0; i < nr_cpu_ids; i++) { 181 for (i = 0; i < nr_cpu_ids; i++) {
160 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 182 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
161 if (rc >= 0) { 183 if (rc >= 0) {
162 num_processors++; 184 num_processors++;
163 set_cpu_possible(i, true); 185 set_cpu_possible(i, true);
186 } else {
187 set_cpu_possible(i, false);
188 set_cpu_present(i, false);
164 } 189 }
165 } 190 }
166} 191}
@@ -174,6 +199,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
174 old memory can be recycled */ 199 old memory can be recycled */
175 make_lowmem_page_readwrite(xen_initial_gdt); 200 make_lowmem_page_readwrite(xen_initial_gdt);
176 201
202 xen_filter_cpu_maps();
177 xen_setup_vcpu_info_placement(); 203 xen_setup_vcpu_info_placement();
178} 204}
179 205
@@ -400,9 +426,9 @@ static void stop_self(void *v)
400 BUG(); 426 BUG();
401} 427}
402 428
403static void xen_smp_send_stop(void) 429static void xen_stop_other_cpus(int wait)
404{ 430{
405 smp_call_function(stop_self, NULL, 0); 431 smp_call_function(stop_self, NULL, wait);
406} 432}
407 433
408static void xen_smp_send_reschedule(int cpu) 434static void xen_smp_send_reschedule(int cpu)
@@ -470,7 +496,7 @@ static const struct smp_ops xen_smp_ops __initdata = {
470 .cpu_disable = xen_cpu_disable, 496 .cpu_disable = xen_cpu_disable,
471 .play_dead = xen_play_dead, 497 .play_dead = xen_play_dead,
472 498
473 .smp_send_stop = xen_smp_send_stop, 499 .stop_other_cpus = xen_stop_other_cpus,
474 .smp_send_reschedule = xen_smp_send_reschedule, 500 .smp_send_reschedule = xen_smp_send_reschedule,
475 501
476 .send_call_func_ipi = xen_smp_send_call_function_ipi, 502 .send_call_func_ipi = xen_smp_send_call_function_ipi,
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index e0500646585..23e061b9327 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -224,7 +224,7 @@ static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enab
224 goto out; 224 goto out;
225 } 225 }
226 226
227 flags = __raw_local_save_flags(); 227 flags = arch_local_save_flags();
228 if (irq_enable) { 228 if (irq_enable) {
229 ADD_STATS(taken_slow_irqenable, 1); 229 ADD_STATS(taken_slow_irqenable, 1);
230 raw_local_irq_enable(); 230 raw_local_irq_enable();
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 7c8ab86163e..64044747348 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,6 +30,9 @@ void xen_setup_machphys_mapping(void);
30pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); 30pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
31void xen_ident_map_ISA(void); 31void xen_ident_map_ISA(void);
32void xen_reserve_top(void); 32void xen_reserve_top(void);
33extern unsigned long xen_max_p2m_pfn;
34
35void xen_set_pat(u64);
33 36
34char * __init xen_memory_setup(void); 37char * __init xen_memory_setup(void);
35void __init xen_arch_setup(void); 38void __init xen_arch_setup(void);