Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig                      |   9
-rw-r--r--  arch/x86/ia32/ia32_aout.c             |   6
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c    |  39
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c    |   2
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c      |  73
-rw-r--r--  arch/x86/kernel/pci-dma.c             |  27
-rw-r--r--  arch/x86/kernel/pci-gart_64.c         |   3
-rw-r--r--  arch/x86/kernel/pci-nommu.c           |  14
-rw-r--r--  arch/x86/kernel/pci-swiotlb_64.c      |   2
-rw-r--r--  arch/x86/kernel/relocate_kernel_32.S  | 174
-rw-r--r--  arch/x86/mm/Makefile                  |   1
-rw-r--r--  arch/x86/mm/gup.c                     | 295
-rw-r--r--  arch/x86/mm/init_64.c                 |  37
-rw-r--r--  arch/x86/mm/pgtable_32.c              |  47
14 files changed, 535 insertions(+), 194 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 39ae67985950..b6fa2877b173 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,6 +22,7 @@ config X86
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_IOREMAP_PROT
+	select HAVE_GET_USER_PAGES_FAST
 	select HAVE_KPROBES
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select HAVE_KRETPROBES
@@ -1275,6 +1276,14 @@ config CRASH_DUMP
 	  (CONFIG_RELOCATABLE=y).
 	  For more details see Documentation/kdump/kdump.txt
 
+config KEXEC_JUMP
+	bool "kexec jump (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on KEXEC && HIBERNATION && X86_32
+	help
+	  Jump between original kernel and kexeced kernel and invoke
+	  code in physical address mode via KEXEC
+
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
 	default "0x1000000" if X86_NUMAQ
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 58cccb6483b0..a0e1dbe67dc1 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -441,12 +441,6 @@ beyond_if:
 	regs->r8 = regs->r9 = regs->r10 = regs->r11 =
 	regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
 	set_fs(USER_DS);
-	if (unlikely(current->ptrace & PT_PTRACED)) {
-		if (current->ptrace & PT_TRACE_EXEC)
-			ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
-		else
-			send_sig(SIGTRAP, current, 0);
-	}
 	return 0;
 }
 
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8864230d55af..9fe478d98406 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -22,6 +22,7 @@
 #include <asm/cpufeature.h>
 #include <asm/desc.h>
 #include <asm/system.h>
+#include <asm/cacheflush.h>
 
 #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
 static u32 kexec_pgd[1024] PAGE_ALIGNED;
@@ -85,10 +86,12 @@ static void load_segments(void)
  * reboot code buffer to allow us to avoid allocations
  * later.
  *
- * Currently nothing.
+ * Make control page executable.
  */
 int machine_kexec_prepare(struct kimage *image)
 {
+	if (nx_enabled)
+		set_pages_x(image->control_code_page, 1);
 	return 0;
 }
 
@@ -98,27 +101,48 @@ int machine_kexec_prepare(struct kimage *image)
  */
 void machine_kexec_cleanup(struct kimage *image)
 {
+	if (nx_enabled)
+		set_pages_nx(image->control_code_page, 1);
 }
 
 /*
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
  */
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
+	asmlinkage unsigned long
+		(*relocate_kernel_ptr)(unsigned long indirection_page,
+				       unsigned long control_page,
+				       unsigned long start_address,
+				       unsigned int has_pae,
+				       unsigned int preserve_context);
 
 	tracer_disable();
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 
+	if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+		/* We need to put APICs in legacy mode so that we can
+		 * get timer interrupts in second kernel. kexec/kdump
+		 * paths already have calls to disable_IO_APIC() in
+		 * one form or other. kexec jump path also need
+		 * one.
+		 */
+		disable_IO_APIC();
+#endif
+	}
+
 	control_page = page_address(image->control_code_page);
-	memcpy(control_page, relocate_kernel, PAGE_SIZE);
+	memcpy(control_page, relocate_kernel, PAGE_SIZE/2);
 
+	relocate_kernel_ptr = control_page;
 	page_list[PA_CONTROL_PAGE] = __pa(control_page);
-	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
 	page_list[PA_PGD] = __pa(kexec_pgd);
 	page_list[VA_PGD] = (unsigned long)kexec_pgd;
 #ifdef CONFIG_X86_PAE
@@ -131,6 +155,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
 	page_list[PA_PTE_1] = __pa(kexec_pte1);
 	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+	page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
 
 	/* The segment registers are funny things, they have both a
 	 * visible and an invisible part. Whenever the visible part is
@@ -149,8 +174,10 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	set_idt(phys_to_virt(0),0);
 
 	/* now call it */
-	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
-			image->start, cpu_has_pae);
+	image->start = relocate_kernel_ptr((unsigned long)image->head,
+					   (unsigned long)page_list,
+					   image->start, cpu_has_pae,
+					   image->preserve_context);
 }
 
 void arch_crash_save_vmcoreinfo(void)
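
With KEXEC_JUMP, relocate_kernel is no longer a one-way trip: when image->preserve_context is set it returns the peer kernel's re-entry point, which machine_kexec() stores back into image->start, as the last hunk above shows. A minimal C sketch of the resulting contract (illustrative only; the helper name is hypothetical and not part of this commit):

/* Hypothetical helper illustrating the preserve_context contract. */
static void kexec_jump_once(struct kimage *image)
{
	/* image->start holds the peer kernel's entry point. */
	machine_kexec(image);	/* returns only if image->preserve_context */

	/*
	 * relocate_kernel stored the peer's re-entry point back into
	 * image->start, so the next machine_kexec() call resumes the
	 * peer where it left off -- a coroutine-style switch.
	 */
}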
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9dd9262693a3..c43caa3a91f3 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image)
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
  */
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 19e7fc7c2c4f..b67a4b1d4eae 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/scatterlist.h>
 #include <linux/iommu-helper.h>
+
 #include <asm/iommu.h>
 #include <asm/calgary.h>
 #include <asm/tce.h>
@@ -413,22 +414,6 @@ static void calgary_unmap_sg(struct device *dev,
 	}
 }
 
-static int calgary_nontranslate_map_sg(struct device* dev,
-	struct scatterlist *sg, int nelems, int direction)
-{
-	struct scatterlist *s;
-	int i;
-
-	for_each_sg(sg, s, nelems, i) {
-		struct page *p = sg_page(s);
-
-		BUG_ON(!p);
-		s->dma_address = virt_to_bus(sg_virt(s));
-		s->dma_length = s->length;
-	}
-	return nelems;
-}
-
 static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
 	int nelems, int direction)
 {
@@ -439,9 +424,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
 	unsigned long entry;
 	int i;
 
-	if (!translation_enabled(tbl))
-		return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
-
 	for_each_sg(sg, s, nelems, i) {
 		BUG_ON(!sg_page(s));
 
@@ -477,7 +459,6 @@ error:
 static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
 	size_t size, int direction)
 {
-	dma_addr_t dma_handle = bad_dma_address;
 	void *vaddr = phys_to_virt(paddr);
 	unsigned long uaddr;
 	unsigned int npages;
@@ -486,12 +467,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
 	uaddr = (unsigned long)vaddr;
 	npages = num_dma_pages(uaddr, size);
 
-	if (translation_enabled(tbl))
-		dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
-	else
-		dma_handle = virt_to_bus(vaddr);
-
-	return dma_handle;
+	return iommu_alloc(dev, tbl, vaddr, npages, direction);
 }
 
 static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
@@ -500,9 +476,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
 	struct iommu_table *tbl = find_iommu_table(dev);
 	unsigned int npages;
 
-	if (!translation_enabled(tbl))
-		return;
-
 	npages = num_dma_pages(dma_handle, size);
 	iommu_free(tbl, dma_handle, npages);
 }
@@ -525,18 +498,12 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
 		goto error;
 	memset(ret, 0, size);
 
-	if (translation_enabled(tbl)) {
-		/* set up tces to cover the allocated range */
-		mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
-		if (mapping == bad_dma_address)
-			goto free;
-
-		*dma_handle = mapping;
-	} else /* non translated slot */
-		*dma_handle = virt_to_bus(ret);
-
+	/* set up tces to cover the allocated range */
+	mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
+	if (mapping == bad_dma_address)
+		goto free;
+	*dma_handle = mapping;
 	return ret;
-
 free:
 	free_pages((unsigned long)ret, get_order(size));
 	ret = NULL;
@@ -544,7 +511,7 @@ error:
 	return ret;
 }
 
-static const struct dma_mapping_ops calgary_dma_ops = {
+static struct dma_mapping_ops calgary_dma_ops = {
 	.alloc_coherent = calgary_alloc_coherent,
 	.map_single = calgary_map_single,
 	.unmap_single = calgary_unmap_single,
@@ -1241,6 +1208,16 @@ static int __init calgary_init(void)
 		goto error;
 	} while (1);
 
+	dev = NULL;
+	for_each_pci_dev(dev) {
+		struct iommu_table *tbl;
+
+		tbl = find_iommu_table(&dev->dev);
+
+		if (translation_enabled(tbl))
+			dev->dev.archdata.dma_ops = &calgary_dma_ops;
+	}
+
 	return ret;
 
 error:
@@ -1262,6 +1239,7 @@ error:
 		calgary_disable_translation(dev);
 		calgary_free_bus(dev);
 		pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
+		dev->dev.archdata.dma_ops = NULL;
 	} while (1);
 
 	return ret;
@@ -1503,6 +1481,10 @@ void __init detect_calgary(void)
 		printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
 		       "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
 		       debugging ? "enabled" : "disabled");
+
+		/* swiotlb for devices that aren't behind the Calgary. */
+		if (max_pfn > MAX_DMA32_PFN)
+			swiotlb = 1;
 	}
 	return;
 
@@ -1519,7 +1501,7 @@ int __init calgary_iommu_init(void)
 {
 	int ret;
 
-	if (no_iommu || swiotlb)
+	if (no_iommu || (swiotlb && !calgary_detected))
 		return -ENODEV;
 
 	if (!calgary_detected)
@@ -1532,15 +1514,14 @@ int __init calgary_iommu_init(void)
 	if (ret) {
 		printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
 		       "falling back to no_iommu\n", ret);
-		if (max_pfn > MAX_DMA32_PFN)
-			printk(KERN_ERR "WARNING more than 4GB of memory, "
-			       "32bit PCI may malfunction.\n");
 		return ret;
 	}
 
 	force_iommu = 1;
 	bad_dma_address = 0x0;
-	dma_ops = &calgary_dma_ops;
+	/* dma_ops is set to swiotlb or nommu */
+	if (!dma_ops)
+		dma_ops = &nommu_dma_ops;
 
 	return 0;
 }
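
Note how calgary_init() now installs calgary_dma_ops per device via dev->dev.archdata.dma_ops instead of overriding the global dma_ops, which calgary_iommu_init() leaves pointing at swiotlb or nommu for devices without a translated slot. This relies on a get_dma_ops() helper from the same patch series; a sketch, assuming it simply prefers the per-device ops (on 32-bit there is no archdata, so the global table is always used):

static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
{
#ifdef CONFIG_X86_32
	return dma_ops;
#else
	/* Per-device ops (e.g. calgary_dma_ops) win over the global
	 * dma_ops, which detect_calgary() steers to swiotlb on >4GB
	 * machines. */
	if (unlikely(!dev) || !dev->archdata.dma_ops)
		return dma_ops;
	else
		return dev->archdata.dma_ops;
#endif
}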
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index cbecb05551bb..37544123896d 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,7 +11,7 @@
 
 static int forbid_dac __read_mostly;
 
-const struct dma_mapping_ops *dma_ops;
+struct dma_mapping_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 static int iommu_sac_force __read_mostly;
@@ -312,6 +312,8 @@ static int dma_release_coherent(struct device *dev, int order, void *vaddr)
 
 int dma_supported(struct device *dev, u64 mask)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
+
 #ifdef CONFIG_PCI
 	if (mask > 0xffffffff && forbid_dac > 0) {
 		dev_info(dev, "PCI: Disallowing DAC for device\n");
@@ -319,8 +321,8 @@ int dma_supported(struct device *dev, u64 mask)
 	}
 #endif
 
-	if (dma_ops->dma_supported)
-		return dma_ops->dma_supported(dev, mask);
+	if (ops->dma_supported)
+		return ops->dma_supported(dev, mask);
 
 	/* Copied from i386. Doesn't make much sense, because it will
 	   only work for pci_alloc_coherent.
@@ -367,6 +369,7 @@ void *
 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		   gfp_t gfp)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
 	void *memory = NULL;
 	struct page *page;
 	unsigned long dma_mask = 0;
@@ -435,8 +438,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		/* Let low level make its own zone decisions */
 		gfp &= ~(GFP_DMA32|GFP_DMA);
 
-		if (dma_ops->alloc_coherent)
-			return dma_ops->alloc_coherent(dev, size,
+		if (ops->alloc_coherent)
+			return ops->alloc_coherent(dev, size,
 						       dma_handle, gfp);
 		return NULL;
 	}
@@ -448,14 +451,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		}
 	}
 
-	if (dma_ops->alloc_coherent) {
+	if (ops->alloc_coherent) {
 		free_pages((unsigned long)memory, get_order(size));
 		gfp &= ~(GFP_DMA|GFP_DMA32);
-		return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
+		return ops->alloc_coherent(dev, size, dma_handle, gfp);
 	}
 
-	if (dma_ops->map_simple) {
-		*dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory),
+	if (ops->map_simple) {
+		*dma_handle = ops->map_simple(dev, virt_to_phys(memory),
 						  size,
 						  PCI_DMA_BIDIRECTIONAL);
 		if (*dma_handle != bad_dma_address)
@@ -477,12 +480,14 @@ EXPORT_SYMBOL(dma_alloc_coherent);
 void dma_free_coherent(struct device *dev, size_t size,
 			 void *vaddr, dma_addr_t bus)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
+
 	int order = get_order(size);
 	WARN_ON(irqs_disabled());	/* for portability */
 	if (dma_release_coherent(dev, order, vaddr))
 		return;
-	if (dma_ops->unmap_single)
-		dma_ops->unmap_single(dev, bus, size, 0);
+	if (ops->unmap_single)
+		ops->unmap_single(dev, bus, size, 0);
 	free_pages((unsigned long)vaddr, order);
 }
 EXPORT_SYMBOL(dma_free_coherent);
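
Every mapping path in pci-dma.c now resolves its ops table per call. An illustrative caller-side view (hypothetical function, not from this commit):

/* Hypothetical sketch: how a single mapping dispatches after this change. */
static dma_addr_t map_for_device(struct device *dev, phys_addr_t paddr,
				 size_t size)
{
	struct dma_mapping_ops *ops = get_dma_ops(dev);

	/* A device behind a translated Calgary slot gets calgary_map_single;
	 * anything else goes through the global swiotlb or nommu ops. */
	return ops->map_single(dev, paddr, size, PCI_DMA_BIDIRECTIONAL);
}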
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index df5f142657d2..744126e64950 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -692,8 +692,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 
 extern int agp_amd64_init(void);
 
-static const struct dma_mapping_ops gart_dma_ops = {
-	.mapping_error = NULL,
+static struct dma_mapping_ops gart_dma_ops = {
 	.map_single = gart_map_single,
 	.map_simple = gart_map_simple,
 	.unmap_single = gart_unmap_single,
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 792b9179eff3..3f91f71cdc3e 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
 	return nents;
 }
 
-/* Make sure we keep the same behaviour */
-static int nommu_mapping_error(dma_addr_t dma_addr)
-{
-#ifdef CONFIG_X86_32
-	return 0;
-#else
-	return (dma_addr == bad_dma_address);
-#endif
-}
-
-
-const struct dma_mapping_ops nommu_dma_ops = {
+struct dma_mapping_ops nommu_dma_ops = {
 	.map_single = nommu_map_single,
 	.map_sg = nommu_map_sg,
-	.mapping_error = nommu_mapping_error,
 	.is_phys = 1,
 };
 
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 20df839b9c20..c4ce0332759e 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
 	return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
 }
 
-const struct dma_mapping_ops swiotlb_dma_ops = {
+struct dma_mapping_ops swiotlb_dma_ops = {
 	.mapping_error = swiotlb_dma_mapping_error,
 	.alloc_coherent = swiotlb_alloc_coherent,
 	.free_coherent = swiotlb_free_coherent,
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c30fe25d470d..703310a99023 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,11 +20,44 @@
 #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define PAE_PGD_ATTR (_PAGE_PRESENT)
 
+/* control_page + PAGE_SIZE/2 ~ control_page + PAGE_SIZE * 3/4 are
+ * used to save some data for jumping back
+ */
+#define DATA(offset)		(PAGE_SIZE/2+(offset))
+
+/* Minimal CPU state */
+#define ESP			DATA(0x0)
+#define CR0			DATA(0x4)
+#define CR3			DATA(0x8)
+#define CR4			DATA(0xc)
+
+/* other data */
+#define CP_VA_CONTROL_PAGE	DATA(0x10)
+#define CP_PA_PGD		DATA(0x14)
+#define CP_PA_SWAP_PAGE		DATA(0x18)
+#define CP_PA_BACKUP_PAGES_MAP	DATA(0x1c)
+
 .text
 .align PAGE_SIZE
 .globl relocate_kernel
 relocate_kernel:
-	movl	8(%esp), %ebp /* list of pages */
+	/* Save the CPU context, used for jumping back */
+
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebp
+	pushf
+
+	movl	20+8(%esp), %ebp /* list of pages */
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %edi
+	movl	%esp, ESP(%edi)
+	movl	%cr0, %eax
+	movl	%eax, CR0(%edi)
+	movl	%cr3, %eax
+	movl	%eax, CR3(%edi)
+	movl	%cr4, %eax
+	movl	%eax, CR4(%edi)
 
 #ifdef CONFIG_X86_PAE
 	/* map the control page at its virtual address */
@@ -138,15 +171,25 @@ relocate_kernel:
 
 relocate_new_kernel:
 	/* read the arguments and say goodbye to the stack */
-	movl	4(%esp), %ebx /* page_list */
-	movl	8(%esp), %ebp /* list of pages */
-	movl	12(%esp), %edx /* start address */
-	movl	16(%esp), %ecx /* cpu_has_pae */
+	movl	20+4(%esp), %ebx /* page_list */
+	movl	20+8(%esp), %ebp /* list of pages */
+	movl	20+12(%esp), %edx /* start address */
+	movl	20+16(%esp), %ecx /* cpu_has_pae */
+	movl	20+20(%esp), %esi /* preserve_context */
 
 	/* zero out flags, and disable interrupts */
 	pushl $0
 	popfl
 
+	/* save some information for jumping back */
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %edi
+	movl	%edi, CP_VA_CONTROL_PAGE(%edi)
+	movl	PTR(PA_PGD)(%ebp), %eax
+	movl	%eax, CP_PA_PGD(%edi)
+	movl	PTR(PA_SWAP_PAGE)(%ebp), %eax
+	movl	%eax, CP_PA_SWAP_PAGE(%edi)
+	movl	%ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
+
 	/* get physical address of control page now */
 	/* this is impossible after page table switch */
 	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edi
@@ -197,8 +240,90 @@ identity_mapped:
 	xorl	%eax, %eax
 	movl	%eax, %cr3
 
+	movl	CP_PA_SWAP_PAGE(%edi), %eax
+	pushl	%eax
+	pushl	%ebx
+	call	swap_pages
+	addl	$8, %esp
+
+	/* To be certain of avoiding problems with self-modifying code
+	 * I need to execute a serializing instruction here.
+	 * So I flush the TLB, it's handy, and not processor dependent.
+	 */
+	xorl	%eax, %eax
+	movl	%eax, %cr3
+
+	/* set all of the registers to known values */
+	/* leave %esp alone */
+
+	testl	%esi, %esi
+	jnz 1f
+	xorl	%edi, %edi
+	xorl	%eax, %eax
+	xorl	%ebx, %ebx
+	xorl	%ecx, %ecx
+	xorl	%edx, %edx
+	xorl	%esi, %esi
+	xorl	%ebp, %ebp
+	ret
+1:
+	popl	%edx
+	movl	CP_PA_SWAP_PAGE(%edi), %esp
+	addl	$PAGE_SIZE, %esp
+2:
+	call	*%edx
+
+	/* get the re-entry point of the peer system */
+	movl	0(%esp), %ebp
+	call	1f
+1:
+	popl	%ebx
+	subl	$(1b - relocate_kernel), %ebx
+	movl	CP_VA_CONTROL_PAGE(%ebx), %edi
+	lea	PAGE_SIZE(%ebx), %esp
+	movl	CP_PA_SWAP_PAGE(%ebx), %eax
+	movl	CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
+	pushl	%eax
+	pushl	%edx
+	call	swap_pages
+	addl	$8, %esp
+	movl	CP_PA_PGD(%ebx), %eax
+	movl	%eax, %cr3
+	movl	%cr0, %eax
+	orl	$(1<<31), %eax
+	movl	%eax, %cr0
+	lea	PAGE_SIZE(%edi), %esp
+	movl	%edi, %eax
+	addl	$(virtual_mapped - relocate_kernel), %eax
+	pushl	%eax
+	ret
+
+virtual_mapped:
+	movl	CR4(%edi), %eax
+	movl	%eax, %cr4
+	movl	CR3(%edi), %eax
+	movl	%eax, %cr3
+	movl	CR0(%edi), %eax
+	movl	%eax, %cr0
+	movl	ESP(%edi), %esp
+	movl	%ebp, %eax
+
+	popf
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	ret
+
 	/* Do the copies */
-	movl	%ebx, %ecx
+swap_pages:
+	movl	8(%esp), %edx
+	movl	4(%esp), %ecx
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%edi
+	pushl	%esi
+	movl	%ecx, %ebx
 	jmp	1f
 
 0:	/* top, read another word from the indirection page */
@@ -226,27 +351,28 @@ identity_mapped:
 	movl	%ecx,	%esi /* For every source page do a copy */
 	andl	$0xfffff000, %esi
 
+	movl	%edi, %eax
+	movl	%esi, %ebp
+
+	movl	%edx, %edi
 	movl	$1024, %ecx
 	rep ; movsl
-	jmp	0b
 
-3:
-
-	/* To be certain of avoiding problems with self-modifying code
-	 * I need to execute a serializing instruction here.
-	 * So I flush the TLB, it's handy, and not processor dependent.
-	 */
-	xorl	%eax, %eax
-	movl	%eax, %cr3
+	movl	%ebp, %edi
+	movl	%eax, %esi
+	movl	$1024, %ecx
+	rep ; movsl
 
-	/* set all of the registers to known values */
-	/* leave %esp alone */
+	movl	%eax, %edi
+	movl	%edx, %esi
+	movl	$1024, %ecx
+	rep ; movsl
 
-	xorl	%eax, %eax
-	xorl	%ebx, %ebx
-	xorl	%ecx, %ecx
-	xorl	%edx, %edx
-	xorl	%esi, %esi
-	xorl	%edi, %edi
-	xorl	%ebp, %ebp
-	ret
+	lea	PAGE_SIZE(%ebp), %esi
+	jmp	0b
+3:
+	popl	%esi
+	popl	%edi
+	popl	%ebx
+	popl	%ebp
+	ret
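
The three rep ; movsl runs added to swap_pages above exchange each destination page with its source through the dedicated swap page, rather than simply overwriting it; that is what keeps the old kernel's memory intact for the return trip. Per page, the sequence is equivalent to this C sketch (hypothetical helper, for illustration only):

static void swap_one_page(void *dst, void *src, void *swap)
{
	memcpy(swap, src, PAGE_SIZE);	/* stash the source page */
	memcpy(src, dst, PAGE_SIZE);	/* old destination into the source */
	memcpy(dst, swap, PAGE_SIZE);	/* stashed source into the destination */
}

Because the exchange is its own inverse, running swap_pages again on the way back restores every page to its original owner.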
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 1fbb844c3d7a..2977ea37791f 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,6 +1,7 @@
 obj-y	:=  init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
 	    pat.o pgtable.o
 
+obj-$(CONFIG_HAVE_GET_USER_PAGES_FAST) += gup.o
 obj-$(CONFIG_X86_32)		+= pgtable_32.o
 
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..3085f25b4355
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,295 @@
+/*
+ * Lockless get_user_pages_fast for x86
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/highmem.h>
+
+#include <asm/pgtable.h>
+
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+#ifndef CONFIG_X86_PAE
+	return *ptep;
+#else
+	/*
+	 * With get_user_pages_fast, we walk down the pagetables without taking
+	 * any locks. For this we would like to load the pointers atomically,
+	 * but that is not possible (without expensive cmpxchg8b) on PAE. What
+	 * we do have is the guarantee that a pte will only either go from not
+	 * present to present, or present to not present or both -- it will not
+	 * switch to a completely different present page without a TLB flush in
+	 * between; something that we are blocking by holding interrupts off.
+	 *
+	 * Setting ptes from not present to present goes:
+	 * ptep->pte_high = h;
+	 * smp_wmb();
+	 * ptep->pte_low = l;
+	 *
+	 * And present to not present goes:
+	 * ptep->pte_low = 0;
+	 * smp_wmb();
+	 * ptep->pte_high = 0;
+	 *
+	 * We must ensure here that the load of pte_low sees l iff pte_high
+	 * sees h. We load pte_high *after* loading pte_low, which ensures we
+	 * don't see an older value of pte_high. *Then* we recheck pte_low,
+	 * which ensures that we haven't picked up a changed pte high. We might
+	 * have got rubbish values from pte_low and pte_high, but we are
+	 * guaranteed that pte_low will not have the present bit set *unless*
+	 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
+	 * we're safe.
+	 *
+	 * gup_get_pte should not be used or copied outside gup.c without being
+	 * very careful -- it does not atomically load the pte or anything that
+	 * is likely to be useful for you.
+	 */
+	pte_t pte;
+
+retry:
+	pte.pte_low = ptep->pte_low;
+	smp_rmb();
+	pte.pte_high = ptep->pte_high;
+	smp_rmb();
+	if (unlikely(pte.pte_low != ptep->pte_low))
+		goto retry;
+
+	return pte;
+#endif
+}
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t *ptep;
+
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+
+	ptep = pte_offset_map(&pmd, addr);
+	do {
+		pte_t pte = gup_get_pte(ptep);
+		struct page *page;
+
+		if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+			pte_unmap(ptep);
+			return 0;
+		}
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+		get_page(page);
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+	pte_unmap(ptep - 1);
+
+	return 1;
+}
+
+static inline void get_head_page_multiple(struct page *page, int nr)
+{
+	VM_BUG_ON(page != compound_head(page));
+	VM_BUG_ON(page_count(page) == 0);
+	atomic_add(nr, &page->_count);
+}
+
+static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t pte = *(pte_t *)&pmd;
+	struct page *head, *page;
+	int refs;
+
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+	if ((pte_val(pte) & mask) != mask)
+		return 0;
+	/* hugepages are never "special" */
+	VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+	get_head_page_multiple(head, refs);
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = pmd_offset(&pud, addr);
+	do {
+		pmd_t pmd = *pmdp;
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		if (unlikely(pmd_large(pmd))) {
+			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+				return 0;
+		} else {
+			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t pte = *(pte_t *)&pud;
+	struct page *head, *page;
+	int refs;
+
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+	if ((pte_val(pte) & mask) != mask)
+		return 0;
+	/* hugepages are never "special" */
+	VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+	get_head_page_multiple(head, refs);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+			int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = pud_offset(&pgd, addr);
+	do {
+		pud_t pud = *pudp;
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (unlikely(pud_large(pud))) {
+			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
+				return 0;
+		} else {
+			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long end = start + (nr_pages << PAGE_SHIFT);
+	unsigned long addr = start;
+	unsigned long next;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+					start, nr_pages*PAGE_SIZE)))
+		goto slow_irqon;
+
+	/*
+	 * XXX: batch / limit 'nr', to avoid large irq off latency
+	 * needs some instrumenting to determine the common sizes used by
+	 * important workloads (eg. DB2), and whether limiting the batch size
+	 * will decrease performance.
+	 *
+	 * It seems like we're in the clear for the moment. Direct-IO is
+	 * the main guy that batches up lots of get_user_pages, and even
+	 * they are limited to 64-at-a-time which is not so many.
+	 */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables and pages from being freed on x86.
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86, with the above PAE exception), we can follow the
+	 * address down to the page and take a ref on it.
+	 */
+	local_irq_disable();
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_enable();
+
+	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+	{
+		int ret;
+
+slow:
+		local_irq_enable();
+slow_irqon:
+		/* Try to get the remaining pages with get_user_pages */
+		start += nr << PAGE_SHIFT;
+		pages += nr;
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		/* Have to be a bit careful with return values */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+
+		return ret;
+	}
+}
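
For context, a hypothetical caller of the new interface (not part of this commit): a driver pinning a user buffer for DMA. On the fast path get_user_pages_fast() never touches mmap_sem; disabling IRQs during the walk is what keeps the pagetables from being freed underneath it.

/* Hypothetical usage sketch; error handling abbreviated. */
static int pin_user_buffer(unsigned long start, int nr_pages,
			   struct page **pages)
{
	int nr = get_user_pages_fast(start, nr_pages, 1, pages);

	if (nr < 0)
		return nr;		/* nothing was pinned */
	if (nr < nr_pages) {
		while (nr-- > 0)	/* undo the partial pin */
			put_page(pages[nr]);
		return -EFAULT;
	}
	return 0;			/* all nr_pages pinned */
}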
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ec37121f6709..129618ca0ea2 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -86,43 +86,6 @@ early_param("gbpages", parse_direct_gbpages_on);
  * around without checking the pgd every time.
  */
 
-void show_mem(void)
-{
-	long i, total = 0, reserved = 0;
-	long shared = 0, cached = 0;
-	struct page *page;
-	pg_data_t *pgdat;
-
-	printk(KERN_INFO "Mem-info:\n");
-	show_free_areas();
-	for_each_online_pgdat(pgdat) {
-		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
-			/*
-			 * This loop can take a while with 256 GB and
-			 * 4k pages so defer the NMI watchdog:
-			 */
-			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
-				touch_nmi_watchdog();
-
-			if (!pfn_valid(pgdat->node_start_pfn + i))
-				continue;
-
-			page = pfn_to_page(pgdat->node_start_pfn + i);
-			total++;
-			if (PageReserved(page))
-				reserved++;
-			else if (PageSwapCache(page))
-				cached++;
-			else if (page_count(page))
-				shared += page_count(page) - 1;
-		}
-	}
-	printk(KERN_INFO "%lu pages of RAM\n", total);
-	printk(KERN_INFO "%lu reserved pages\n", reserved);
-	printk(KERN_INFO "%lu pages shared\n", shared);
-	printk(KERN_INFO "%lu pages swap cached\n", cached);
-}
-
 int after_bootmem;
 
 static __init void *spp_getpage(void)
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index b4becbf8c570..cab0abbd1ebe 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,53 +20,6 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-void show_mem(void)
-{
-	int total = 0, reserved = 0;
-	int shared = 0, cached = 0;
-	int highmem = 0;
-	struct page *page;
-	pg_data_t *pgdat;
-	unsigned long i;
-	unsigned long flags;
-
-	printk(KERN_INFO "Mem-info:\n");
-	show_free_areas();
-	for_each_online_pgdat(pgdat) {
-		pgdat_resize_lock(pgdat, &flags);
-		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
-			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
-				touch_nmi_watchdog();
-			page = pgdat_page_nr(pgdat, i);
-			total++;
-			if (PageHighMem(page))
-				highmem++;
-			if (PageReserved(page))
-				reserved++;
-			else if (PageSwapCache(page))
-				cached++;
-			else if (page_count(page))
-				shared += page_count(page) - 1;
-		}
-		pgdat_resize_unlock(pgdat, &flags);
-	}
-	printk(KERN_INFO "%d pages of RAM\n", total);
-	printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
-	printk(KERN_INFO "%d reserved pages\n", reserved);
-	printk(KERN_INFO "%d pages shared\n", shared);
-	printk(KERN_INFO "%d pages swap cached\n", cached);
-
-	printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
-	printk(KERN_INFO "%lu pages writeback\n",
-					global_page_state(NR_WRITEBACK));
-	printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
-	printk(KERN_INFO "%lu pages slab\n",
-		global_page_state(NR_SLAB_RECLAIMABLE) +
-		global_page_state(NR_SLAB_UNRECLAIMABLE));
-	printk(KERN_INFO "%lu pages pagetables\n",
-					global_page_state(NR_PAGETABLE));
-}
-
 /*
  * Associate a virtual page frame with a given physical page frame
  * and protection flags for that frame.