Diffstat (limited to 'arch/x86')
-rw-r--r--   arch/x86/mm/Makefile          |    5
-rw-r--r--   arch/x86/mm/Makefile_32       |   10
-rw-r--r--   arch/x86/mm/boot_ioremap_32.c |  100
-rw-r--r--   arch/x86/mm/discontig_32.c    |  431
-rw-r--r--   arch/x86/mm/extable_32.c      |   35
-rw-r--r--   arch/x86/mm/fault_32.c        |  657
-rw-r--r--   arch/x86/mm/highmem_32.c      |  113
-rw-r--r--   arch/x86/mm/hugetlbpage.c     |  391
-rw-r--r--   arch/x86/mm/init_32.c         |  858
-rw-r--r--   arch/x86/mm/ioremap_32.c      |  274
-rw-r--r--   arch/x86/mm/mmap_32.c         |   77
-rw-r--r--   arch/x86/mm/pageattr_32.c     |  278
-rw-r--r--   arch/x86/mm/pgtable_32.c      |  373
13 files changed, 3602 insertions(+), 0 deletions(-)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
new file mode 100644
index 000000000000..7317648e6587
--- /dev/null
+++ b/arch/x86/mm/Makefile
@@ -0,0 +1,5 @@
| 1 | ifeq ($(CONFIG_X86_32),y) | ||
| 2 | include ${srctree}/arch/x86/mm/Makefile_32 | ||
| 3 | else | ||
| 4 | include ${srctree}/arch/x86_64/mm/Makefile_64 | ||
| 5 | endif | ||
diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32
new file mode 100644
index 000000000000..362b4ad082de
--- /dev/null
+++ b/arch/x86/mm/Makefile_32
@@ -0,0 +1,10 @@
| 1 | # | ||
| 2 | # Makefile for the linux i386-specific parts of the memory manager. | ||
| 3 | # | ||
| 4 | |||
| 5 | obj-y := init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o | ||
| 6 | |||
| 7 | obj-$(CONFIG_NUMA) += discontig_32.o | ||
| 8 | obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o | ||
| 9 | obj-$(CONFIG_HIGHMEM) += highmem_32.o | ||
| 10 | obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o | ||
diff --git a/arch/x86/mm/boot_ioremap_32.c b/arch/x86/mm/boot_ioremap_32.c
new file mode 100644
index 000000000000..4de95a17a7d4
--- /dev/null
+++ b/arch/x86/mm/boot_ioremap_32.c
@@ -0,0 +1,100 @@
| 1 | /* | ||
| 2 | * arch/i386/mm/boot_ioremap.c | ||
| 3 | * | ||
| 4 | * Re-map functions for early boot-time before paging_init() when the | ||
| 5 | * boot-time pagetables are still in use | ||
| 6 | * | ||
| 7 | * Written by Dave Hansen <haveblue@us.ibm.com> | ||
| 8 | */ | ||
| 9 | |||
| 10 | |||
| 11 | /* | ||
| 12 | * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE | ||
| 13 | * keeps that from happening. If anyone has a better way, I'm listening. | ||
| 14 | * | ||
| 15 | * boot_pte_t is defined only if this all works correctly | ||
| 16 | */ | ||
| 17 | |||
| 18 | #undef CONFIG_X86_PAE | ||
| 19 | #undef CONFIG_PARAVIRT | ||
| 20 | #include <asm/page.h> | ||
| 21 | #include <asm/pgtable.h> | ||
| 22 | #include <asm/tlbflush.h> | ||
| 23 | #include <linux/init.h> | ||
| 24 | #include <linux/stddef.h> | ||
| 25 | |||
| 26 | /* | ||
| 27 | * I'm cheating here. It is known that the two boot PTE pages are | ||
| 28 | * allocated next to each other. I'm pretending that they're just | ||
| 29 | * one big array. | ||
| 30 | */ | ||
| 31 | |||
| 32 | #define BOOT_PTE_PTRS (PTRS_PER_PTE*2) | ||
| 33 | |||
| 34 | static unsigned long boot_pte_index(unsigned long vaddr) | ||
| 35 | { | ||
| 36 | return __pa(vaddr) >> PAGE_SHIFT; | ||
| 37 | } | ||
| 38 | |||
| 39 | static inline boot_pte_t* boot_vaddr_to_pte(void *address) | ||
| 40 | { | ||
| 41 | boot_pte_t* boot_pg = (boot_pte_t*)pg0; | ||
| 42 | return &boot_pg[boot_pte_index((unsigned long)address)]; | ||
| 43 | } | ||
| 44 | |||
| 45 | /* | ||
| 46 | * This is only for a caller who is clever enough to page-align | ||
| 47 | * phys_addr and virtual_source, and who also has a preference | ||
| 48 | * about which virtual address to steal ptes from | ||
| 49 | */ | ||
| 50 | static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages, | ||
| 51 | void* virtual_source) | ||
| 52 | { | ||
| 53 | boot_pte_t* pte; | ||
| 54 | int i; | ||
| 55 | char *vaddr = virtual_source; | ||
| 56 | |||
| 57 | pte = boot_vaddr_to_pte(virtual_source); | ||
| 58 | for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) { | ||
| 59 | set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL)); | ||
| 60 | __flush_tlb_one(&vaddr[i*PAGE_SIZE]); | ||
| 61 | } | ||
| 62 | } | ||
| 63 | |||
| 64 | /* the virtual space we're going to remap comes from this array */ | ||
| 65 | #define BOOT_IOREMAP_PAGES 4 | ||
| 66 | #define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE) | ||
| 67 | static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE] | ||
| 68 | __attribute__ ((aligned (PAGE_SIZE))); | ||
| 69 | |||
| 70 | /* | ||
| 71 | * This only applies to things which need to ioremap before paging_init() - | ||
| 72 | * bt_ioremap() and plain ioremap() are both useless at this point. | ||
| 73 | * | ||
| 74 | * When used, we're still using the boot-time pagetables, which only | ||
| 75 | * have 2 PTE pages mapping the first 8MB | ||
| 76 | * | ||
| 77 | * There is no unmap. The boot-time PTE pages aren't used after boot. | ||
| 78 | * If you really want the space back, just remap it yourself. | ||
| 79 | * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE) | ||
| 80 | */ | ||
| 81 | __init void* boot_ioremap(unsigned long phys_addr, unsigned long size) | ||
| 82 | { | ||
| 83 | unsigned long last_addr, offset; | ||
| 84 | unsigned int nrpages; | ||
| 85 | |||
| 86 | last_addr = phys_addr + size - 1; | ||
| 87 | |||
| 88 | /* page align the requested address */ | ||
| 89 | offset = phys_addr & ~PAGE_MASK; | ||
| 90 | phys_addr &= PAGE_MASK; | ||
| 91 | size = PAGE_ALIGN(last_addr) - phys_addr; | ||
| 92 | |||
| 93 | nrpages = size >> PAGE_SHIFT; | ||
| 94 | if (nrpages > BOOT_IOREMAP_PAGES) | ||
| 95 | return NULL; | ||
| 96 | |||
| 97 | __boot_ioremap(phys_addr, nrpages, boot_ioremap_space); | ||
| 98 | |||
| 99 | return &boot_ioremap_space[offset]; | ||
| 100 | } | ||
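For context, a minimal usage sketch of the boot_ioremap() interface defined above, as a pre-paging_init() caller might use it; the function name example_peek_firmware_table and the table_phys address are hypothetical, not part of this commit:

#include <linux/init.h>

/* boot_ioremap() is the function defined above; table_phys is hypothetical. */
static int __init example_peek_firmware_table(unsigned long table_phys)
{
	char *p = boot_ioremap(table_phys, 64);

	if (!p)			/* the request needed more than BOOT_IOREMAP_PAGES pages */
		return -1;
	return p[0];		/* note: there is no boot_iounmap() counterpart */
}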
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
new file mode 100644
index 000000000000..860e912a3fbb
--- /dev/null
+++ b/arch/x86/mm/discontig_32.c
@@ -0,0 +1,431 @@
| 1 | /* | ||
| 2 | * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation | ||
| 3 | * August 2002: added remote node KVA remap - Martin J. Bligh | ||
| 4 | * | ||
| 5 | * Copyright (C) 2002, IBM Corp. | ||
| 6 | * | ||
| 7 | * All rights reserved. | ||
| 8 | * | ||
| 9 | * This program is free software; you can redistribute it and/or modify | ||
| 10 | * it under the terms of the GNU General Public License as published by | ||
| 11 | * the Free Software Foundation; either version 2 of the License, or | ||
| 12 | * (at your option) any later version. | ||
| 13 | * | ||
| 14 | * This program is distributed in the hope that it will be useful, but | ||
| 15 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 16 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
| 17 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
| 18 | * details. | ||
| 19 | * | ||
| 20 | * You should have received a copy of the GNU General Public License | ||
| 21 | * along with this program; if not, write to the Free Software | ||
| 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
| 23 | */ | ||
| 24 | |||
| 25 | #include <linux/mm.h> | ||
| 26 | #include <linux/bootmem.h> | ||
| 27 | #include <linux/mmzone.h> | ||
| 28 | #include <linux/highmem.h> | ||
| 29 | #include <linux/initrd.h> | ||
| 30 | #include <linux/nodemask.h> | ||
| 31 | #include <linux/module.h> | ||
| 32 | #include <linux/kexec.h> | ||
| 33 | #include <linux/pfn.h> | ||
| 34 | #include <linux/swap.h> | ||
| 35 | |||
| 36 | #include <asm/e820.h> | ||
| 37 | #include <asm/setup.h> | ||
| 38 | #include <asm/mmzone.h> | ||
| 39 | #include <bios_ebda.h> | ||
| 40 | |||
| 41 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; | ||
| 42 | EXPORT_SYMBOL(node_data); | ||
| 43 | bootmem_data_t node0_bdata; | ||
| 44 | |||
| 45 | /* | ||
| 46 | * numa interface - we expect the numa architecture specific code to have | ||
| 47 | * populated the following initialisation. | ||
| 48 | * | ||
| 49 | * 1) node_online_map - the map of all nodes configured (online) in the system | ||
| 50 | * 2) node_start_pfn - the starting page frame number for a node | ||
| 51 | * 3) node_end_pfn - the ending page frame number for a node | ||
| 52 | */ | ||
| 53 | unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; | ||
| 54 | unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; | ||
| 55 | |||
| 56 | |||
| 57 | #ifdef CONFIG_DISCONTIGMEM | ||
| 58 | /* | ||
| 59 | * 4) physnode_map - the mapping between a pfn and owning node | ||
| 60 | * physnode_map keeps track of the physical memory layout of a generic | ||
| 61 | * numa node on a 256Mb break (each element of the array will | ||
| 62 | * represent 256Mb of memory and will be marked by the node id). So, | ||
| 63 | * if the first gig is on node 0 and the second gig is on node 1, | ||
| 64 | * physnode_map will contain: | ||
| 65 | * | ||
| 66 | * physnode_map[0-3] = 0; | ||
| 67 | * physnode_map[4-7] = 1; | ||
| 68 | * physnode_map[8- ] = -1; | ||
| 69 | */ | ||
| 70 | s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; | ||
| 71 | EXPORT_SYMBOL(physnode_map); | ||
| 72 | |||
| 73 | void memory_present(int nid, unsigned long start, unsigned long end) | ||
| 74 | { | ||
| 75 | unsigned long pfn; | ||
| 76 | |||
| 77 | printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n", | ||
| 78 | nid, start, end); | ||
| 79 | printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); | ||
| 80 | printk(KERN_DEBUG " "); | ||
| 81 | for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { | ||
| 82 | physnode_map[pfn / PAGES_PER_ELEMENT] = nid; | ||
| 83 | printk("%ld ", pfn); | ||
| 84 | } | ||
| 85 | printk("\n"); | ||
| 86 | } | ||
| 87 | |||
| 88 | unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, | ||
| 89 | unsigned long end_pfn) | ||
| 90 | { | ||
| 91 | unsigned long nr_pages = end_pfn - start_pfn; | ||
| 92 | |||
| 93 | if (!nr_pages) | ||
| 94 | return 0; | ||
| 95 | |||
| 96 | return (nr_pages + 1) * sizeof(struct page); | ||
| 97 | } | ||
| 98 | #endif | ||
| 99 | |||
| 100 | extern unsigned long find_max_low_pfn(void); | ||
| 101 | extern void add_one_highpage_init(struct page *, int, int); | ||
| 102 | extern unsigned long highend_pfn, highstart_pfn; | ||
| 103 | |||
| 104 | #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) | ||
| 105 | |||
| 106 | unsigned long node_remap_start_pfn[MAX_NUMNODES]; | ||
| 107 | unsigned long node_remap_size[MAX_NUMNODES]; | ||
| 108 | unsigned long node_remap_offset[MAX_NUMNODES]; | ||
| 109 | void *node_remap_start_vaddr[MAX_NUMNODES]; | ||
| 110 | void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); | ||
| 111 | |||
| 112 | void *node_remap_end_vaddr[MAX_NUMNODES]; | ||
| 113 | void *node_remap_alloc_vaddr[MAX_NUMNODES]; | ||
| 114 | static unsigned long kva_start_pfn; | ||
| 115 | static unsigned long kva_pages; | ||
| 116 | /* | ||
| 117 | * FLAT - support for basic PC memory model with discontig enabled, essentially | ||
| 118 | * a single node with all available processors in it with a flat | ||
| 119 | * memory map. | ||
| 120 | */ | ||
| 121 | int __init get_memcfg_numa_flat(void) | ||
| 122 | { | ||
| 123 | printk("NUMA - single node, flat memory mode\n"); | ||
| 124 | |||
| 125 | /* Run the memory configuration and find the top of memory. */ | ||
| 126 | find_max_pfn(); | ||
| 127 | node_start_pfn[0] = 0; | ||
| 128 | node_end_pfn[0] = max_pfn; | ||
| 129 | memory_present(0, 0, max_pfn); | ||
| 130 | |||
| 131 | /* Indicate there is one node available. */ | ||
| 132 | nodes_clear(node_online_map); | ||
| 133 | node_set_online(0); | ||
| 134 | return 1; | ||
| 135 | } | ||
| 136 | |||
| 137 | /* | ||
| 138 | * Find the highest page frame number we have available for the node | ||
| 139 | */ | ||
| 140 | static void __init find_max_pfn_node(int nid) | ||
| 141 | { | ||
| 142 | if (node_end_pfn[nid] > max_pfn) | ||
| 143 | node_end_pfn[nid] = max_pfn; | ||
| 144 | /* | ||
| 145 | * if a user has given mem=XXXX, then we need to make sure | ||
| 146 | * that the node _starts_ before that, too, not just ends | ||
| 147 | */ | ||
| 148 | if (node_start_pfn[nid] > max_pfn) | ||
| 149 | node_start_pfn[nid] = max_pfn; | ||
| 150 | BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]); | ||
| 151 | } | ||
| 152 | |||
| 153 | /* | ||
| 154 | * Allocate memory for the pg_data_t for this node via a crude pre-bootmem | ||
| 155 | * method. For node zero, take this from the bottom of memory; for | ||
| 156 | * subsequent nodes, place it at node_remap_start_vaddr, which contains | ||
| 157 | * node local data in physically node local memory. See setup_memory() | ||
| 158 | * for details. | ||
| 159 | */ | ||
| 160 | static void __init allocate_pgdat(int nid) | ||
| 161 | { | ||
| 162 | if (nid && node_has_online_mem(nid)) | ||
| 163 | NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; | ||
| 164 | else { | ||
| 165 | NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn)); | ||
| 166 | min_low_pfn += PFN_UP(sizeof(pg_data_t)); | ||
| 167 | } | ||
| 168 | } | ||
| 169 | |||
| 170 | void *alloc_remap(int nid, unsigned long size) | ||
| 171 | { | ||
| 172 | void *allocation = node_remap_alloc_vaddr[nid]; | ||
| 173 | |||
| 174 | size = ALIGN(size, L1_CACHE_BYTES); | ||
| 175 | |||
| 176 | if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) | ||
| 177 | return 0; | ||
| 178 | |||
| 179 | node_remap_alloc_vaddr[nid] += size; | ||
| 180 | memset(allocation, 0, size); | ||
| 181 | |||
| 182 | return allocation; | ||
| 183 | } | ||
| 184 | |||
| 185 | void __init remap_numa_kva(void) | ||
| 186 | { | ||
| 187 | void *vaddr; | ||
| 188 | unsigned long pfn; | ||
| 189 | int node; | ||
| 190 | |||
| 191 | for_each_online_node(node) { | ||
| 192 | for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { | ||
| 193 | vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); | ||
| 194 | set_pmd_pfn((ulong) vaddr, | ||
| 195 | node_remap_start_pfn[node] + pfn, | ||
| 196 | PAGE_KERNEL_LARGE); | ||
| 197 | } | ||
| 198 | } | ||
| 199 | } | ||
| 200 | |||
| 201 | static unsigned long calculate_numa_remap_pages(void) | ||
| 202 | { | ||
| 203 | int nid; | ||
| 204 | unsigned long size, reserve_pages = 0; | ||
| 205 | unsigned long pfn; | ||
| 206 | |||
| 207 | for_each_online_node(nid) { | ||
| 208 | unsigned old_end_pfn = node_end_pfn[nid]; | ||
| 209 | |||
| 210 | /* | ||
| 211 | * The acpi/srat node info can show hot-add memory zones | ||
| 212 | * where memory could be added but not currently present. | ||
| 213 | */ | ||
| 214 | if (node_start_pfn[nid] > max_pfn) | ||
| 215 | continue; | ||
| 216 | if (node_end_pfn[nid] > max_pfn) | ||
| 217 | node_end_pfn[nid] = max_pfn; | ||
| 218 | |||
| 219 | /* ensure the remap includes space for the pgdat. */ | ||
| 220 | size = node_remap_size[nid] + sizeof(pg_data_t); | ||
| 221 | |||
| 222 | /* convert size to large (pmd size) pages, rounding up */ | ||
| 223 | size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; | ||
| 224 | /* now the roundup is correct, convert to PAGE_SIZE pages */ | ||
| 225 | size = size * PTRS_PER_PTE; | ||
| 226 | |||
| 227 | /* | ||
| 228 | * Validate the region we are allocating only contains valid | ||
| 229 | * pages. | ||
| 230 | */ | ||
| 231 | for (pfn = node_end_pfn[nid] - size; | ||
| 232 | pfn < node_end_pfn[nid]; pfn++) | ||
| 233 | if (!page_is_ram(pfn)) | ||
| 234 | break; | ||
| 235 | |||
| 236 | if (pfn != node_end_pfn[nid]) | ||
| 237 | size = 0; | ||
| 238 | |||
| 239 | printk("Reserving %ld pages of KVA for lmem_map of node %d\n", | ||
| 240 | size, nid); | ||
| 241 | node_remap_size[nid] = size; | ||
| 242 | node_remap_offset[nid] = reserve_pages; | ||
| 243 | reserve_pages += size; | ||
| 244 | printk("Shrinking node %d from %ld pages to %ld pages\n", | ||
| 245 | nid, node_end_pfn[nid], node_end_pfn[nid] - size); | ||
| 246 | |||
| 247 | if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) { | ||
| 248 | /* | ||
| 249 | * Align node_end_pfn[] and node_remap_start_pfn[] to | ||
| 250 | * pmd boundary. remap_numa_kva will barf otherwise. | ||
| 251 | */ | ||
| 252 | printk("Shrinking node %d further by %ld pages for proper alignment\n", | ||
| 253 | nid, node_end_pfn[nid] & (PTRS_PER_PTE-1)); | ||
| 254 | size += node_end_pfn[nid] & (PTRS_PER_PTE-1); | ||
| 255 | } | ||
| 256 | |||
| 257 | node_end_pfn[nid] -= size; | ||
| 258 | node_remap_start_pfn[nid] = node_end_pfn[nid]; | ||
| 259 | shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]); | ||
| 260 | } | ||
| 261 | printk("Reserving total of %ld pages for numa KVA remap\n", | ||
| 262 | reserve_pages); | ||
| 263 | return reserve_pages; | ||
| 264 | } | ||
| 265 | |||
| 266 | extern void setup_bootmem_allocator(void); | ||
| 267 | unsigned long __init setup_memory(void) | ||
| 268 | { | ||
| 269 | int nid; | ||
| 270 | unsigned long system_start_pfn, system_max_low_pfn; | ||
| 271 | |||
| 272 | /* | ||
| 273 | * When mapping a NUMA machine we allocate the node_mem_map arrays | ||
| 274 | * from node local memory. They are then mapped directly into KVA | ||
| 275 | * between zone normal and vmalloc space. Calculate the size of | ||
| 276 | * this space and use it to adjust the boundary between ZONE_NORMAL | ||
| 277 | * and ZONE_HIGHMEM. | ||
| 278 | */ | ||
| 279 | find_max_pfn(); | ||
| 280 | get_memcfg_numa(); | ||
| 281 | |||
| 282 | kva_pages = calculate_numa_remap_pages(); | ||
| 283 | |||
| 284 | /* partially used pages are not usable - thus round upwards */ | ||
| 285 | system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end); | ||
| 286 | |||
| 287 | kva_start_pfn = find_max_low_pfn() - kva_pages; | ||
| 288 | |||
| 289 | #ifdef CONFIG_BLK_DEV_INITRD | ||
| 290 | /* Numa kva area is below the initrd */ | ||
| 291 | if (LOADER_TYPE && INITRD_START) | ||
| 292 | kva_start_pfn = PFN_DOWN(INITRD_START) - kva_pages; | ||
| 293 | #endif | ||
| 294 | kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1); | ||
| 295 | |||
| 296 | system_max_low_pfn = max_low_pfn = find_max_low_pfn(); | ||
| 297 | printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", | ||
| 298 | kva_start_pfn, max_low_pfn); | ||
| 299 | printk("max_pfn = %ld\n", max_pfn); | ||
| 300 | #ifdef CONFIG_HIGHMEM | ||
| 301 | highstart_pfn = highend_pfn = max_pfn; | ||
| 302 | if (max_pfn > system_max_low_pfn) | ||
| 303 | highstart_pfn = system_max_low_pfn; | ||
| 304 | printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", | ||
| 305 | pages_to_mb(highend_pfn - highstart_pfn)); | ||
| 306 | num_physpages = highend_pfn; | ||
| 307 | high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; | ||
| 308 | #else | ||
| 309 | num_physpages = system_max_low_pfn; | ||
| 310 | high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1; | ||
| 311 | #endif | ||
| 312 | printk(KERN_NOTICE "%ldMB LOWMEM available.\n", | ||
| 313 | pages_to_mb(system_max_low_pfn)); | ||
| 314 | printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n", | ||
| 315 | min_low_pfn, max_low_pfn, highstart_pfn); | ||
| 316 | |||
| 317 | printk("Low memory ends at vaddr %08lx\n", | ||
| 318 | (ulong) pfn_to_kaddr(max_low_pfn)); | ||
| 319 | for_each_online_node(nid) { | ||
| 320 | node_remap_start_vaddr[nid] = pfn_to_kaddr( | ||
| 321 | kva_start_pfn + node_remap_offset[nid]); | ||
| 322 | /* Init the node remap allocator */ | ||
| 323 | node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + | ||
| 324 | (node_remap_size[nid] * PAGE_SIZE); | ||
| 325 | node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + | ||
| 326 | ALIGN(sizeof(pg_data_t), PAGE_SIZE); | ||
| 327 | |||
| 328 | allocate_pgdat(nid); | ||
| 329 | printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, | ||
| 330 | (ulong) node_remap_start_vaddr[nid], | ||
| 331 | (ulong) pfn_to_kaddr(highstart_pfn | ||
| 332 | + node_remap_offset[nid] + node_remap_size[nid])); | ||
| 333 | } | ||
| 334 | printk("High memory starts at vaddr %08lx\n", | ||
| 335 | (ulong) pfn_to_kaddr(highstart_pfn)); | ||
| 336 | for_each_online_node(nid) | ||
| 337 | find_max_pfn_node(nid); | ||
| 338 | |||
| 339 | memset(NODE_DATA(0), 0, sizeof(struct pglist_data)); | ||
| 340 | NODE_DATA(0)->bdata = &node0_bdata; | ||
| 341 | setup_bootmem_allocator(); | ||
| 342 | return max_low_pfn; | ||
| 343 | } | ||
| 344 | |||
| 345 | void __init numa_kva_reserve(void) | ||
| 346 | { | ||
| 347 | reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages)); | ||
| 348 | } | ||
| 349 | |||
| 350 | void __init zone_sizes_init(void) | ||
| 351 | { | ||
| 352 | int nid; | ||
| 353 | unsigned long max_zone_pfns[MAX_NR_ZONES]; | ||
| 354 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); | ||
| 355 | max_zone_pfns[ZONE_DMA] = | ||
| 356 | virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; | ||
| 357 | max_zone_pfns[ZONE_NORMAL] = max_low_pfn; | ||
| 358 | #ifdef CONFIG_HIGHMEM | ||
| 359 | max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; | ||
| 360 | #endif | ||
| 361 | |||
| 362 | /* If SRAT has not registered memory, register it now */ | ||
| 363 | if (find_max_pfn_with_active_regions() == 0) { | ||
| 364 | for_each_online_node(nid) { | ||
| 365 | if (node_has_online_mem(nid)) | ||
| 366 | add_active_range(nid, node_start_pfn[nid], | ||
| 367 | node_end_pfn[nid]); | ||
| 368 | } | ||
| 369 | } | ||
| 370 | |||
| 371 | free_area_init_nodes(max_zone_pfns); | ||
| 372 | return; | ||
| 373 | } | ||
| 374 | |||
| 375 | void __init set_highmem_pages_init(int bad_ppro) | ||
| 376 | { | ||
| 377 | #ifdef CONFIG_HIGHMEM | ||
| 378 | struct zone *zone; | ||
| 379 | struct page *page; | ||
| 380 | |||
| 381 | for_each_zone(zone) { | ||
| 382 | unsigned long node_pfn, zone_start_pfn, zone_end_pfn; | ||
| 383 | |||
| 384 | if (!is_highmem(zone)) | ||
| 385 | continue; | ||
| 386 | |||
| 387 | zone_start_pfn = zone->zone_start_pfn; | ||
| 388 | zone_end_pfn = zone_start_pfn + zone->spanned_pages; | ||
| 389 | |||
| 390 | printk("Initializing %s for node %d (%08lx:%08lx)\n", | ||
| 391 | zone->name, zone_to_nid(zone), | ||
| 392 | zone_start_pfn, zone_end_pfn); | ||
| 393 | |||
| 394 | for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) { | ||
| 395 | if (!pfn_valid(node_pfn)) | ||
| 396 | continue; | ||
| 397 | page = pfn_to_page(node_pfn); | ||
| 398 | add_one_highpage_init(page, node_pfn, bad_ppro); | ||
| 399 | } | ||
| 400 | } | ||
| 401 | totalram_pages += totalhigh_pages; | ||
| 402 | #endif | ||
| 403 | } | ||
| 404 | |||
| 405 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
| 406 | int paddr_to_nid(u64 addr) | ||
| 407 | { | ||
| 408 | int nid; | ||
| 409 | unsigned long pfn = PFN_DOWN(addr); | ||
| 410 | |||
| 411 | for_each_node(nid) | ||
| 412 | if (node_start_pfn[nid] <= pfn && | ||
| 413 | pfn < node_end_pfn[nid]) | ||
| 414 | return nid; | ||
| 415 | |||
| 416 | return -1; | ||
| 417 | } | ||
| 418 | |||
| 419 | /* | ||
| 420 | * This function is used to look up the node id BEFORE memmap and mem_section | ||
| 421 | * initialization (pfn_to_nid() can't be used yet). | ||
| 422 | * If _PXM is not defined in the ACPI DSDT, the node id must be found this way. | ||
| 423 | */ | ||
| 424 | int memory_add_physaddr_to_nid(u64 addr) | ||
| 425 | { | ||
| 426 | int nid = paddr_to_nid(addr); | ||
| 427 | return (nid >= 0) ? nid : 0; | ||
| 428 | } | ||
| 429 | |||
| 430 | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | ||
| 431 | #endif | ||
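A minimal sketch of the pfn-to-node lookup that physnode_map enables under CONFIG_DISCONTIGMEM; the helper name example_pfn_to_nid is hypothetical, and PAGES_PER_ELEMENT is assumed to be the per-element pfn count from the mmzone headers:

#include <asm/mmzone.h>		/* physnode_map, PAGES_PER_ELEMENT (assumed) */

static inline int example_pfn_to_nid(unsigned long pfn)
{
	/* each element covers one 256MB stripe of physical memory */
	return (int)physnode_map[pfn / PAGES_PER_ELEMENT];
}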
diff --git a/arch/x86/mm/extable_32.c b/arch/x86/mm/extable_32.c
new file mode 100644
index 000000000000..0ce4f22a2635
--- /dev/null
+++ b/arch/x86/mm/extable_32.c
@@ -0,0 +1,35 @@
| 1 | /* | ||
| 2 | * linux/arch/i386/mm/extable.c | ||
| 3 | */ | ||
| 4 | |||
| 5 | #include <linux/module.h> | ||
| 6 | #include <linux/spinlock.h> | ||
| 7 | #include <asm/uaccess.h> | ||
| 8 | |||
| 9 | int fixup_exception(struct pt_regs *regs) | ||
| 10 | { | ||
| 11 | const struct exception_table_entry *fixup; | ||
| 12 | |||
| 13 | #ifdef CONFIG_PNPBIOS | ||
| 14 | if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs))) | ||
| 15 | { | ||
| 16 | extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; | ||
| 17 | extern u32 pnp_bios_is_utter_crap; | ||
| 18 | pnp_bios_is_utter_crap = 1; | ||
| 19 | printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n"); | ||
| 20 | __asm__ volatile( | ||
| 21 | "movl %0, %%esp\n\t" | ||
| 22 | "jmp *%1\n\t" | ||
| 23 | : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip)); | ||
| 24 | panic("do_trap: can't hit this"); | ||
| 25 | } | ||
| 26 | #endif | ||
| 27 | |||
| 28 | fixup = search_exception_tables(regs->eip); | ||
| 29 | if (fixup) { | ||
| 30 | regs->eip = fixup->fixup; | ||
| 31 | return 1; | ||
| 32 | } | ||
| 33 | |||
| 34 | return 0; | ||
| 35 | } | ||
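A minimal sketch of the producer side of the mechanism fixup_exception() consumes: a load instruction paired with a .fixup stub through an __ex_table entry, modeled on the i386 uaccess helpers of this era. The function name example_get_user_int is hypothetical, not part of this commit:

#include <linux/compiler.h>

static inline int example_get_user_int(int *val, const int __user *ptr)
{
	int err = 0;

	__asm__ __volatile__(
		"1:	movl %2,%1\n"
		"2:\n"
		".section .fixup,\"ax\"\n"
		"3:	movl $-14,%0\n"			/* -EFAULT */
		"	xorl %1,%1\n"
		"	jmp 2b\n"
		".previous\n"
		".section __ex_table,\"a\"\n"
		"	.align 4\n"
		"	.long 1b,3b\n"			/* faulting insn -> fixup address */
		".previous"
		: "=r" (err), "=r" (*val)
		: "m" (*ptr), "0" (err));
	return err;
}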
diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c
new file mode 100644
index 000000000000..fcb38e7f3543
--- /dev/null
+++ b/arch/x86/mm/fault_32.c
@@ -0,0 +1,657 @@
| 1 | /* | ||
| 2 | * linux/arch/i386/mm/fault.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1995 Linus Torvalds | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/signal.h> | ||
| 8 | #include <linux/sched.h> | ||
| 9 | #include <linux/kernel.h> | ||
| 10 | #include <linux/errno.h> | ||
| 11 | #include <linux/string.h> | ||
| 12 | #include <linux/types.h> | ||
| 13 | #include <linux/ptrace.h> | ||
| 14 | #include <linux/mman.h> | ||
| 15 | #include <linux/mm.h> | ||
| 16 | #include <linux/smp.h> | ||
| 17 | #include <linux/interrupt.h> | ||
| 18 | #include <linux/init.h> | ||
| 19 | #include <linux/tty.h> | ||
| 20 | #include <linux/vt_kern.h> /* For unblank_screen() */ | ||
| 21 | #include <linux/highmem.h> | ||
| 22 | #include <linux/bootmem.h> /* for max_low_pfn */ | ||
| 23 | #include <linux/vmalloc.h> | ||
| 24 | #include <linux/module.h> | ||
| 25 | #include <linux/kprobes.h> | ||
| 26 | #include <linux/uaccess.h> | ||
| 27 | #include <linux/kdebug.h> | ||
| 28 | |||
| 29 | #include <asm/system.h> | ||
| 30 | #include <asm/desc.h> | ||
| 31 | #include <asm/segment.h> | ||
| 32 | |||
| 33 | extern void die(const char *,struct pt_regs *,long); | ||
| 34 | |||
| 35 | static ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain); | ||
| 36 | |||
| 37 | int register_page_fault_notifier(struct notifier_block *nb) | ||
| 38 | { | ||
| 39 | vmalloc_sync_all(); | ||
| 40 | return atomic_notifier_chain_register(¬ify_page_fault_chain, nb); | ||
| 41 | } | ||
| 42 | EXPORT_SYMBOL_GPL(register_page_fault_notifier); | ||
| 43 | |||
| 44 | int unregister_page_fault_notifier(struct notifier_block *nb) | ||
| 45 | { | ||
| 46 | return atomic_notifier_chain_unregister(¬ify_page_fault_chain, nb); | ||
| 47 | } | ||
| 48 | EXPORT_SYMBOL_GPL(unregister_page_fault_notifier); | ||
| 49 | |||
| 50 | static inline int notify_page_fault(struct pt_regs *regs, long err) | ||
| 51 | { | ||
| 52 | struct die_args args = { | ||
| 53 | .regs = regs, | ||
| 54 | .str = "page fault", | ||
| 55 | .err = err, | ||
| 56 | .trapnr = 14, | ||
| 57 | .signr = SIGSEGV | ||
| 58 | }; | ||
| 59 | return atomic_notifier_call_chain(¬ify_page_fault_chain, | ||
| 60 | DIE_PAGE_FAULT, &args); | ||
| 61 | } | ||
| 62 | |||
| 63 | /* | ||
| 64 | * Return EIP plus the CS segment base. The segment limit is also | ||
| 65 | * adjusted, clamped to the kernel/user address space (whichever is | ||
| 66 | * appropriate), and returned in *eip_limit. | ||
| 67 | * | ||
| 68 | * The segment is checked, because it might have been changed by another | ||
| 69 | * task between the original faulting instruction and here. | ||
| 70 | * | ||
| 71 | * If CS is no longer a valid code segment, or if EIP is beyond the | ||
| 72 | * limit, or if it is a kernel address when CS is not a kernel segment, | ||
| 73 | * then the returned value will be greater than *eip_limit. | ||
| 74 | * | ||
| 75 | * This is slow, but is very rarely executed. | ||
| 76 | */ | ||
| 77 | static inline unsigned long get_segment_eip(struct pt_regs *regs, | ||
| 78 | unsigned long *eip_limit) | ||
| 79 | { | ||
| 80 | unsigned long eip = regs->eip; | ||
| 81 | unsigned seg = regs->xcs & 0xffff; | ||
| 82 | u32 seg_ar, seg_limit, base, *desc; | ||
| 83 | |||
| 84 | /* Unlikely, but must come before segment checks. */ | ||
| 85 | if (unlikely(regs->eflags & VM_MASK)) { | ||
| 86 | base = seg << 4; | ||
| 87 | *eip_limit = base + 0xffff; | ||
| 88 | return base + (eip & 0xffff); | ||
| 89 | } | ||
| 90 | |||
| 91 | /* The standard kernel/user address space limit. */ | ||
| 92 | *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg; | ||
| 93 | |||
| 94 | /* By far the most common cases. */ | ||
| 95 | if (likely(SEGMENT_IS_FLAT_CODE(seg))) | ||
| 96 | return eip; | ||
| 97 | |||
| 98 | /* Check the segment exists, is within the current LDT/GDT size, | ||
| 99 | that kernel/user (ring 0..3) has the appropriate privilege, | ||
| 100 | that it's a code segment, and get the limit. */ | ||
| 101 | __asm__ ("larl %3,%0; lsll %3,%1" | ||
| 102 | : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg)); | ||
| 103 | if ((~seg_ar & 0x9800) || eip > seg_limit) { | ||
| 104 | *eip_limit = 0; | ||
| 105 | return 1; /* So that returned eip > *eip_limit. */ | ||
| 106 | } | ||
| 107 | |||
| 108 | /* Get the GDT/LDT descriptor base. | ||
| 109 | When you look for races in this code remember that | ||
| 110 | LDT and other horrors are only used in user space. */ | ||
| 111 | if (seg & (1<<2)) { | ||
| 112 | /* Must lock the LDT while reading it. */ | ||
| 113 | down(¤t->mm->context.sem); | ||
| 114 | desc = current->mm->context.ldt; | ||
| 115 | desc = (void *)desc + (seg & ~7); | ||
| 116 | } else { | ||
| 117 | /* Must disable preemption while reading the GDT. */ | ||
| 118 | desc = (u32 *)get_cpu_gdt_table(get_cpu()); | ||
| 119 | desc = (void *)desc + (seg & ~7); | ||
| 120 | } | ||
| 121 | |||
| 122 | /* Decode the code segment base from the descriptor */ | ||
| 123 | base = get_desc_base((unsigned long *)desc); | ||
| 124 | |||
| 125 | if (seg & (1<<2)) { | ||
| 126 | up(¤t->mm->context.sem); | ||
| 127 | } else | ||
| 128 | put_cpu(); | ||
| 129 | |||
| 130 | /* Adjust EIP and segment limit, and clamp at the kernel limit. | ||
| 131 | It's legitimate for segments to wrap at 0xffffffff. */ | ||
| 132 | seg_limit += base; | ||
| 133 | if (seg_limit < *eip_limit && seg_limit >= base) | ||
| 134 | *eip_limit = seg_limit; | ||
| 135 | return eip + base; | ||
| 136 | } | ||
| 137 | |||
| 138 | /* | ||
| 139 | * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. | ||
| 140 | * Check that here and ignore it. | ||
| 141 | */ | ||
| 142 | static int __is_prefetch(struct pt_regs *regs, unsigned long addr) | ||
| 143 | { | ||
| 144 | unsigned long limit; | ||
| 145 | unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit); | ||
| 146 | int scan_more = 1; | ||
| 147 | int prefetch = 0; | ||
| 148 | int i; | ||
| 149 | |||
| 150 | for (i = 0; scan_more && i < 15; i++) { | ||
| 151 | unsigned char opcode; | ||
| 152 | unsigned char instr_hi; | ||
| 153 | unsigned char instr_lo; | ||
| 154 | |||
| 155 | if (instr > (unsigned char *)limit) | ||
| 156 | break; | ||
| 157 | if (probe_kernel_address(instr, opcode)) | ||
| 158 | break; | ||
| 159 | |||
| 160 | instr_hi = opcode & 0xf0; | ||
| 161 | instr_lo = opcode & 0x0f; | ||
| 162 | instr++; | ||
| 163 | |||
| 164 | switch (instr_hi) { | ||
| 165 | case 0x20: | ||
| 166 | case 0x30: | ||
| 167 | /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */ | ||
| 168 | scan_more = ((instr_lo & 7) == 0x6); | ||
| 169 | break; | ||
| 170 | |||
| 171 | case 0x60: | ||
| 172 | /* 0x64 thru 0x67 are valid prefixes in all modes. */ | ||
| 173 | scan_more = (instr_lo & 0xC) == 0x4; | ||
| 174 | break; | ||
| 175 | case 0xF0: | ||
| 176 | /* 0xF0, 0xF2, and 0xF3 are valid prefixes */ | ||
| 177 | scan_more = !instr_lo || (instr_lo>>1) == 1; | ||
| 178 | break; | ||
| 179 | case 0x00: | ||
| 180 | /* Prefetch instruction is 0x0F0D or 0x0F18 */ | ||
| 181 | scan_more = 0; | ||
| 182 | if (instr > (unsigned char *)limit) | ||
| 183 | break; | ||
| 184 | if (probe_kernel_address(instr, opcode)) | ||
| 185 | break; | ||
| 186 | prefetch = (instr_lo == 0xF) && | ||
| 187 | (opcode == 0x0D || opcode == 0x18); | ||
| 188 | break; | ||
| 189 | default: | ||
| 190 | scan_more = 0; | ||
| 191 | break; | ||
| 192 | } | ||
| 193 | } | ||
| 194 | return prefetch; | ||
| 195 | } | ||
| 196 | |||
| 197 | static inline int is_prefetch(struct pt_regs *regs, unsigned long addr, | ||
| 198 | unsigned long error_code) | ||
| 199 | { | ||
| 200 | if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD && | ||
| 201 | boot_cpu_data.x86 >= 6)) { | ||
| 202 | /* Catch an obscure case of prefetch inside an NX page. */ | ||
| 203 | if (nx_enabled && (error_code & 16)) | ||
| 204 | return 0; | ||
| 205 | return __is_prefetch(regs, addr); | ||
| 206 | } | ||
| 207 | return 0; | ||
| 208 | } | ||
| 209 | |||
| 210 | static noinline void force_sig_info_fault(int si_signo, int si_code, | ||
| 211 | unsigned long address, struct task_struct *tsk) | ||
| 212 | { | ||
| 213 | siginfo_t info; | ||
| 214 | |||
| 215 | info.si_signo = si_signo; | ||
| 216 | info.si_errno = 0; | ||
| 217 | info.si_code = si_code; | ||
| 218 | info.si_addr = (void __user *)address; | ||
| 219 | force_sig_info(si_signo, &info, tsk); | ||
| 220 | } | ||
| 221 | |||
| 222 | fastcall void do_invalid_op(struct pt_regs *, unsigned long); | ||
| 223 | |||
| 224 | static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) | ||
| 225 | { | ||
| 226 | unsigned index = pgd_index(address); | ||
| 227 | pgd_t *pgd_k; | ||
| 228 | pud_t *pud, *pud_k; | ||
| 229 | pmd_t *pmd, *pmd_k; | ||
| 230 | |||
| 231 | pgd += index; | ||
| 232 | pgd_k = init_mm.pgd + index; | ||
| 233 | |||
| 234 | if (!pgd_present(*pgd_k)) | ||
| 235 | return NULL; | ||
| 236 | |||
| 237 | /* | ||
| 238 | * set_pgd(pgd, *pgd_k); here would be useless on PAE | ||
| 239 | * and redundant with the set_pmd() on non-PAE. As would | ||
| 240 | * set_pud. | ||
| 241 | */ | ||
| 242 | |||
| 243 | pud = pud_offset(pgd, address); | ||
| 244 | pud_k = pud_offset(pgd_k, address); | ||
| 245 | if (!pud_present(*pud_k)) | ||
| 246 | return NULL; | ||
| 247 | |||
| 248 | pmd = pmd_offset(pud, address); | ||
| 249 | pmd_k = pmd_offset(pud_k, address); | ||
| 250 | if (!pmd_present(*pmd_k)) | ||
| 251 | return NULL; | ||
| 252 | if (!pmd_present(*pmd)) { | ||
| 253 | set_pmd(pmd, *pmd_k); | ||
| 254 | arch_flush_lazy_mmu_mode(); | ||
| 255 | } else | ||
| 256 | BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); | ||
| 257 | return pmd_k; | ||
| 258 | } | ||
| 259 | |||
| 260 | /* | ||
| 261 | * Handle a fault on the vmalloc or module mapping area | ||
| 262 | * | ||
| 263 | * This assumes no large pages in there. | ||
| 264 | */ | ||
| 265 | static inline int vmalloc_fault(unsigned long address) | ||
| 266 | { | ||
| 267 | unsigned long pgd_paddr; | ||
| 268 | pmd_t *pmd_k; | ||
| 269 | pte_t *pte_k; | ||
| 270 | /* | ||
| 271 | * Synchronize this task's top level page-table | ||
| 272 | * with the 'reference' page table. | ||
| 273 | * | ||
| 274 | * Do _not_ use "current" here. We might be inside | ||
| 275 | * an interrupt in the middle of a task switch.. | ||
| 276 | */ | ||
| 277 | pgd_paddr = read_cr3(); | ||
| 278 | pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); | ||
| 279 | if (!pmd_k) | ||
| 280 | return -1; | ||
| 281 | pte_k = pte_offset_kernel(pmd_k, address); | ||
| 282 | if (!pte_present(*pte_k)) | ||
| 283 | return -1; | ||
| 284 | return 0; | ||
| 285 | } | ||
| 286 | |||
| 287 | int show_unhandled_signals = 1; | ||
| 288 | |||
| 289 | /* | ||
| 290 | * This routine handles page faults. It determines the address, | ||
| 291 | * and the problem, and then passes it off to one of the appropriate | ||
| 292 | * routines. | ||
| 293 | * | ||
| 294 | * error_code: | ||
| 295 | * bit 0 == 0 means no page found, 1 means protection fault | ||
| 296 | * bit 1 == 0 means read, 1 means write | ||
| 297 | * bit 2 == 0 means kernel, 1 means user-mode | ||
| 298 | * bit 3 == 1 means use of reserved bit detected | ||
| 299 | * bit 4 == 1 means fault was an instruction fetch | ||
| 300 | */ | ||
| 301 | fastcall void __kprobes do_page_fault(struct pt_regs *regs, | ||
| 302 | unsigned long error_code) | ||
| 303 | { | ||
| 304 | struct task_struct *tsk; | ||
| 305 | struct mm_struct *mm; | ||
| 306 | struct vm_area_struct * vma; | ||
| 307 | unsigned long address; | ||
| 308 | int write, si_code; | ||
| 309 | int fault; | ||
| 310 | |||
| 311 | /* get the address */ | ||
| 312 | address = read_cr2(); | ||
| 313 | |||
| 314 | tsk = current; | ||
| 315 | |||
| 316 | si_code = SEGV_MAPERR; | ||
| 317 | |||
| 318 | /* | ||
| 319 | * We fault-in kernel-space virtual memory on-demand. The | ||
| 320 | * 'reference' page table is init_mm.pgd. | ||
| 321 | * | ||
| 322 | * NOTE! We MUST NOT take any locks for this case. We may | ||
| 323 | * be in an interrupt or a critical region, and should | ||
| 324 | * only copy the information from the master page table, | ||
| 325 | * nothing more. | ||
| 326 | * | ||
| 327 | * This verifies that the fault happens in kernel space | ||
| 328 | * (error_code & 4) == 0, and that the fault was not a | ||
| 329 | * protection error (error_code & 9) == 0. | ||
| 330 | */ | ||
| 331 | if (unlikely(address >= TASK_SIZE)) { | ||
| 332 | if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) | ||
| 333 | return; | ||
| 334 | if (notify_page_fault(regs, error_code) == NOTIFY_STOP) | ||
| 335 | return; | ||
| 336 | /* | ||
| 337 | * Don't take the mm semaphore here. If we fixup a prefetch | ||
| 338 | * fault we could otherwise deadlock. | ||
| 339 | */ | ||
| 340 | goto bad_area_nosemaphore; | ||
| 341 | } | ||
| 342 | |||
| 343 | if (notify_page_fault(regs, error_code) == NOTIFY_STOP) | ||
| 344 | return; | ||
| 345 | |||
| 346 | /* It's safe to allow irq's after cr2 has been saved and the vmalloc | ||
| 347 | fault has been handled. */ | ||
| 348 | if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) | ||
| 349 | local_irq_enable(); | ||
| 350 | |||
| 351 | mm = tsk->mm; | ||
| 352 | |||
| 353 | /* | ||
| 354 | * If we're in an interrupt, have no user context or are running in an | ||
| 355 | * atomic region then we must not take the fault.. | ||
| 356 | */ | ||
| 357 | if (in_atomic() || !mm) | ||
| 358 | goto bad_area_nosemaphore; | ||
| 359 | |||
| 360 | /* When running in the kernel we expect faults to occur only to | ||
| 361 | * addresses in user space. All other faults represent errors in the | ||
| 362 | * kernel and should generate an OOPS. Unfortunately, in the case of an | ||
| 363 | * erroneous fault occurring in a code path which already holds mmap_sem | ||
| 364 | * we will deadlock attempting to validate the fault against the | ||
| 365 | * address space. Luckily the kernel only validly references user | ||
| 366 | * space from well defined areas of code, which are listed in the | ||
| 367 | * exceptions table. | ||
| 368 | * | ||
| 369 | * As the vast majority of faults will be valid we will only perform | ||
| 370 | * the source reference check when there is a possibility of a deadlock. | ||
| 371 | * Attempt to lock the address space, if we cannot we then validate the | ||
| 372 | * source. If this is invalid we can skip the address space check, | ||
| 373 | * thus avoiding the deadlock. | ||
| 374 | */ | ||
| 375 | if (!down_read_trylock(&mm->mmap_sem)) { | ||
| 376 | if ((error_code & 4) == 0 && | ||
| 377 | !search_exception_tables(regs->eip)) | ||
| 378 | goto bad_area_nosemaphore; | ||
| 379 | down_read(&mm->mmap_sem); | ||
| 380 | } | ||
| 381 | |||
| 382 | vma = find_vma(mm, address); | ||
| 383 | if (!vma) | ||
| 384 | goto bad_area; | ||
| 385 | if (vma->vm_start <= address) | ||
| 386 | goto good_area; | ||
| 387 | if (!(vma->vm_flags & VM_GROWSDOWN)) | ||
| 388 | goto bad_area; | ||
| 389 | if (error_code & 4) { | ||
| 390 | /* | ||
| 391 | * Accessing the stack below %esp is always a bug. | ||
| 392 | * The large cushion allows instructions like enter | ||
| 393 | * and pusha to work. ("enter $65535,$31" pushes | ||
| 394 | * 32 pointers and then decrements %esp by 65535.) | ||
| 395 | */ | ||
| 396 | if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp) | ||
| 397 | goto bad_area; | ||
| 398 | } | ||
| 399 | if (expand_stack(vma, address)) | ||
| 400 | goto bad_area; | ||
| 401 | /* | ||
| 402 | * Ok, we have a good vm_area for this memory access, so | ||
| 403 | * we can handle it.. | ||
| 404 | */ | ||
| 405 | good_area: | ||
| 406 | si_code = SEGV_ACCERR; | ||
| 407 | write = 0; | ||
| 408 | switch (error_code & 3) { | ||
| 409 | default: /* 3: write, present */ | ||
| 410 | /* fall through */ | ||
| 411 | case 2: /* write, not present */ | ||
| 412 | if (!(vma->vm_flags & VM_WRITE)) | ||
| 413 | goto bad_area; | ||
| 414 | write++; | ||
| 415 | break; | ||
| 416 | case 1: /* read, present */ | ||
| 417 | goto bad_area; | ||
| 418 | case 0: /* read, not present */ | ||
| 419 | if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) | ||
| 420 | goto bad_area; | ||
| 421 | } | ||
| 422 | |||
| 423 | survive: | ||
| 424 | /* | ||
| 425 | * If for any reason at all we couldn't handle the fault, | ||
| 426 | * make sure we exit gracefully rather than endlessly redo | ||
| 427 | * the fault. | ||
| 428 | */ | ||
| 429 | fault = handle_mm_fault(mm, vma, address, write); | ||
| 430 | if (unlikely(fault & VM_FAULT_ERROR)) { | ||
| 431 | if (fault & VM_FAULT_OOM) | ||
| 432 | goto out_of_memory; | ||
| 433 | else if (fault & VM_FAULT_SIGBUS) | ||
| 434 | goto do_sigbus; | ||
| 435 | BUG(); | ||
| 436 | } | ||
| 437 | if (fault & VM_FAULT_MAJOR) | ||
| 438 | tsk->maj_flt++; | ||
| 439 | else | ||
| 440 | tsk->min_flt++; | ||
| 441 | |||
| 442 | /* | ||
| 443 | * Did it hit the DOS screen memory VA from vm86 mode? | ||
| 444 | */ | ||
| 445 | if (regs->eflags & VM_MASK) { | ||
| 446 | unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; | ||
| 447 | if (bit < 32) | ||
| 448 | tsk->thread.screen_bitmap |= 1 << bit; | ||
| 449 | } | ||
| 450 | up_read(&mm->mmap_sem); | ||
| 451 | return; | ||
| 452 | |||
| 453 | /* | ||
| 454 | * Something tried to access memory that isn't in our memory map.. | ||
| 455 | * Fix it, but check if it's kernel or user first.. | ||
| 456 | */ | ||
| 457 | bad_area: | ||
| 458 | up_read(&mm->mmap_sem); | ||
| 459 | |||
| 460 | bad_area_nosemaphore: | ||
| 461 | /* User mode accesses just cause a SIGSEGV */ | ||
| 462 | if (error_code & 4) { | ||
| 463 | /* | ||
| 464 | * It's possible to have interrupts off here. | ||
| 465 | */ | ||
| 466 | local_irq_enable(); | ||
| 467 | |||
| 468 | /* | ||
| 469 | * Valid to do another page fault here because this one came | ||
| 470 | * from user space. | ||
| 471 | */ | ||
| 472 | if (is_prefetch(regs, address, error_code)) | ||
| 473 | return; | ||
| 474 | |||
| 475 | if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && | ||
| 476 | printk_ratelimit()) { | ||
| 477 | printk("%s%s[%d]: segfault at %08lx eip %08lx " | ||
| 478 | "esp %08lx error %lx\n", | ||
| 479 | tsk->pid > 1 ? KERN_INFO : KERN_EMERG, | ||
| 480 | tsk->comm, tsk->pid, address, regs->eip, | ||
| 481 | regs->esp, error_code); | ||
| 482 | } | ||
| 483 | tsk->thread.cr2 = address; | ||
| 484 | /* Kernel addresses are always protection faults */ | ||
| 485 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); | ||
| 486 | tsk->thread.trap_no = 14; | ||
| 487 | force_sig_info_fault(SIGSEGV, si_code, address, tsk); | ||
| 488 | return; | ||
| 489 | } | ||
| 490 | |||
| 491 | #ifdef CONFIG_X86_F00F_BUG | ||
| 492 | /* | ||
| 493 | * Pentium F0 0F C7 C8 bug workaround. | ||
| 494 | */ | ||
| 495 | if (boot_cpu_data.f00f_bug) { | ||
| 496 | unsigned long nr; | ||
| 497 | |||
| 498 | nr = (address - idt_descr.address) >> 3; | ||
| 499 | |||
| 500 | if (nr == 6) { | ||
| 501 | do_invalid_op(regs, 0); | ||
| 502 | return; | ||
| 503 | } | ||
| 504 | } | ||
| 505 | #endif | ||
| 506 | |||
| 507 | no_context: | ||
| 508 | /* Are we prepared to handle this kernel fault? */ | ||
| 509 | if (fixup_exception(regs)) | ||
| 510 | return; | ||
| 511 | |||
| 512 | /* | ||
| 513 | * Valid to do another page fault here, because if this fault | ||
| 514 | * had been triggered by is_prefetch fixup_exception would have | ||
| 515 | * handled it. | ||
| 516 | */ | ||
| 517 | if (is_prefetch(regs, address, error_code)) | ||
| 518 | return; | ||
| 519 | |||
| 520 | /* | ||
| 521 | * Oops. The kernel tried to access some bad page. We'll have to | ||
| 522 | * terminate things with extreme prejudice. | ||
| 523 | */ | ||
| 524 | |||
| 525 | bust_spinlocks(1); | ||
| 526 | |||
| 527 | if (oops_may_print()) { | ||
| 528 | __typeof__(pte_val(__pte(0))) page; | ||
| 529 | |||
| 530 | #ifdef CONFIG_X86_PAE | ||
| 531 | if (error_code & 16) { | ||
| 532 | pte_t *pte = lookup_address(address); | ||
| 533 | |||
| 534 | if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) | ||
| 535 | printk(KERN_CRIT "kernel tried to execute " | ||
| 536 | "NX-protected page - exploit attempt? " | ||
| 537 | "(uid: %d)\n", current->uid); | ||
| 538 | } | ||
| 539 | #endif | ||
| 540 | if (address < PAGE_SIZE) | ||
| 541 | printk(KERN_ALERT "BUG: unable to handle kernel NULL " | ||
| 542 | "pointer dereference"); | ||
| 543 | else | ||
| 544 | printk(KERN_ALERT "BUG: unable to handle kernel paging" | ||
| 545 | " request"); | ||
| 546 | printk(" at virtual address %08lx\n",address); | ||
| 547 | printk(KERN_ALERT " printing eip:\n"); | ||
| 548 | printk("%08lx\n", regs->eip); | ||
| 549 | |||
| 550 | page = read_cr3(); | ||
| 551 | page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; | ||
| 552 | #ifdef CONFIG_X86_PAE | ||
| 553 | printk(KERN_ALERT "*pdpt = %016Lx\n", page); | ||
| 554 | if ((page >> PAGE_SHIFT) < max_low_pfn | ||
| 555 | && page & _PAGE_PRESENT) { | ||
| 556 | page &= PAGE_MASK; | ||
| 557 | page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) | ||
| 558 | & (PTRS_PER_PMD - 1)]; | ||
| 559 | printk(KERN_ALERT "*pde = %016Lx\n", page); | ||
| 560 | page &= ~_PAGE_NX; | ||
| 561 | } | ||
| 562 | #else | ||
| 563 | printk(KERN_ALERT "*pde = %08lx\n", page); | ||
| 564 | #endif | ||
| 565 | |||
| 566 | /* | ||
| 567 | * We must not directly access the pte in the highpte | ||
| 568 | * case if the page table is located in highmem. | ||
| 569 | * And let's rather not kmap-atomic the pte, just in case | ||
| 570 | * it's allocated already. | ||
| 571 | */ | ||
| 572 | if ((page >> PAGE_SHIFT) < max_low_pfn | ||
| 573 | && (page & _PAGE_PRESENT)) { | ||
| 574 | page &= PAGE_MASK; | ||
| 575 | page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) | ||
| 576 | & (PTRS_PER_PTE - 1)]; | ||
| 577 | printk(KERN_ALERT "*pte = %0*Lx\n", sizeof(page)*2, (u64)page); | ||
| 578 | } | ||
| 579 | } | ||
| 580 | |||
| 581 | tsk->thread.cr2 = address; | ||
| 582 | tsk->thread.trap_no = 14; | ||
| 583 | tsk->thread.error_code = error_code; | ||
| 584 | die("Oops", regs, error_code); | ||
| 585 | bust_spinlocks(0); | ||
| 586 | do_exit(SIGKILL); | ||
| 587 | |||
| 588 | /* | ||
| 589 | * We ran out of memory, or some other thing happened to us that made | ||
| 590 | * us unable to handle the page fault gracefully. | ||
| 591 | */ | ||
| 592 | out_of_memory: | ||
| 593 | up_read(&mm->mmap_sem); | ||
| 594 | if (is_init(tsk)) { | ||
| 595 | yield(); | ||
| 596 | down_read(&mm->mmap_sem); | ||
| 597 | goto survive; | ||
| 598 | } | ||
| 599 | printk("VM: killing process %s\n", tsk->comm); | ||
| 600 | if (error_code & 4) | ||
| 601 | do_exit(SIGKILL); | ||
| 602 | goto no_context; | ||
| 603 | |||
| 604 | do_sigbus: | ||
| 605 | up_read(&mm->mmap_sem); | ||
| 606 | |||
| 607 | /* Kernel mode? Handle exceptions or die */ | ||
| 608 | if (!(error_code & 4)) | ||
| 609 | goto no_context; | ||
| 610 | |||
| 611 | /* User space => ok to do another page fault */ | ||
| 612 | if (is_prefetch(regs, address, error_code)) | ||
| 613 | return; | ||
| 614 | |||
| 615 | tsk->thread.cr2 = address; | ||
| 616 | tsk->thread.error_code = error_code; | ||
| 617 | tsk->thread.trap_no = 14; | ||
| 618 | force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); | ||
| 619 | } | ||
| 620 | |||
| 621 | void vmalloc_sync_all(void) | ||
| 622 | { | ||
| 623 | /* | ||
| 624 | * Note that races in the updates of insync and start aren't | ||
| 625 | * problematic: insync can only get set bits added, and updates to | ||
| 626 | * start are only improving performance (without affecting correctness | ||
| 627 | * if undone). | ||
| 628 | */ | ||
| 629 | static DECLARE_BITMAP(insync, PTRS_PER_PGD); | ||
| 630 | static unsigned long start = TASK_SIZE; | ||
| 631 | unsigned long address; | ||
| 632 | |||
| 633 | if (SHARED_KERNEL_PMD) | ||
| 634 | return; | ||
| 635 | |||
| 636 | BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); | ||
| 637 | for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { | ||
| 638 | if (!test_bit(pgd_index(address), insync)) { | ||
| 639 | unsigned long flags; | ||
| 640 | struct page *page; | ||
| 641 | |||
| 642 | spin_lock_irqsave(&pgd_lock, flags); | ||
| 643 | for (page = pgd_list; page; page = | ||
| 644 | (struct page *)page->index) | ||
| 645 | if (!vmalloc_sync_one(page_address(page), | ||
| 646 | address)) { | ||
| 647 | BUG_ON(page != pgd_list); | ||
| 648 | break; | ||
| 649 | } | ||
| 650 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
| 651 | if (!page) | ||
| 652 | set_bit(pgd_index(address), insync); | ||
| 653 | } | ||
| 654 | if (address == start && test_bit(pgd_index(address), insync)) | ||
| 655 | start = address + PGDIR_SIZE; | ||
| 656 | } | ||
| 657 | } | ||
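A minimal sketch that restates the error_code bit layout documented above do_page_fault(); the helper name example_decode_error_code is hypothetical, not part of this commit:

#include <linux/kernel.h>

static inline void example_decode_error_code(unsigned long error_code)
{
	int prot  = !!(error_code & 1);		/* 0: page not present, 1: protection fault */
	int write = !!(error_code & 2);		/* 0: read access,      1: write access */
	int user  = !!(error_code & 4);		/* 0: kernel mode,      1: user mode */
	int rsvd  = !!(error_code & 8);		/* reserved bit was set in a paging entry */
	int fetch = !!(error_code & 16);	/* fault was an instruction fetch (NX) */

	printk(KERN_DEBUG "fault: prot=%d write=%d user=%d rsvd=%d fetch=%d\n",
	       prot, write, user, rsvd, fetch);
}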
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
new file mode 100644
index 000000000000..1c3bf95f7356
--- /dev/null
+++ b/arch/x86/mm/highmem_32.c
@@ -0,0 +1,113 @@
| 1 | #include <linux/highmem.h> | ||
| 2 | #include <linux/module.h> | ||
| 3 | |||
| 4 | void *kmap(struct page *page) | ||
| 5 | { | ||
| 6 | might_sleep(); | ||
| 7 | if (!PageHighMem(page)) | ||
| 8 | return page_address(page); | ||
| 9 | return kmap_high(page); | ||
| 10 | } | ||
| 11 | |||
| 12 | void kunmap(struct page *page) | ||
| 13 | { | ||
| 14 | if (in_interrupt()) | ||
| 15 | BUG(); | ||
| 16 | if (!PageHighMem(page)) | ||
| 17 | return; | ||
| 18 | kunmap_high(page); | ||
| 19 | } | ||
| 20 | |||
| 21 | /* | ||
| 22 | * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because | ||
| 23 | * no global lock is needed and because the kmap code must perform a global TLB | ||
| 24 | * invalidation when the kmap pool wraps. | ||
| 25 | * | ||
| 26 | * However, when holding an atomic kmap it is not legal to sleep, so atomic | ||
| 27 | * kmaps are appropriate for short, tight code paths only. | ||
| 28 | */ | ||
| 29 | void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) | ||
| 30 | { | ||
| 31 | enum fixed_addresses idx; | ||
| 32 | unsigned long vaddr; | ||
| 33 | |||
| 34 | /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ | ||
| 35 | pagefault_disable(); | ||
| 36 | |||
| 37 | if (!PageHighMem(page)) | ||
| 38 | return page_address(page); | ||
| 39 | |||
| 40 | idx = type + KM_TYPE_NR*smp_processor_id(); | ||
| 41 | vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); | ||
| 42 | BUG_ON(!pte_none(*(kmap_pte-idx))); | ||
| 43 | set_pte(kmap_pte-idx, mk_pte(page, prot)); | ||
| 44 | arch_flush_lazy_mmu_mode(); | ||
| 45 | |||
| 46 | return (void *)vaddr; | ||
| 47 | } | ||
| 48 | |||
| 49 | void *kmap_atomic(struct page *page, enum km_type type) | ||
| 50 | { | ||
| 51 | return kmap_atomic_prot(page, type, kmap_prot); | ||
| 52 | } | ||
| 53 | |||
| 54 | void kunmap_atomic(void *kvaddr, enum km_type type) | ||
| 55 | { | ||
| 56 | unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; | ||
| 57 | enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); | ||
| 58 | |||
| 59 | /* | ||
| 60 | * Force other mappings to Oops if they'll try to access this pte | ||
| 61 | * without first remapping it. Keeping stale mappings around is also a | ||
| 62 | * bad idea, in case the page changes cacheability attributes or becomes | ||
| 63 | * a protected page in a hypervisor. | ||
| 64 | */ | ||
| 65 | if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) | ||
| 66 | kpte_clear_flush(kmap_pte-idx, vaddr); | ||
| 67 | else { | ||
| 68 | #ifdef CONFIG_DEBUG_HIGHMEM | ||
| 69 | BUG_ON(vaddr < PAGE_OFFSET); | ||
| 70 | BUG_ON(vaddr >= (unsigned long)high_memory); | ||
| 71 | #endif | ||
| 72 | } | ||
| 73 | |||
| 74 | arch_flush_lazy_mmu_mode(); | ||
| 75 | pagefault_enable(); | ||
| 76 | } | ||
| 77 | |||
| 78 | /* This is the same as kmap_atomic() but can map memory that doesn't | ||
| 79 | * have a struct page associated with it. | ||
| 80 | */ | ||
| 81 | void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) | ||
| 82 | { | ||
| 83 | enum fixed_addresses idx; | ||
| 84 | unsigned long vaddr; | ||
| 85 | |||
| 86 | pagefault_disable(); | ||
| 87 | |||
| 88 | idx = type + KM_TYPE_NR*smp_processor_id(); | ||
| 89 | vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); | ||
| 90 | set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot)); | ||
| 91 | arch_flush_lazy_mmu_mode(); | ||
| 92 | |||
| 93 | return (void*) vaddr; | ||
| 94 | } | ||
| 95 | |||
| 96 | struct page *kmap_atomic_to_page(void *ptr) | ||
| 97 | { | ||
| 98 | unsigned long idx, vaddr = (unsigned long)ptr; | ||
| 99 | pte_t *pte; | ||
| 100 | |||
| 101 | if (vaddr < FIXADDR_START) | ||
| 102 | return virt_to_page(ptr); | ||
| 103 | |||
| 104 | idx = virt_to_fix(vaddr); | ||
| 105 | pte = kmap_pte - (idx - FIX_KMAP_BEGIN); | ||
| 106 | return pte_page(*pte); | ||
| 107 | } | ||
| 108 | |||
| 109 | EXPORT_SYMBOL(kmap); | ||
| 110 | EXPORT_SYMBOL(kunmap); | ||
| 111 | EXPORT_SYMBOL(kmap_atomic); | ||
| 112 | EXPORT_SYMBOL(kunmap_atomic); | ||
| 113 | EXPORT_SYMBOL(kmap_atomic_to_page); | ||
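A minimal usage sketch of the atomic kmap API exported above; the helper name example_zero_page is hypothetical, and KM_USER0 is assumed to come from the kmap_types definitions of this era:

#include <linux/highmem.h>
#include <linux/string.h>

static void example_zero_page(struct page *page)
{
	void *vaddr = kmap_atomic(page, KM_USER0);	/* may not sleep from here on */

	memset(vaddr, 0, PAGE_SIZE);
	kunmap_atomic(vaddr, KM_USER0);			/* releases the per-CPU fixmap slot */
}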
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
new file mode 100644
index 000000000000..6c06d9c0488e
--- /dev/null
+++ b/arch/x86/mm/hugetlbpage.c
@@ -0,0 +1,391 @@
| 1 | /* | ||
| 2 | * IA-32 Huge TLB Page Support for Kernel. | ||
| 3 | * | ||
| 4 | * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/init.h> | ||
| 8 | #include <linux/fs.h> | ||
| 9 | #include <linux/mm.h> | ||
| 10 | #include <linux/hugetlb.h> | ||
| 11 | #include <linux/pagemap.h> | ||
| 12 | #include <linux/slab.h> | ||
| 13 | #include <linux/err.h> | ||
| 14 | #include <linux/sysctl.h> | ||
| 15 | #include <asm/mman.h> | ||
| 16 | #include <asm/tlb.h> | ||
| 17 | #include <asm/tlbflush.h> | ||
| 18 | |||
| 19 | static unsigned long page_table_shareable(struct vm_area_struct *svma, | ||
| 20 | struct vm_area_struct *vma, | ||
| 21 | unsigned long addr, pgoff_t idx) | ||
| 22 | { | ||
| 23 | unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) + | ||
| 24 | svma->vm_start; | ||
| 25 | unsigned long sbase = saddr & PUD_MASK; | ||
| 26 | unsigned long s_end = sbase + PUD_SIZE; | ||
| 27 | |||
| 28 | /* | ||
| 29 | * match the virtual addresses, permission and the alignment of the | ||
| 30 | * page table page. | ||
| 31 | */ | ||
| 32 | if (pmd_index(addr) != pmd_index(saddr) || | ||
| 33 | vma->vm_flags != svma->vm_flags || | ||
| 34 | sbase < svma->vm_start || svma->vm_end < s_end) | ||
| 35 | return 0; | ||
| 36 | |||
| 37 | return saddr; | ||
| 38 | } | ||
| 39 | |||
| 40 | static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) | ||
| 41 | { | ||
| 42 | unsigned long base = addr & PUD_MASK; | ||
| 43 | unsigned long end = base + PUD_SIZE; | ||
| 44 | |||
| 45 | /* | ||
| 46 | * check on proper vm_flags and page table alignment | ||
| 47 | */ | ||
| 48 | if (vma->vm_flags & VM_MAYSHARE && | ||
| 49 | vma->vm_start <= base && end <= vma->vm_end) | ||
| 50 | return 1; | ||
| 51 | return 0; | ||
| 52 | } | ||
| 53 | |||
| 54 | /* | ||
| 55 | * search for a shareable pmd page for hugetlb. | ||
| 56 | */ | ||
| 57 | static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | ||
| 58 | { | ||
| 59 | struct vm_area_struct *vma = find_vma(mm, addr); | ||
| 60 | struct address_space *mapping = vma->vm_file->f_mapping; | ||
| 61 | pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + | ||
| 62 | vma->vm_pgoff; | ||
| 63 | struct prio_tree_iter iter; | ||
| 64 | struct vm_area_struct *svma; | ||
| 65 | unsigned long saddr; | ||
| 66 | pte_t *spte = NULL; | ||
| 67 | |||
| 68 | if (!vma_shareable(vma, addr)) | ||
| 69 | return; | ||
| 70 | |||
| 71 | spin_lock(&mapping->i_mmap_lock); | ||
| 72 | vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { | ||
| 73 | if (svma == vma) | ||
| 74 | continue; | ||
| 75 | |||
| 76 | saddr = page_table_shareable(svma, vma, addr, idx); | ||
| 77 | if (saddr) { | ||
| 78 | spte = huge_pte_offset(svma->vm_mm, saddr); | ||
| 79 | if (spte) { | ||
| 80 | get_page(virt_to_page(spte)); | ||
| 81 | break; | ||
| 82 | } | ||
| 83 | } | ||
| 84 | } | ||
| 85 | |||
| 86 | if (!spte) | ||
| 87 | goto out; | ||
| 88 | |||
| 89 | spin_lock(&mm->page_table_lock); | ||
| 90 | if (pud_none(*pud)) | ||
| 91 | pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK); | ||
| 92 | else | ||
| 93 | put_page(virt_to_page(spte)); | ||
| 94 | spin_unlock(&mm->page_table_lock); | ||
| 95 | out: | ||
| 96 | spin_unlock(&mapping->i_mmap_lock); | ||
| 97 | } | ||
| 98 | |||
| 99 | /* | ||
| 100 | * unmap huge page backed by shared pte. | ||
| 101 | * | ||
| 102 | * Hugetlb pte page is ref counted at the time of mapping. If pte is shared | ||
| 103 | * indicated by page_count > 1, unmap is achieved by clearing pud and | ||
| 104 | * decrementing the ref count. If count == 1, the pte page is not shared. | ||
| 105 | * | ||
| 106 | * called with vma->vm_mm->page_table_lock held. | ||
| 107 | * | ||
| 108 | * returns: 1 successfully unmapped a shared pte page | ||
| 109 | * 0 the underlying pte page is not shared, or it is the last user | ||
| 110 | */ | ||
| 111 | int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | ||
| 112 | { | ||
| 113 | pgd_t *pgd = pgd_offset(mm, *addr); | ||
| 114 | pud_t *pud = pud_offset(pgd, *addr); | ||
| 115 | |||
| 116 | BUG_ON(page_count(virt_to_page(ptep)) == 0); | ||
| 117 | if (page_count(virt_to_page(ptep)) == 1) | ||
| 118 | return 0; | ||
| 119 | |||
| 120 | pud_clear(pud); | ||
| 121 | put_page(virt_to_page(ptep)); | ||
| 122 | *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; | ||
| 123 | return 1; | ||
| 124 | } | ||
| 125 | |||
| 126 | pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) | ||
| 127 | { | ||
| 128 | pgd_t *pgd; | ||
| 129 | pud_t *pud; | ||
| 130 | pte_t *pte = NULL; | ||
| 131 | |||
| 132 | pgd = pgd_offset(mm, addr); | ||
| 133 | pud = pud_alloc(mm, pgd, addr); | ||
| 134 | if (pud) { | ||
| 135 | if (pud_none(*pud)) | ||
| 136 | huge_pmd_share(mm, addr, pud); | ||
| 137 | pte = (pte_t *) pmd_alloc(mm, pud, addr); | ||
| 138 | } | ||
| 139 | BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); | ||
| 140 | |||
| 141 | return pte; | ||
| 142 | } | ||
| 143 | |||
| 144 | pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | ||
| 145 | { | ||
| 146 | pgd_t *pgd; | ||
| 147 | pud_t *pud; | ||
| 148 | pmd_t *pmd = NULL; | ||
| 149 | |||
| 150 | pgd = pgd_offset(mm, addr); | ||
| 151 | if (pgd_present(*pgd)) { | ||
| 152 | pud = pud_offset(pgd, addr); | ||
| 153 | if (pud_present(*pud)) | ||
| 154 | pmd = pmd_offset(pud, addr); | ||
| 155 | } | ||
| 156 | return (pte_t *) pmd; | ||
| 157 | } | ||
| 158 | |||
| 159 | #if 0 /* This is just for testing */ | ||
| 160 | struct page * | ||
| 161 | follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | ||
| 162 | { | ||
| 163 | unsigned long start = address; | ||
| 164 | int length = 1; | ||
| 165 | int nr; | ||
| 166 | struct page *page; | ||
| 167 | struct vm_area_struct *vma; | ||
| 168 | |||
| 169 | vma = find_vma(mm, addr); | ||
| 170 | if (!vma || !is_vm_hugetlb_page(vma)) | ||
| 171 | return ERR_PTR(-EINVAL); | ||
| 172 | |||
| 173 | pte = huge_pte_offset(mm, address); | ||
| 174 | |||
| 175 | /* hugetlb should be locked, and hence, prefaulted */ | ||
| 176 | WARN_ON(!pte || pte_none(*pte)); | ||
| 177 | |||
| 178 | page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; | ||
| 179 | |||
| 180 | WARN_ON(!PageCompound(page)); | ||
| 181 | |||
| 182 | return page; | ||
| 183 | } | ||
| 184 | |||
| 185 | int pmd_huge(pmd_t pmd) | ||
| 186 | { | ||
| 187 | return 0; | ||
| 188 | } | ||
| 189 | |||
| 190 | struct page * | ||
| 191 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
| 192 | pmd_t *pmd, int write) | ||
| 193 | { | ||
| 194 | return NULL; | ||
| 195 | } | ||
| 196 | |||
| 197 | #else | ||
| 198 | |||
| 199 | struct page * | ||
| 200 | follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) | ||
| 201 | { | ||
| 202 | return ERR_PTR(-EINVAL); | ||
| 203 | } | ||
| 204 | |||
| 205 | int pmd_huge(pmd_t pmd) | ||
| 206 | { | ||
| 207 | return !!(pmd_val(pmd) & _PAGE_PSE); | ||
| 208 | } | ||
| 209 | |||
| 210 | struct page * | ||
| 211 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
| 212 | pmd_t *pmd, int write) | ||
| 213 | { | ||
| 214 | struct page *page; | ||
| 215 | |||
| 216 | page = pte_page(*(pte_t *)pmd); | ||
| 217 | if (page) | ||
| 218 | page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); | ||
| 219 | return page; | ||
| 220 | } | ||
| 221 | #endif | ||
| 222 | |||
| 223 | /* x86_64 also uses this file */ | ||
| 224 | |||
| 225 | #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA | ||
| 226 | static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, | ||
| 227 | unsigned long addr, unsigned long len, | ||
| 228 | unsigned long pgoff, unsigned long flags) | ||
| 229 | { | ||
| 230 | struct mm_struct *mm = current->mm; | ||
| 231 | struct vm_area_struct *vma; | ||
| 232 | unsigned long start_addr; | ||
| 233 | |||
| 234 | if (len > mm->cached_hole_size) { | ||
| 235 | start_addr = mm->free_area_cache; | ||
| 236 | } else { | ||
| 237 | start_addr = TASK_UNMAPPED_BASE; | ||
| 238 | mm->cached_hole_size = 0; | ||
| 239 | } | ||
| 240 | |||
| 241 | full_search: | ||
| 242 | addr = ALIGN(start_addr, HPAGE_SIZE); | ||
| 243 | |||
| 244 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
| 245 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
| 246 | if (TASK_SIZE - len < addr) { | ||
| 247 | /* | ||
| 248 | * Start a new search - just in case we missed | ||
| 249 | * some holes. | ||
| 250 | */ | ||
| 251 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
| 252 | start_addr = TASK_UNMAPPED_BASE; | ||
| 253 | mm->cached_hole_size = 0; | ||
| 254 | goto full_search; | ||
| 255 | } | ||
| 256 | return -ENOMEM; | ||
| 257 | } | ||
| 258 | if (!vma || addr + len <= vma->vm_start) { | ||
| 259 | mm->free_area_cache = addr + len; | ||
| 260 | return addr; | ||
| 261 | } | ||
| 262 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
| 263 | mm->cached_hole_size = vma->vm_start - addr; | ||
| 264 | addr = ALIGN(vma->vm_end, HPAGE_SIZE); | ||
| 265 | } | ||
| 266 | } | ||
| 267 | |||
| 268 | static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, | ||
| 269 | unsigned long addr0, unsigned long len, | ||
| 270 | unsigned long pgoff, unsigned long flags) | ||
| 271 | { | ||
| 272 | struct mm_struct *mm = current->mm; | ||
| 273 | struct vm_area_struct *vma, *prev_vma; | ||
| 274 | unsigned long base = mm->mmap_base, addr = addr0; | ||
| 275 | unsigned long largest_hole = mm->cached_hole_size; | ||
| 276 | int first_time = 1; | ||
| 277 | |||
| 278 | /* don't allow allocations above current base */ | ||
| 279 | if (mm->free_area_cache > base) | ||
| 280 | mm->free_area_cache = base; | ||
| 281 | |||
| 282 | if (len <= largest_hole) { | ||
| 283 | largest_hole = 0; | ||
| 284 | mm->free_area_cache = base; | ||
| 285 | } | ||
| 286 | try_again: | ||
| 287 | /* make sure it can fit in the remaining address space */ | ||
| 288 | if (mm->free_area_cache < len) | ||
| 289 | goto fail; | ||
| 290 | |||
| 291 | /* either no address requested or can't fit in requested address hole */ | ||
| 292 | addr = (mm->free_area_cache - len) & HPAGE_MASK; | ||
| 293 | do { | ||
| 294 | /* | ||
| 295 | * Lookup failure means no vma is above this address, | ||
| 296 | * i.e. return with success: | ||
| 297 | */ | ||
| 298 | if (!(vma = find_vma_prev(mm, addr, &prev_vma))) | ||
| 299 | return addr; | ||
| 300 | |||
| 301 | /* | ||
| 302 | * new region fits between prev_vma->vm_end and | ||
| 303 | * vma->vm_start, use it: | ||
| 304 | */ | ||
| 305 | if (addr + len <= vma->vm_start && | ||
| 306 | (!prev_vma || (addr >= prev_vma->vm_end))) { | ||
| 307 | /* remember the address as a hint for next time */ | ||
| 308 | mm->cached_hole_size = largest_hole; | ||
| 309 | return (mm->free_area_cache = addr); | ||
| 310 | } else { | ||
| 311 | /* pull free_area_cache down to the first hole */ | ||
| 312 | if (mm->free_area_cache == vma->vm_end) { | ||
| 313 | mm->free_area_cache = vma->vm_start; | ||
| 314 | mm->cached_hole_size = largest_hole; | ||
| 315 | } | ||
| 316 | } | ||
| 317 | |||
| 318 | /* remember the largest hole we saw so far */ | ||
| 319 | if (addr + largest_hole < vma->vm_start) | ||
| 320 | largest_hole = vma->vm_start - addr; | ||
| 321 | |||
| 322 | /* try just below the current vma->vm_start */ | ||
| 323 | addr = (vma->vm_start - len) & HPAGE_MASK; | ||
| 324 | } while (len <= vma->vm_start); | ||
| 325 | |||
| 326 | fail: | ||
| 327 | /* | ||
| 328 | * if hint left us with no space for the requested | ||
| 329 | * mapping then try again: | ||
| 330 | */ | ||
| 331 | if (first_time) { | ||
| 332 | mm->free_area_cache = base; | ||
| 333 | largest_hole = 0; | ||
| 334 | first_time = 0; | ||
| 335 | goto try_again; | ||
| 336 | } | ||
| 337 | /* | ||
| 338 | * A failed mmap() very likely causes application failure, | ||
| 339 | * so fall back to the bottom-up function here. This scenario | ||
| 340 | * can happen with large stack limits and large mmap() | ||
| 341 | * allocations. | ||
| 342 | */ | ||
| 343 | mm->free_area_cache = TASK_UNMAPPED_BASE; | ||
| 344 | mm->cached_hole_size = ~0UL; | ||
| 345 | addr = hugetlb_get_unmapped_area_bottomup(file, addr0, | ||
| 346 | len, pgoff, flags); | ||
| 347 | |||
| 348 | /* | ||
| 349 | * Restore the topdown base: | ||
| 350 | */ | ||
| 351 | mm->free_area_cache = base; | ||
| 352 | mm->cached_hole_size = ~0UL; | ||
| 353 | |||
| 354 | return addr; | ||
| 355 | } | ||
| 356 | |||
| 357 | unsigned long | ||
| 358 | hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | ||
| 359 | unsigned long len, unsigned long pgoff, unsigned long flags) | ||
| 360 | { | ||
| 361 | struct mm_struct *mm = current->mm; | ||
| 362 | struct vm_area_struct *vma; | ||
| 363 | |||
| 364 | if (len & ~HPAGE_MASK) | ||
| 365 | return -EINVAL; | ||
| 366 | if (len > TASK_SIZE) | ||
| 367 | return -ENOMEM; | ||
| 368 | |||
| 369 | if (flags & MAP_FIXED) { | ||
| 370 | if (prepare_hugepage_range(addr, len)) | ||
| 371 | return -EINVAL; | ||
| 372 | return addr; | ||
| 373 | } | ||
| 374 | |||
| 375 | if (addr) { | ||
| 376 | addr = ALIGN(addr, HPAGE_SIZE); | ||
| 377 | vma = find_vma(mm, addr); | ||
| 378 | if (TASK_SIZE - len >= addr && | ||
| 379 | (!vma || addr + len <= vma->vm_start)) | ||
| 380 | return addr; | ||
| 381 | } | ||
| 382 | if (mm->get_unmapped_area == arch_get_unmapped_area) | ||
| 383 | return hugetlb_get_unmapped_area_bottomup(file, addr, len, | ||
| 384 | pgoff, flags); | ||
| 385 | else | ||
| 386 | return hugetlb_get_unmapped_area_topdown(file, addr, len, | ||
| 387 | pgoff, flags); | ||
| 388 | } | ||
| 389 | |||
| 390 | #endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ | ||
| 391 | |||
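hugetlb_get_unmapped_area() above leans on HPAGE_SIZE/HPAGE_MASK arithmetic throughout: the requested length must be a whole number of huge pages, a hint address is rounded up to a huge-page boundary, and the top-down search rounds candidate addresses down. A small stand-alone sketch of that arithmetic, assuming a hypothetical 4 MB huge page rather than the kernel's configuration-dependent size:

/* Sketch only: 4 MB huge page assumed; values are made up for illustration. */
#include <stdio.h>

#define SK_HPAGE_SHIFT 22
#define SK_HPAGE_SIZE  (1UL << SK_HPAGE_SHIFT)
#define SK_HPAGE_MASK  (~(SK_HPAGE_SIZE - 1))
#define SK_ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long len  = 3 * SK_HPAGE_SIZE + 123;	/* not huge-page sized */
	unsigned long addr = 0x40001000UL;		/* unaligned hint */

	/* same test as "len & ~HPAGE_MASK": any low bits mean a partial page */
	printf("len ok: %d\n", (len & ~SK_HPAGE_MASK) == 0);

	/* hint rounded up, top-down candidate rounded down, as in the allocator */
	printf("hint up:   %#lx\n", SK_ALIGN(addr, SK_HPAGE_SIZE));
	printf("cand down: %#lx\n", (addr - len) & SK_HPAGE_MASK);
	return 0;
}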
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c new file mode 100644 index 000000000000..730a5b177b1f --- /dev/null +++ b/arch/x86/mm/init_32.c | |||
| @@ -0,0 +1,858 @@ | |||
| 1 | /* | ||
| 2 | * linux/arch/i386/mm/init.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 1995 Linus Torvalds | ||
| 5 | * | ||
| 6 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <linux/module.h> | ||
| 10 | #include <linux/signal.h> | ||
| 11 | #include <linux/sched.h> | ||
| 12 | #include <linux/kernel.h> | ||
| 13 | #include <linux/errno.h> | ||
| 14 | #include <linux/string.h> | ||
| 15 | #include <linux/types.h> | ||
| 16 | #include <linux/ptrace.h> | ||
| 17 | #include <linux/mman.h> | ||
| 18 | #include <linux/mm.h> | ||
| 19 | #include <linux/hugetlb.h> | ||
| 20 | #include <linux/swap.h> | ||
| 21 | #include <linux/smp.h> | ||
| 22 | #include <linux/init.h> | ||
| 23 | #include <linux/highmem.h> | ||
| 24 | #include <linux/pagemap.h> | ||
| 25 | #include <linux/pfn.h> | ||
| 26 | #include <linux/poison.h> | ||
| 27 | #include <linux/bootmem.h> | ||
| 28 | #include <linux/slab.h> | ||
| 29 | #include <linux/proc_fs.h> | ||
| 30 | #include <linux/efi.h> | ||
| 31 | #include <linux/memory_hotplug.h> | ||
| 32 | #include <linux/initrd.h> | ||
| 33 | #include <linux/cpumask.h> | ||
| 34 | |||
| 35 | #include <asm/processor.h> | ||
| 36 | #include <asm/system.h> | ||
| 37 | #include <asm/uaccess.h> | ||
| 38 | #include <asm/pgtable.h> | ||
| 39 | #include <asm/dma.h> | ||
| 40 | #include <asm/fixmap.h> | ||
| 41 | #include <asm/e820.h> | ||
| 42 | #include <asm/apic.h> | ||
| 43 | #include <asm/tlb.h> | ||
| 44 | #include <asm/tlbflush.h> | ||
| 45 | #include <asm/sections.h> | ||
| 46 | #include <asm/paravirt.h> | ||
| 47 | |||
| 48 | unsigned int __VMALLOC_RESERVE = 128 << 20; | ||
| 49 | |||
| 50 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | ||
| 51 | unsigned long highstart_pfn, highend_pfn; | ||
| 52 | |||
| 53 | static int noinline do_test_wp_bit(void); | ||
| 54 | |||
| 55 | /* | ||
| 56 | * Creates a middle page table and puts a pointer to it in the | ||
| 57 | * given global directory entry. This only returns the gd entry | ||
| 58 | * in non-PAE compilation mode, since the middle layer is folded. | ||
| 59 | */ | ||
| 60 | static pmd_t * __init one_md_table_init(pgd_t *pgd) | ||
| 61 | { | ||
| 62 | pud_t *pud; | ||
| 63 | pmd_t *pmd_table; | ||
| 64 | |||
| 65 | #ifdef CONFIG_X86_PAE | ||
| 66 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { | ||
| 67 | pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); | ||
| 68 | |||
| 69 | paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); | ||
| 70 | set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | ||
| 71 | pud = pud_offset(pgd, 0); | ||
| 72 | if (pmd_table != pmd_offset(pud, 0)) | ||
| 73 | BUG(); | ||
| 74 | } | ||
| 75 | #endif | ||
| 76 | pud = pud_offset(pgd, 0); | ||
| 77 | pmd_table = pmd_offset(pud, 0); | ||
| 78 | return pmd_table; | ||
| 79 | } | ||
| 80 | |||
| 81 | /* | ||
| 82 | * Create a page table and place a pointer to it in a middle page | ||
| 83 | * directory entry. | ||
| 84 | */ | ||
| 85 | static pte_t * __init one_page_table_init(pmd_t *pmd) | ||
| 86 | { | ||
| 87 | if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { | ||
| 88 | pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); | ||
| 89 | |||
| 90 | paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); | ||
| 91 | set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); | ||
| 92 | BUG_ON(page_table != pte_offset_kernel(pmd, 0)); | ||
| 93 | } | ||
| 94 | |||
| 95 | return pte_offset_kernel(pmd, 0); | ||
| 96 | } | ||
| 97 | |||
| 98 | /* | ||
| 99 | * This function initializes a certain range of kernel virtual memory | ||
| 100 | * with new bootmem page tables, wherever page tables are missing in | ||
| 101 | * the given range. | ||
| 102 | */ | ||
| 103 | |||
| 104 | /* | ||
| 105 | * NOTE: The pagetables are allocated contiguously in physical memory, | ||
| 106 | * so we can cache the place of the first one and move around without | ||
| 107 | * checking the pgd every time. | ||
| 108 | */ | ||
| 109 | static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) | ||
| 110 | { | ||
| 111 | pgd_t *pgd; | ||
| 112 | pmd_t *pmd; | ||
| 113 | int pgd_idx, pmd_idx; | ||
| 114 | unsigned long vaddr; | ||
| 115 | |||
| 116 | vaddr = start; | ||
| 117 | pgd_idx = pgd_index(vaddr); | ||
| 118 | pmd_idx = pmd_index(vaddr); | ||
| 119 | pgd = pgd_base + pgd_idx; | ||
| 120 | |||
| 121 | for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { | ||
| 122 | pmd = one_md_table_init(pgd); | ||
| 123 | pmd = pmd + pmd_index(vaddr); | ||
| 124 | for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { | ||
| 125 | one_page_table_init(pmd); | ||
| 126 | |||
| 127 | vaddr += PMD_SIZE; | ||
| 128 | } | ||
| 129 | pmd_idx = 0; | ||
| 130 | } | ||
| 131 | } | ||
| 132 | |||
| 133 | static inline int is_kernel_text(unsigned long addr) | ||
| 134 | { | ||
| 135 | if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) | ||
| 136 | return 1; | ||
| 137 | return 0; | ||
| 138 | } | ||
| 139 | |||
| 140 | /* | ||
| 141 | * This maps the physical memory to kernel virtual address space, a total | ||
| 142 | * of max_low_pfn pages, by creating page tables starting from address | ||
| 143 | * PAGE_OFFSET. | ||
| 144 | */ | ||
| 145 | static void __init kernel_physical_mapping_init(pgd_t *pgd_base) | ||
| 146 | { | ||
| 147 | unsigned long pfn; | ||
| 148 | pgd_t *pgd; | ||
| 149 | pmd_t *pmd; | ||
| 150 | pte_t *pte; | ||
| 151 | int pgd_idx, pmd_idx, pte_ofs; | ||
| 152 | |||
| 153 | pgd_idx = pgd_index(PAGE_OFFSET); | ||
| 154 | pgd = pgd_base + pgd_idx; | ||
| 155 | pfn = 0; | ||
| 156 | |||
| 157 | for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { | ||
| 158 | pmd = one_md_table_init(pgd); | ||
| 159 | if (pfn >= max_low_pfn) | ||
| 160 | continue; | ||
| 161 | for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { | ||
| 162 | unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; | ||
| 163 | |||
| 164 | /* Map with big pages if possible, otherwise create normal page tables. */ | ||
| 165 | if (cpu_has_pse) { | ||
| 166 | unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; | ||
| 167 | if (is_kernel_text(address) || is_kernel_text(address2)) | ||
| 168 | set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); | ||
| 169 | else | ||
| 170 | set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); | ||
| 171 | |||
| 172 | pfn += PTRS_PER_PTE; | ||
| 173 | } else { | ||
| 174 | pte = one_page_table_init(pmd); | ||
| 175 | |||
| 176 | for (pte_ofs = 0; | ||
| 177 | pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; | ||
| 178 | pte++, pfn++, pte_ofs++, address += PAGE_SIZE) { | ||
| 179 | if (is_kernel_text(address)) | ||
| 180 | set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); | ||
| 181 | else | ||
| 182 | set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); | ||
| 183 | } | ||
| 184 | } | ||
| 185 | } | ||
| 186 | } | ||
| 187 | } | ||
| 188 | |||
| 189 | static inline int page_kills_ppro(unsigned long pagenr) | ||
| 190 | { | ||
| 191 | if (pagenr >= 0x70000 && pagenr <= 0x7003F) | ||
| 192 | return 1; | ||
| 193 | return 0; | ||
| 194 | } | ||
| 195 | |||
| 196 | int page_is_ram(unsigned long pagenr) | ||
| 197 | { | ||
| 198 | int i; | ||
| 199 | unsigned long addr, end; | ||
| 200 | |||
| 201 | if (efi_enabled) { | ||
| 202 | efi_memory_desc_t *md; | ||
| 203 | void *p; | ||
| 204 | |||
| 205 | for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { | ||
| 206 | md = p; | ||
| 207 | if (!is_available_memory(md)) | ||
| 208 | continue; | ||
| 209 | addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; | ||
| 210 | end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; | ||
| 211 | |||
| 212 | if ((pagenr >= addr) && (pagenr < end)) | ||
| 213 | return 1; | ||
| 214 | } | ||
| 215 | return 0; | ||
| 216 | } | ||
| 217 | |||
| 218 | for (i = 0; i < e820.nr_map; i++) { | ||
| 219 | |||
| 220 | if (e820.map[i].type != E820_RAM) /* not usable memory */ | ||
| 221 | continue; | ||
| 222 | /* | ||
| 223 | * !!!FIXME!!! Some BIOSen report areas as RAM that | ||
| 224 | * are not. Notably the 640->1Mb area. We need a sanity | ||
| 225 | * check here. | ||
| 226 | */ | ||
| 227 | addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; | ||
| 228 | end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; | ||
| 229 | if ((pagenr >= addr) && (pagenr < end)) | ||
| 230 | return 1; | ||
| 231 | } | ||
| 232 | return 0; | ||
| 233 | } | ||
| 234 | |||
| 235 | #ifdef CONFIG_HIGHMEM | ||
| 236 | pte_t *kmap_pte; | ||
| 237 | pgprot_t kmap_prot; | ||
| 238 | |||
| 239 | #define kmap_get_fixmap_pte(vaddr) \ | ||
| 240 | pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) | ||
| 241 | |||
| 242 | static void __init kmap_init(void) | ||
| 243 | { | ||
| 244 | unsigned long kmap_vstart; | ||
| 245 | |||
| 246 | /* cache the first kmap pte */ | ||
| 247 | kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); | ||
| 248 | kmap_pte = kmap_get_fixmap_pte(kmap_vstart); | ||
| 249 | |||
| 250 | kmap_prot = PAGE_KERNEL; | ||
| 251 | } | ||
| 252 | |||
| 253 | static void __init permanent_kmaps_init(pgd_t *pgd_base) | ||
| 254 | { | ||
| 255 | pgd_t *pgd; | ||
| 256 | pud_t *pud; | ||
| 257 | pmd_t *pmd; | ||
| 258 | pte_t *pte; | ||
| 259 | unsigned long vaddr; | ||
| 260 | |||
| 261 | vaddr = PKMAP_BASE; | ||
| 262 | page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); | ||
| 263 | |||
| 264 | pgd = swapper_pg_dir + pgd_index(vaddr); | ||
| 265 | pud = pud_offset(pgd, vaddr); | ||
| 266 | pmd = pmd_offset(pud, vaddr); | ||
| 267 | pte = pte_offset_kernel(pmd, vaddr); | ||
| 268 | pkmap_page_table = pte; | ||
| 269 | } | ||
| 270 | |||
| 271 | static void __meminit free_new_highpage(struct page *page) | ||
| 272 | { | ||
| 273 | init_page_count(page); | ||
| 274 | __free_page(page); | ||
| 275 | totalhigh_pages++; | ||
| 276 | } | ||
| 277 | |||
| 278 | void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) | ||
| 279 | { | ||
| 280 | if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { | ||
| 281 | ClearPageReserved(page); | ||
| 282 | free_new_highpage(page); | ||
| 283 | } else | ||
| 284 | SetPageReserved(page); | ||
| 285 | } | ||
| 286 | |||
| 287 | static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn) | ||
| 288 | { | ||
| 289 | free_new_highpage(page); | ||
| 290 | totalram_pages++; | ||
| 291 | #ifdef CONFIG_FLATMEM | ||
| 292 | max_mapnr = max(pfn, max_mapnr); | ||
| 293 | #endif | ||
| 294 | num_physpages++; | ||
| 295 | return 0; | ||
| 296 | } | ||
| 297 | |||
| 298 | /* | ||
| 299 | * Not currently handling the NUMA case. | ||
| 300 | * Assume a single node, and that all memory | ||
| 301 | * added dynamically and onlined here is | ||
| 302 | * in HIGHMEM. | ||
| 303 | */ | ||
| 304 | void __meminit online_page(struct page *page) | ||
| 305 | { | ||
| 306 | ClearPageReserved(page); | ||
| 307 | add_one_highpage_hotplug(page, page_to_pfn(page)); | ||
| 308 | } | ||
| 309 | |||
| 310 | |||
| 311 | #ifdef CONFIG_NUMA | ||
| 312 | extern void set_highmem_pages_init(int); | ||
| 313 | #else | ||
| 314 | static void __init set_highmem_pages_init(int bad_ppro) | ||
| 315 | { | ||
| 316 | int pfn; | ||
| 317 | for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) | ||
| 318 | add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); | ||
| 319 | totalram_pages += totalhigh_pages; | ||
| 320 | } | ||
| 321 | #endif /* CONFIG_NUMA */ | ||
| 322 | |||
| 323 | #else | ||
| 324 | #define kmap_init() do { } while (0) | ||
| 325 | #define permanent_kmaps_init(pgd_base) do { } while (0) | ||
| 326 | #define set_highmem_pages_init(bad_ppro) do { } while (0) | ||
| 327 | #endif /* CONFIG_HIGHMEM */ | ||
| 328 | |||
| 329 | unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; | ||
| 330 | EXPORT_SYMBOL(__PAGE_KERNEL); | ||
| 331 | unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; | ||
| 332 | |||
| 333 | #ifdef CONFIG_NUMA | ||
| 334 | extern void __init remap_numa_kva(void); | ||
| 335 | #else | ||
| 336 | #define remap_numa_kva() do {} while (0) | ||
| 337 | #endif | ||
| 338 | |||
| 339 | void __init native_pagetable_setup_start(pgd_t *base) | ||
| 340 | { | ||
| 341 | #ifdef CONFIG_X86_PAE | ||
| 342 | int i; | ||
| 343 | |||
| 344 | /* | ||
| 345 | * Init entries of the first-level page table to the | ||
| 346 | * zero page, if they haven't already been set up. | ||
| 347 | * | ||
| 348 | * In a normal native boot, we'll be running on a | ||
| 349 | * pagetable rooted in swapper_pg_dir, but not in PAE | ||
| 350 | * mode, so this will end up clobbering the mappings | ||
| 351 | * for the lower 24Mbytes of the address space, | ||
| 352 | * without affecting the kernel address space. | ||
| 353 | */ | ||
| 354 | for (i = 0; i < USER_PTRS_PER_PGD; i++) | ||
| 355 | set_pgd(&base[i], | ||
| 356 | __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); | ||
| 357 | |||
| 358 | /* Make sure kernel address space is empty so that a pagetable | ||
| 359 | will be allocated for it. */ | ||
| 360 | memset(&base[USER_PTRS_PER_PGD], 0, | ||
| 361 | KERNEL_PGD_PTRS * sizeof(pgd_t)); | ||
| 362 | #else | ||
| 363 | paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT); | ||
| 364 | #endif | ||
| 365 | } | ||
| 366 | |||
| 367 | void __init native_pagetable_setup_done(pgd_t *base) | ||
| 368 | { | ||
| 369 | #ifdef CONFIG_X86_PAE | ||
| 370 | /* | ||
| 371 | * Add low memory identity-mappings - SMP needs it when | ||
| 372 | * starting up on an AP from real-mode. In the non-PAE | ||
| 373 | * case we already have these mappings through head.S. | ||
| 374 | * All user-space mappings are explicitly cleared after | ||
| 375 | * SMP startup. | ||
| 376 | */ | ||
| 377 | set_pgd(&base[0], base[USER_PTRS_PER_PGD]); | ||
| 378 | #endif | ||
| 379 | } | ||
| 380 | |||
| 381 | /* | ||
| 382 | * Build a proper pagetable for the kernel mappings. Up until this | ||
| 383 | * point, we've been running on some set of pagetables constructed by | ||
| 384 | * the boot process. | ||
| 385 | * | ||
| 386 | * If we're booting on native hardware, this will be a pagetable | ||
| 387 | * constructed in arch/i386/kernel/head.S, and not running in PAE mode | ||
| 388 | * (even if we'll end up running in PAE). The root of the pagetable | ||
| 389 | * will be swapper_pg_dir. | ||
| 390 | * | ||
| 391 | * If we're booting paravirtualized under a hypervisor, then there are | ||
| 392 | * more options: we may already be running PAE, and the pagetable may | ||
| 393 | * or may not be based in swapper_pg_dir. In any case, | ||
| 394 | * paravirt_pagetable_setup_start() will set up swapper_pg_dir | ||
| 395 | * appropriately for the rest of the initialization to work. | ||
| 396 | * | ||
| 397 | * In general, pagetable_init() assumes that the pagetable may already | ||
| 398 | * be partially populated, and so it avoids stomping on any existing | ||
| 399 | * mappings. | ||
| 400 | */ | ||
| 401 | static void __init pagetable_init (void) | ||
| 402 | { | ||
| 403 | unsigned long vaddr, end; | ||
| 404 | pgd_t *pgd_base = swapper_pg_dir; | ||
| 405 | |||
| 406 | paravirt_pagetable_setup_start(pgd_base); | ||
| 407 | |||
| 408 | /* Enable PSE if available */ | ||
| 409 | if (cpu_has_pse) | ||
| 410 | set_in_cr4(X86_CR4_PSE); | ||
| 411 | |||
| 412 | /* Enable PGE if available */ | ||
| 413 | if (cpu_has_pge) { | ||
| 414 | set_in_cr4(X86_CR4_PGE); | ||
| 415 | __PAGE_KERNEL |= _PAGE_GLOBAL; | ||
| 416 | __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; | ||
| 417 | } | ||
| 418 | |||
| 419 | kernel_physical_mapping_init(pgd_base); | ||
| 420 | remap_numa_kva(); | ||
| 421 | |||
| 422 | /* | ||
| 423 | * Fixed mappings, only the page table structure has to be | ||
| 424 | * created - mappings will be set by set_fixmap(): | ||
| 425 | */ | ||
| 426 | vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; | ||
| 427 | end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; | ||
| 428 | page_table_range_init(vaddr, end, pgd_base); | ||
| 429 | |||
| 430 | permanent_kmaps_init(pgd_base); | ||
| 431 | |||
| 432 | paravirt_pagetable_setup_done(pgd_base); | ||
| 433 | } | ||
| 434 | |||
| 435 | #if defined(CONFIG_HIBERNATION) || defined(CONFIG_ACPI) | ||
| 436 | /* | ||
| 437 | * Swap suspend & friends need this for resume because things like the intel-agp | ||
| 438 | * driver might have split up a kernel 4MB mapping. | ||
| 439 | */ | ||
| 440 | char __nosavedata swsusp_pg_dir[PAGE_SIZE] | ||
| 441 | __attribute__ ((aligned (PAGE_SIZE))); | ||
| 442 | |||
| 443 | static inline void save_pg_dir(void) | ||
| 444 | { | ||
| 445 | memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); | ||
| 446 | } | ||
| 447 | #else | ||
| 448 | static inline void save_pg_dir(void) | ||
| 449 | { | ||
| 450 | } | ||
| 451 | #endif | ||
| 452 | |||
| 453 | void zap_low_mappings (void) | ||
| 454 | { | ||
| 455 | int i; | ||
| 456 | |||
| 457 | save_pg_dir(); | ||
| 458 | |||
| 459 | /* | ||
| 460 | * Zap initial low-memory mappings. | ||
| 461 | * | ||
| 462 | * Note that "pgd_clear()" doesn't do it for | ||
| 463 | * us, because pgd_clear() is a no-op on i386. | ||
| 464 | */ | ||
| 465 | for (i = 0; i < USER_PTRS_PER_PGD; i++) | ||
| 466 | #ifdef CONFIG_X86_PAE | ||
| 467 | set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); | ||
| 468 | #else | ||
| 469 | set_pgd(swapper_pg_dir+i, __pgd(0)); | ||
| 470 | #endif | ||
| 471 | flush_tlb_all(); | ||
| 472 | } | ||
| 473 | |||
| 474 | int nx_enabled = 0; | ||
| 475 | |||
| 476 | #ifdef CONFIG_X86_PAE | ||
| 477 | |||
| 478 | static int disable_nx __initdata = 0; | ||
| 479 | u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; | ||
| 480 | EXPORT_SYMBOL_GPL(__supported_pte_mask); | ||
| 481 | |||
| 482 | /* | ||
| 483 | * noexec = on|off | ||
| 484 | * | ||
| 485 | * Control non executable mappings. | ||
| 486 | * | ||
| 487 | * on Enable | ||
| 488 | * off Disable | ||
| 489 | */ | ||
| 490 | static int __init noexec_setup(char *str) | ||
| 491 | { | ||
| 492 | if (!str || !strcmp(str, "on")) { | ||
| 493 | if (cpu_has_nx) { | ||
| 494 | __supported_pte_mask |= _PAGE_NX; | ||
| 495 | disable_nx = 0; | ||
| 496 | } | ||
| 497 | } else if (!strcmp(str,"off")) { | ||
| 498 | disable_nx = 1; | ||
| 499 | __supported_pte_mask &= ~_PAGE_NX; | ||
| 500 | } else | ||
| 501 | return -EINVAL; | ||
| 502 | |||
| 503 | return 0; | ||
| 504 | } | ||
| 505 | early_param("noexec", noexec_setup); | ||
| 506 | |||
| 507 | static void __init set_nx(void) | ||
| 508 | { | ||
| 509 | unsigned int v[4], l, h; | ||
| 510 | |||
| 511 | if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { | ||
| 512 | cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); | ||
| 513 | if ((v[3] & (1 << 20)) && !disable_nx) { | ||
| 514 | rdmsr(MSR_EFER, l, h); | ||
| 515 | l |= EFER_NX; | ||
| 516 | wrmsr(MSR_EFER, l, h); | ||
| 517 | nx_enabled = 1; | ||
| 518 | __supported_pte_mask |= _PAGE_NX; | ||
| 519 | } | ||
| 520 | } | ||
| 521 | } | ||
| 522 | |||
| 523 | /* | ||
| 524 | * Enables/disables executability of a given kernel page and | ||
| 525 | * returns the previous setting. | ||
| 526 | */ | ||
| 527 | int __init set_kernel_exec(unsigned long vaddr, int enable) | ||
| 528 | { | ||
| 529 | pte_t *pte; | ||
| 530 | int ret = 1; | ||
| 531 | |||
| 532 | if (!nx_enabled) | ||
| 533 | goto out; | ||
| 534 | |||
| 535 | pte = lookup_address(vaddr); | ||
| 536 | BUG_ON(!pte); | ||
| 537 | |||
| 538 | if (!pte_exec_kernel(*pte)) | ||
| 539 | ret = 0; | ||
| 540 | |||
| 541 | if (enable) | ||
| 542 | pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); | ||
| 543 | else | ||
| 544 | pte->pte_high |= 1 << (_PAGE_BIT_NX - 32); | ||
| 545 | pte_update_defer(&init_mm, vaddr, pte); | ||
| 546 | __flush_tlb_all(); | ||
| 547 | out: | ||
| 548 | return ret; | ||
| 549 | } | ||
| 550 | |||
| 551 | #endif | ||
| 552 | |||
| 553 | /* | ||
| 554 | * paging_init() sets up the page tables - note that the first 8MB are | ||
| 555 | * already mapped by head.S. | ||
| 556 | * | ||
| 557 | * This routine also unmaps the page at virtual kernel address 0, so | ||
| 558 | * that we can trap those pesky NULL-reference errors in the kernel. | ||
| 559 | */ | ||
| 560 | void __init paging_init(void) | ||
| 561 | { | ||
| 562 | #ifdef CONFIG_X86_PAE | ||
| 563 | set_nx(); | ||
| 564 | if (nx_enabled) | ||
| 565 | printk("NX (Execute Disable) protection: active\n"); | ||
| 566 | #endif | ||
| 567 | |||
| 568 | pagetable_init(); | ||
| 569 | |||
| 570 | load_cr3(swapper_pg_dir); | ||
| 571 | |||
| 572 | #ifdef CONFIG_X86_PAE | ||
| 573 | /* | ||
| 574 | * We will bail out later - printk doesn't work right now so | ||
| 575 | * the user would just see a hanging kernel. | ||
| 576 | */ | ||
| 577 | if (cpu_has_pae) | ||
| 578 | set_in_cr4(X86_CR4_PAE); | ||
| 579 | #endif | ||
| 580 | __flush_tlb_all(); | ||
| 581 | |||
| 582 | kmap_init(); | ||
| 583 | } | ||
| 584 | |||
| 585 | /* | ||
| 586 | * Test if the WP bit works in supervisor mode. It isn't supported on 386's | ||
| 587 | * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This | ||
| 588 | * used to involve black magic jumps to work around some nasty CPU bugs, | ||
| 589 | * but fortunately the switch to using exceptions got rid of all that. | ||
| 590 | */ | ||
| 591 | |||
| 592 | static void __init test_wp_bit(void) | ||
| 593 | { | ||
| 594 | printk("Checking if this processor honours the WP bit even in supervisor mode... "); | ||
| 595 | |||
| 596 | /* Any page-aligned address will do, the test is non-destructive */ | ||
| 597 | __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); | ||
| 598 | boot_cpu_data.wp_works_ok = do_test_wp_bit(); | ||
| 599 | clear_fixmap(FIX_WP_TEST); | ||
| 600 | |||
| 601 | if (!boot_cpu_data.wp_works_ok) { | ||
| 602 | printk("No.\n"); | ||
| 603 | #ifdef CONFIG_X86_WP_WORKS_OK | ||
| 604 | panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); | ||
| 605 | #endif | ||
| 606 | } else { | ||
| 607 | printk("Ok.\n"); | ||
| 608 | } | ||
| 609 | } | ||
| 610 | |||
| 611 | static struct kcore_list kcore_mem, kcore_vmalloc; | ||
| 612 | |||
| 613 | void __init mem_init(void) | ||
| 614 | { | ||
| 615 | extern int ppro_with_ram_bug(void); | ||
| 616 | int codesize, reservedpages, datasize, initsize; | ||
| 617 | int tmp; | ||
| 618 | int bad_ppro; | ||
| 619 | |||
| 620 | #ifdef CONFIG_FLATMEM | ||
| 621 | BUG_ON(!mem_map); | ||
| 622 | #endif | ||
| 623 | |||
| 624 | bad_ppro = ppro_with_ram_bug(); | ||
| 625 | |||
| 626 | #ifdef CONFIG_HIGHMEM | ||
| 627 | /* check that fixmap and pkmap do not overlap */ | ||
| 628 | if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { | ||
| 629 | printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); | ||
| 630 | printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", | ||
| 631 | PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); | ||
| 632 | BUG(); | ||
| 633 | } | ||
| 634 | #endif | ||
| 635 | |||
| 636 | /* this will put all low memory onto the freelists */ | ||
| 637 | totalram_pages += free_all_bootmem(); | ||
| 638 | |||
| 639 | reservedpages = 0; | ||
| 640 | for (tmp = 0; tmp < max_low_pfn; tmp++) | ||
| 641 | /* | ||
| 642 | * Only count reserved RAM pages | ||
| 643 | */ | ||
| 644 | if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) | ||
| 645 | reservedpages++; | ||
| 646 | |||
| 647 | set_highmem_pages_init(bad_ppro); | ||
| 648 | |||
| 649 | codesize = (unsigned long) &_etext - (unsigned long) &_text; | ||
| 650 | datasize = (unsigned long) &_edata - (unsigned long) &_etext; | ||
| 651 | initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; | ||
| 652 | |||
| 653 | kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); | ||
| 654 | kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, | ||
| 655 | VMALLOC_END-VMALLOC_START); | ||
| 656 | |||
| 657 | printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", | ||
| 658 | (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), | ||
| 659 | num_physpages << (PAGE_SHIFT-10), | ||
| 660 | codesize >> 10, | ||
| 661 | reservedpages << (PAGE_SHIFT-10), | ||
| 662 | datasize >> 10, | ||
| 663 | initsize >> 10, | ||
| 664 | (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) | ||
| 665 | ); | ||
| 666 | |||
| 667 | #if 1 /* double-sanity-check paranoia */ | ||
| 668 | printk("virtual kernel memory layout:\n" | ||
| 669 | " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" | ||
| 670 | #ifdef CONFIG_HIGHMEM | ||
| 671 | " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" | ||
| 672 | #endif | ||
| 673 | " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" | ||
| 674 | " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" | ||
| 675 | " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" | ||
| 676 | " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" | ||
| 677 | " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", | ||
| 678 | FIXADDR_START, FIXADDR_TOP, | ||
| 679 | (FIXADDR_TOP - FIXADDR_START) >> 10, | ||
| 680 | |||
| 681 | #ifdef CONFIG_HIGHMEM | ||
| 682 | PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, | ||
| 683 | (LAST_PKMAP*PAGE_SIZE) >> 10, | ||
| 684 | #endif | ||
| 685 | |||
| 686 | VMALLOC_START, VMALLOC_END, | ||
| 687 | (VMALLOC_END - VMALLOC_START) >> 20, | ||
| 688 | |||
| 689 | (unsigned long)__va(0), (unsigned long)high_memory, | ||
| 690 | ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, | ||
| 691 | |||
| 692 | (unsigned long)&__init_begin, (unsigned long)&__init_end, | ||
| 693 | ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10, | ||
| 694 | |||
| 695 | (unsigned long)&_etext, (unsigned long)&_edata, | ||
| 696 | ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, | ||
| 697 | |||
| 698 | (unsigned long)&_text, (unsigned long)&_etext, | ||
| 699 | ((unsigned long)&_etext - (unsigned long)&_text) >> 10); | ||
| 700 | |||
| 701 | #ifdef CONFIG_HIGHMEM | ||
| 702 | BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START); | ||
| 703 | BUG_ON(VMALLOC_END > PKMAP_BASE); | ||
| 704 | #endif | ||
| 705 | BUG_ON(VMALLOC_START > VMALLOC_END); | ||
| 706 | BUG_ON((unsigned long)high_memory > VMALLOC_START); | ||
| 707 | #endif /* double-sanity-check paranoia */ | ||
| 708 | |||
| 709 | #ifdef CONFIG_X86_PAE | ||
| 710 | if (!cpu_has_pae) | ||
| 711 | panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); | ||
| 712 | #endif | ||
| 713 | if (boot_cpu_data.wp_works_ok < 0) | ||
| 714 | test_wp_bit(); | ||
| 715 | |||
| 716 | /* | ||
| 717 | * Subtle. SMP is doing its boot stuff late (because it has to | ||
| 718 | * fork idle threads) - but it also needs low mappings for the | ||
| 719 | * protected-mode entry to work. We zap these entries only after | ||
| 720 | * the WP-bit has been tested. | ||
| 721 | */ | ||
| 722 | #ifndef CONFIG_SMP | ||
| 723 | zap_low_mappings(); | ||
| 724 | #endif | ||
| 725 | } | ||
| 726 | |||
| 727 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
| 728 | int arch_add_memory(int nid, u64 start, u64 size) | ||
| 729 | { | ||
| 730 | struct pglist_data *pgdata = NODE_DATA(nid); | ||
| 731 | struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM; | ||
| 732 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
| 733 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
| 734 | |||
| 735 | return __add_pages(zone, start_pfn, nr_pages); | ||
| 736 | } | ||
| 737 | |||
| 738 | int remove_memory(u64 start, u64 size) | ||
| 739 | { | ||
| 740 | return -EINVAL; | ||
| 741 | } | ||
| 742 | EXPORT_SYMBOL_GPL(remove_memory); | ||
| 743 | #endif | ||
| 744 | |||
| 745 | struct kmem_cache *pmd_cache; | ||
| 746 | |||
| 747 | void __init pgtable_cache_init(void) | ||
| 748 | { | ||
| 749 | size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t); | ||
| 750 | |||
| 751 | if (PTRS_PER_PMD > 1) { | ||
| 752 | pmd_cache = kmem_cache_create("pmd", | ||
| 753 | PTRS_PER_PMD*sizeof(pmd_t), | ||
| 754 | PTRS_PER_PMD*sizeof(pmd_t), | ||
| 755 | SLAB_PANIC, | ||
| 756 | pmd_ctor); | ||
| 757 | if (!SHARED_KERNEL_PMD) { | ||
| 758 | /* If we're in PAE mode and have a non-shared | ||
| 759 | kernel pmd, then the pgd size must be a | ||
| 760 | page size. This is because the pgd_list | ||
| 761 | links through the page structure, so there | ||
| 762 | can only be one pgd per page for this to | ||
| 763 | work. */ | ||
| 764 | pgd_size = PAGE_SIZE; | ||
| 765 | } | ||
| 766 | } | ||
| 767 | } | ||
| 768 | |||
| 769 | /* | ||
| 770 | * This function cannot be __init, since exceptions don't work in that | ||
| 771 | * section. Put this after the callers, so that it cannot be inlined. | ||
| 772 | */ | ||
| 773 | static int noinline do_test_wp_bit(void) | ||
| 774 | { | ||
| 775 | char tmp_reg; | ||
| 776 | int flag; | ||
| 777 | |||
| 778 | __asm__ __volatile__( | ||
| 779 | " movb %0,%1 \n" | ||
| 780 | "1: movb %1,%0 \n" | ||
| 781 | " xorl %2,%2 \n" | ||
| 782 | "2: \n" | ||
| 783 | ".section __ex_table,\"a\"\n" | ||
| 784 | " .align 4 \n" | ||
| 785 | " .long 1b,2b \n" | ||
| 786 | ".previous \n" | ||
| 787 | :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), | ||
| 788 | "=q" (tmp_reg), | ||
| 789 | "=r" (flag) | ||
| 790 | :"2" (1) | ||
| 791 | :"memory"); | ||
| 792 | |||
| 793 | return flag; | ||
| 794 | } | ||
| 795 | |||
| 796 | #ifdef CONFIG_DEBUG_RODATA | ||
| 797 | |||
| 798 | void mark_rodata_ro(void) | ||
| 799 | { | ||
| 800 | unsigned long start = PFN_ALIGN(_text); | ||
| 801 | unsigned long size = PFN_ALIGN(_etext) - start; | ||
| 802 | |||
| 803 | #ifndef CONFIG_KPROBES | ||
| 804 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 805 | /* It must still be possible to apply SMP alternatives. */ | ||
| 806 | if (num_possible_cpus() <= 1) | ||
| 807 | #endif | ||
| 808 | { | ||
| 809 | change_page_attr(virt_to_page(start), | ||
| 810 | size >> PAGE_SHIFT, PAGE_KERNEL_RX); | ||
| 811 | printk("Write protecting the kernel text: %luk\n", size >> 10); | ||
| 812 | } | ||
| 813 | #endif | ||
| 814 | start += size; | ||
| 815 | size = (unsigned long)__end_rodata - start; | ||
| 816 | change_page_attr(virt_to_page(start), | ||
| 817 | size >> PAGE_SHIFT, PAGE_KERNEL_RO); | ||
| 818 | printk("Write protecting the kernel read-only data: %luk\n", | ||
| 819 | size >> 10); | ||
| 820 | |||
| 821 | /* | ||
| 822 | * change_page_attr() requires a global_flush_tlb() call after it. | ||
| 823 | * We do this after the printk so that if something went wrong in the | ||
| 824 | * change, the printk gets out at least to give a better debug hint | ||
| 825 | * of who is the culprit. | ||
| 826 | */ | ||
| 827 | global_flush_tlb(); | ||
| 828 | } | ||
| 829 | #endif | ||
| 830 | |||
| 831 | void free_init_pages(char *what, unsigned long begin, unsigned long end) | ||
| 832 | { | ||
| 833 | unsigned long addr; | ||
| 834 | |||
| 835 | for (addr = begin; addr < end; addr += PAGE_SIZE) { | ||
| 836 | ClearPageReserved(virt_to_page(addr)); | ||
| 837 | init_page_count(virt_to_page(addr)); | ||
| 838 | memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); | ||
| 839 | free_page(addr); | ||
| 840 | totalram_pages++; | ||
| 841 | } | ||
| 842 | printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); | ||
| 843 | } | ||
| 844 | |||
| 845 | void free_initmem(void) | ||
| 846 | { | ||
| 847 | free_init_pages("unused kernel memory", | ||
| 848 | (unsigned long)(&__init_begin), | ||
| 849 | (unsigned long)(&__init_end)); | ||
| 850 | } | ||
| 851 | |||
| 852 | #ifdef CONFIG_BLK_DEV_INITRD | ||
| 853 | void free_initrd_mem(unsigned long start, unsigned long end) | ||
| 854 | { | ||
| 855 | free_init_pages("initrd memory", start, end); | ||
| 856 | } | ||
| 857 | #endif | ||
| 858 | |||
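page_is_ram() above converts each byte-granular E820/EFI region into a page-frame range by rounding the region start up and the region end down, so a page only counts as RAM when the region covers it entirely. A stand-alone sketch of that conversion with made-up region values and an assumed 4 KB page size:

/* Sketch only: invented region values, 4 KB pages assumed. */
#include <stdio.h>

#define SK_PAGE_SHIFT 12
#define SK_PAGE_SIZE  (1UL << SK_PAGE_SHIFT)

/* 1 if pfn lies wholly inside [base, base+size), mirroring page_is_ram() */
static int sk_pfn_in_range(unsigned long pfn, unsigned long long base,
			   unsigned long long size)
{
	unsigned long start = (base + SK_PAGE_SIZE - 1) >> SK_PAGE_SHIFT; /* round up */
	unsigned long end   = (base + size) >> SK_PAGE_SHIFT;             /* round down */

	return pfn >= start && pfn < end;
}

int main(void)
{
	/* region starts mid-page: pfn 0x100 is only partially covered */
	printf("%d\n", sk_pfn_in_range(0x100, 0x100800ULL, 0x10000ULL)); /* 0 */
	printf("%d\n", sk_pfn_in_range(0x101, 0x100800ULL, 0x10000ULL)); /* 1 */
	return 0;
}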
diff --git a/arch/x86/mm/ioremap_32.c b/arch/x86/mm/ioremap_32.c new file mode 100644 index 000000000000..0b278315d737 --- /dev/null +++ b/arch/x86/mm/ioremap_32.c | |||
| @@ -0,0 +1,274 @@ | |||
| 1 | /* | ||
| 2 | * arch/i386/mm/ioremap.c | ||
| 3 | * | ||
| 4 | * Re-map IO memory to kernel address space so that we can access it. | ||
| 5 | * This is needed for high PCI addresses that aren't mapped in the | ||
| 6 | * 640k-1MB IO memory area on PC's | ||
| 7 | * | ||
| 8 | * (C) Copyright 1995 1996 Linus Torvalds | ||
| 9 | */ | ||
| 10 | |||
| 11 | #include <linux/vmalloc.h> | ||
| 12 | #include <linux/init.h> | ||
| 13 | #include <linux/slab.h> | ||
| 14 | #include <linux/module.h> | ||
| 15 | #include <linux/io.h> | ||
| 16 | #include <asm/fixmap.h> | ||
| 17 | #include <asm/cacheflush.h> | ||
| 18 | #include <asm/tlbflush.h> | ||
| 19 | #include <asm/pgtable.h> | ||
| 20 | |||
| 21 | #define ISA_START_ADDRESS 0xa0000 | ||
| 22 | #define ISA_END_ADDRESS 0x100000 | ||
| 23 | |||
| 24 | /* | ||
| 25 | * Generic mapping function (not visible outside): | ||
| 26 | */ | ||
| 27 | |||
| 28 | /* | ||
| 29 | * Remap an arbitrary physical address space into the kernel virtual | ||
| 30 | * address space. Needed when the kernel wants to access high addresses | ||
| 31 | * directly. | ||
| 32 | * | ||
| 33 | * NOTE! We need to allow non-page-aligned mappings too: we will obviously | ||
| 34 | * have to convert them into an offset in a page-aligned mapping, but the | ||
| 35 | * caller shouldn't need to know that small detail. | ||
| 36 | */ | ||
| 37 | void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) | ||
| 38 | { | ||
| 39 | void __iomem * addr; | ||
| 40 | struct vm_struct * area; | ||
| 41 | unsigned long offset, last_addr; | ||
| 42 | pgprot_t prot; | ||
| 43 | |||
| 44 | /* Don't allow wraparound or zero size */ | ||
| 45 | last_addr = phys_addr + size - 1; | ||
| 46 | if (!size || last_addr < phys_addr) | ||
| 47 | return NULL; | ||
| 48 | |||
| 49 | /* | ||
| 50 | * Don't remap the low PCI/ISA area, it's always mapped.. | ||
| 51 | */ | ||
| 52 | if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) | ||
| 53 | return (void __iomem *) phys_to_virt(phys_addr); | ||
| 54 | |||
| 55 | /* | ||
| 56 | * Don't allow anybody to remap normal RAM that we're using.. | ||
| 57 | */ | ||
| 58 | if (phys_addr <= virt_to_phys(high_memory - 1)) { | ||
| 59 | char *t_addr, *t_end; | ||
| 60 | struct page *page; | ||
| 61 | |||
| 62 | t_addr = __va(phys_addr); | ||
| 63 | t_end = t_addr + (size - 1); | ||
| 64 | |||
| 65 | for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) | ||
| 66 | if(!PageReserved(page)) | ||
| 67 | return NULL; | ||
| 68 | } | ||
| 69 | |||
| 70 | prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | ||
| 71 | | _PAGE_ACCESSED | flags); | ||
| 72 | |||
| 73 | /* | ||
| 74 | * Mappings have to be page-aligned | ||
| 75 | */ | ||
| 76 | offset = phys_addr & ~PAGE_MASK; | ||
| 77 | phys_addr &= PAGE_MASK; | ||
| 78 | size = PAGE_ALIGN(last_addr+1) - phys_addr; | ||
| 79 | |||
| 80 | /* | ||
| 81 | * Ok, go for it.. | ||
| 82 | */ | ||
| 83 | area = get_vm_area(size, VM_IOREMAP | (flags << 20)); | ||
| 84 | if (!area) | ||
| 85 | return NULL; | ||
| 86 | area->phys_addr = phys_addr; | ||
| 87 | addr = (void __iomem *) area->addr; | ||
| 88 | if (ioremap_page_range((unsigned long) addr, | ||
| 89 | (unsigned long) addr + size, phys_addr, prot)) { | ||
| 90 | vunmap((void __force *) addr); | ||
| 91 | return NULL; | ||
| 92 | } | ||
| 93 | return (void __iomem *) (offset + (char __iomem *)addr); | ||
| 94 | } | ||
| 95 | EXPORT_SYMBOL(__ioremap); | ||
| 96 | |||
| 97 | /** | ||
| 98 | * ioremap_nocache - map bus memory into CPU space | ||
| 99 | * @offset: bus address of the memory | ||
| 100 | * @size: size of the resource to map | ||
| 101 | * | ||
| 102 | * ioremap_nocache performs a platform specific sequence of operations to | ||
| 103 | * make bus memory CPU accessible via the readb/readw/readl/writeb/ | ||
| 104 | * writew/writel functions and the other mmio helpers. The returned | ||
| 105 | * address is not guaranteed to be usable directly as a virtual | ||
| 106 | * address. | ||
| 107 | * | ||
| 108 | * This version of ioremap ensures that the memory is marked uncachable | ||
| 109 | * on the CPU as well as honouring existing caching rules from things like | ||
| 110 | * the PCI bus. Note that there are other caches and buffers on many | ||
| 111 | * busses. In particular, driver authors should read up on PCI writes. | ||
| 112 | * | ||
| 113 | * It's useful if some control registers are in such an area and | ||
| 114 | * write combining or read caching is not desirable: | ||
| 115 | * | ||
| 116 | * Must be freed with iounmap. | ||
| 117 | */ | ||
| 118 | |||
| 119 | void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) | ||
| 120 | { | ||
| 121 | unsigned long last_addr; | ||
| 122 | void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD); | ||
| 123 | if (!p) | ||
| 124 | return p; | ||
| 125 | |||
| 126 | /* Guaranteed to be > phys_addr, as per __ioremap() */ | ||
| 127 | last_addr = phys_addr + size - 1; | ||
| 128 | |||
| 129 | if (last_addr < virt_to_phys(high_memory) - 1) { | ||
| 130 | struct page *ppage = virt_to_page(__va(phys_addr)); | ||
| 131 | unsigned long npages; | ||
| 132 | |||
| 133 | phys_addr &= PAGE_MASK; | ||
| 134 | |||
| 135 | /* This might overflow and become zero.. */ | ||
| 136 | last_addr = PAGE_ALIGN(last_addr); | ||
| 137 | |||
| 138 | /* .. but that's ok, because modulo-2**n arithmetic will make | ||
| 139 | * the page-aligned "last - first" come out right. | ||
| 140 | */ | ||
| 141 | npages = (last_addr - phys_addr) >> PAGE_SHIFT; | ||
| 142 | |||
| 143 | if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { | ||
| 144 | iounmap(p); | ||
| 145 | p = NULL; | ||
| 146 | } | ||
| 147 | global_flush_tlb(); | ||
| 148 | } | ||
| 149 | |||
| 150 | return p; | ||
| 151 | } | ||
| 152 | EXPORT_SYMBOL(ioremap_nocache); | ||
| 153 | |||
| 154 | /** | ||
| 155 | * iounmap - Free an IO remapping | ||
| 156 | * @addr: virtual address from ioremap_* | ||
| 157 | * | ||
| 158 | * Caller must ensure there is only one unmapping for the same pointer. | ||
| 159 | */ | ||
| 160 | void iounmap(volatile void __iomem *addr) | ||
| 161 | { | ||
| 162 | struct vm_struct *p, *o; | ||
| 163 | |||
| 164 | if ((void __force *)addr <= high_memory) | ||
| 165 | return; | ||
| 166 | |||
| 167 | /* | ||
| 168 | * __ioremap special-cases the PCI/ISA range by not instantiating a | ||
| 169 | * vm_area and by simply returning an address into the kernel mapping | ||
| 170 | * of ISA space. So handle that here. | ||
| 171 | */ | ||
| 172 | if (addr >= phys_to_virt(ISA_START_ADDRESS) && | ||
| 173 | addr < phys_to_virt(ISA_END_ADDRESS)) | ||
| 174 | return; | ||
| 175 | |||
| 176 | addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); | ||
| 177 | |||
| 178 | /* Use the vm area unlocked, assuming the caller | ||
| 179 | ensures there isn't another iounmap for the same address | ||
| 180 | in parallel. Reuse of the virtual address is prevented by | ||
| 181 | leaving it in the global lists until we're done with it. | ||
| 182 | cpa takes care of the direct mappings. */ | ||
| 183 | read_lock(&vmlist_lock); | ||
| 184 | for (p = vmlist; p; p = p->next) { | ||
| 185 | if (p->addr == addr) | ||
| 186 | break; | ||
| 187 | } | ||
| 188 | read_unlock(&vmlist_lock); | ||
| 189 | |||
| 190 | if (!p) { | ||
| 191 | printk("iounmap: bad address %p\n", addr); | ||
| 192 | dump_stack(); | ||
| 193 | return; | ||
| 194 | } | ||
| 195 | |||
| 196 | /* Reset the direct mapping. Can block */ | ||
| 197 | if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) { | ||
| 198 | change_page_attr(virt_to_page(__va(p->phys_addr)), | ||
| 199 | get_vm_area_size(p) >> PAGE_SHIFT, | ||
| 200 | PAGE_KERNEL); | ||
| 201 | global_flush_tlb(); | ||
| 202 | } | ||
| 203 | |||
| 204 | /* Finally remove it */ | ||
| 205 | o = remove_vm_area((void *)addr); | ||
| 206 | BUG_ON(p != o || o == NULL); | ||
| 207 | kfree(p); | ||
| 208 | } | ||
| 209 | EXPORT_SYMBOL(iounmap); | ||
| 210 | |||
| 211 | void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) | ||
| 212 | { | ||
| 213 | unsigned long offset, last_addr; | ||
| 214 | unsigned int nrpages; | ||
| 215 | enum fixed_addresses idx; | ||
| 216 | |||
| 217 | /* Don't allow wraparound or zero size */ | ||
| 218 | last_addr = phys_addr + size - 1; | ||
| 219 | if (!size || last_addr < phys_addr) | ||
| 220 | return NULL; | ||
| 221 | |||
| 222 | /* | ||
| 223 | * Don't remap the low PCI/ISA area, it's always mapped.. | ||
| 224 | */ | ||
| 225 | if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) | ||
| 226 | return phys_to_virt(phys_addr); | ||
| 227 | |||
| 228 | /* | ||
| 229 | * Mappings have to be page-aligned | ||
| 230 | */ | ||
| 231 | offset = phys_addr & ~PAGE_MASK; | ||
| 232 | phys_addr &= PAGE_MASK; | ||
| 233 | size = PAGE_ALIGN(last_addr) - phys_addr; | ||
| 234 | |||
| 235 | /* | ||
| 236 | * Mappings have to fit in the FIX_BTMAP area. | ||
| 237 | */ | ||
| 238 | nrpages = size >> PAGE_SHIFT; | ||
| 239 | if (nrpages > NR_FIX_BTMAPS) | ||
| 240 | return NULL; | ||
| 241 | |||
| 242 | /* | ||
| 243 | * Ok, go for it.. | ||
| 244 | */ | ||
| 245 | idx = FIX_BTMAP_BEGIN; | ||
| 246 | while (nrpages > 0) { | ||
| 247 | set_fixmap(idx, phys_addr); | ||
| 248 | phys_addr += PAGE_SIZE; | ||
| 249 | --idx; | ||
| 250 | --nrpages; | ||
| 251 | } | ||
| 252 | return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); | ||
| 253 | } | ||
| 254 | |||
| 255 | void __init bt_iounmap(void *addr, unsigned long size) | ||
| 256 | { | ||
| 257 | unsigned long virt_addr; | ||
| 258 | unsigned long offset; | ||
| 259 | unsigned int nrpages; | ||
| 260 | enum fixed_addresses idx; | ||
| 261 | |||
| 262 | virt_addr = (unsigned long)addr; | ||
| 263 | if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) | ||
| 264 | return; | ||
| 265 | offset = virt_addr & ~PAGE_MASK; | ||
| 266 | nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; | ||
| 267 | |||
| 268 | idx = FIX_BTMAP_BEGIN; | ||
| 269 | while (nrpages > 0) { | ||
| 270 | clear_fixmap(idx); | ||
| 271 | --idx; | ||
| 272 | --nrpages; | ||
| 273 | } | ||
| 274 | } | ||
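For context, a typical driver-side use of the ioremap_nocache()/iounmap() pair implemented above might look like the sketch below; the DEMO_* address, length and register offset are invented for illustration, and a real driver would take them from its PCI BAR or platform resource (and would normally also claim the range with request_mem_region()):

/* Usage sketch only: DEMO_* values are made up for illustration. */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/io.h>

#define DEMO_MMIO_PHYS	0xfe000000UL
#define DEMO_MMIO_LEN	0x1000
#define DEMO_REG_STATUS	0x04

static void __iomem *demo_regs;

static int demo_map_regs(void)
{
	/* request an uncached mapping of the device registers */
	demo_regs = ioremap_nocache(DEMO_MMIO_PHYS, DEMO_MMIO_LEN);
	if (!demo_regs)
		return -ENOMEM;

	/* always go through the MMIO accessors, never plain dereferences */
	printk(KERN_INFO "demo status: %#x\n",
	       readl(demo_regs + DEMO_REG_STATUS));
	return 0;
}

static void demo_unmap_regs(void)
{
	iounmap(demo_regs);
	demo_regs = NULL;
}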
diff --git a/arch/x86/mm/mmap_32.c b/arch/x86/mm/mmap_32.c new file mode 100644 index 000000000000..552e08473755 --- /dev/null +++ b/arch/x86/mm/mmap_32.c | |||
| @@ -0,0 +1,77 @@ | |||
| 1 | /* | ||
| 2 | * linux/arch/i386/mm/mmap.c | ||
| 3 | * | ||
| 4 | * flexible mmap layout support | ||
| 5 | * | ||
| 6 | * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. | ||
| 7 | * All Rights Reserved. | ||
| 8 | * | ||
| 9 | * This program is free software; you can redistribute it and/or modify | ||
| 10 | * it under the terms of the GNU General Public License as published by | ||
| 11 | * the Free Software Foundation; either version 2 of the License, or | ||
| 12 | * (at your option) any later version. | ||
| 13 | * | ||
| 14 | * This program is distributed in the hope that it will be useful, | ||
| 15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 17 | * GNU General Public License for more details. | ||
| 18 | * | ||
| 19 | * You should have received a copy of the GNU General Public License | ||
| 20 | * along with this program; if not, write to the Free Software | ||
| 21 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 22 | * | ||
| 23 | * | ||
| 24 | * Started by Ingo Molnar <mingo@elte.hu> | ||
| 25 | */ | ||
| 26 | |||
| 27 | #include <linux/personality.h> | ||
| 28 | #include <linux/mm.h> | ||
| 29 | #include <linux/random.h> | ||
| 30 | #include <linux/sched.h> | ||
| 31 | |||
| 32 | /* | ||
| 33 | * Top of mmap area (just below the process stack). | ||
| 34 | * | ||
| 35 | * Leave at least a ~128 MB hole. | ||
| 36 | */ | ||
| 37 | #define MIN_GAP (128*1024*1024) | ||
| 38 | #define MAX_GAP (TASK_SIZE/6*5) | ||
| 39 | |||
| 40 | static inline unsigned long mmap_base(struct mm_struct *mm) | ||
| 41 | { | ||
| 42 | unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; | ||
| 43 | unsigned long random_factor = 0; | ||
| 44 | |||
| 45 | if (current->flags & PF_RANDOMIZE) | ||
| 46 | random_factor = get_random_int() % (1024*1024); | ||
| 47 | |||
| 48 | if (gap < MIN_GAP) | ||
| 49 | gap = MIN_GAP; | ||
| 50 | else if (gap > MAX_GAP) | ||
| 51 | gap = MAX_GAP; | ||
| 52 | |||
| 53 | return PAGE_ALIGN(TASK_SIZE - gap - random_factor); | ||
| 54 | } | ||
| 55 | |||
| 56 | /* | ||
| 57 | * This function, called very early during the creation of a new | ||
| 58 | * process VM image, sets up which VM layout function to use: | ||
| 59 | */ | ||
| 60 | void arch_pick_mmap_layout(struct mm_struct *mm) | ||
| 61 | { | ||
| 62 | /* | ||
| 63 | * Fall back to the standard layout if the personality | ||
| 64 | * bit is set, or if the expected stack growth is unlimited: | ||
| 65 | */ | ||
| 66 | if (sysctl_legacy_va_layout || | ||
| 67 | (current->personality & ADDR_COMPAT_LAYOUT) || | ||
| 68 | current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) { | ||
| 69 | mm->mmap_base = TASK_UNMAPPED_BASE; | ||
| 70 | mm->get_unmapped_area = arch_get_unmapped_area; | ||
| 71 | mm->unmap_area = arch_unmap_area; | ||
| 72 | } else { | ||
| 73 | mm->mmap_base = mmap_base(mm); | ||
| 74 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; | ||
| 75 | mm->unmap_area = arch_unmap_area_topdown; | ||
| 76 | } | ||
| 77 | } | ||
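To make the layout choice above concrete, here is a stand-alone sketch of the mmap_base() arithmetic with made-up inputs, assuming the usual 3 GB i386 TASK_SIZE and 4 KB pages: the stack rlimit is clamped between MIN_GAP and MAX_GAP, an optional random offset is subtracted, and the result is page-aligned.

/* Sketch only: reproduces the mmap_base() arithmetic with invented inputs
 * and an assumed 3 GB TASK_SIZE / 4 KB page size. */
#include <stdio.h>

#define SK_TASK_SIZE	 0xC0000000UL
#define SK_PAGE_ALIGN(x) (((x) + 0xFFFUL) & ~0xFFFUL)
#define SK_MIN_GAP	 (128UL * 1024 * 1024)
#define SK_MAX_GAP	 (SK_TASK_SIZE / 6 * 5)

static unsigned long sk_mmap_base(unsigned long stack_rlim, unsigned long rnd)
{
	unsigned long gap = stack_rlim;

	if (gap < SK_MIN_GAP)
		gap = SK_MIN_GAP;
	else if (gap > SK_MAX_GAP)
		gap = SK_MAX_GAP;

	/* mmap area starts just below the (clamped, randomized) stack gap */
	return SK_PAGE_ALIGN(SK_TASK_SIZE - gap - rnd);
}

int main(void)
{
	/* typical 8 MB stack rlimit with ~180 KB of randomization */
	printf("%#lx\n", sk_mmap_base(8UL * 1024 * 1024, 0x2d000));
	/* very large rlimit clamps to MAX_GAP */
	printf("%#lx\n", sk_mmap_base(3UL * 1024 * 1024 * 1024, 0));
	return 0;
}

With an 8 MB stack rlimit this lands a little below 0xb8000000, which is the neighbourhood where shared libraries typically show up in /proc/<pid>/maps on i386.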
diff --git a/arch/x86/mm/pageattr_32.c b/arch/x86/mm/pageattr_32.c new file mode 100644 index 000000000000..4241a74d16c8 --- /dev/null +++ b/arch/x86/mm/pageattr_32.c | |||
| @@ -0,0 +1,278 @@ | |||
| 1 | /* | ||
| 2 | * Copyright 2002 Andi Kleen, SuSE Labs. | ||
| 3 | * Thanks to Ben LaHaise for precious feedback. | ||
| 4 | */ | ||
| 5 | |||
| 6 | #include <linux/mm.h> | ||
| 7 | #include <linux/sched.h> | ||
| 8 | #include <linux/highmem.h> | ||
| 9 | #include <linux/module.h> | ||
| 10 | #include <linux/slab.h> | ||
| 11 | #include <asm/uaccess.h> | ||
| 12 | #include <asm/processor.h> | ||
| 13 | #include <asm/tlbflush.h> | ||
| 14 | #include <asm/pgalloc.h> | ||
| 15 | #include <asm/sections.h> | ||
| 16 | |||
| 17 | static DEFINE_SPINLOCK(cpa_lock); | ||
| 18 | static struct list_head df_list = LIST_HEAD_INIT(df_list); | ||
| 19 | |||
| 20 | |||
| 21 | pte_t *lookup_address(unsigned long address) | ||
| 22 | { | ||
| 23 | pgd_t *pgd = pgd_offset_k(address); | ||
| 24 | pud_t *pud; | ||
| 25 | pmd_t *pmd; | ||
| 26 | if (pgd_none(*pgd)) | ||
| 27 | return NULL; | ||
| 28 | pud = pud_offset(pgd, address); | ||
| 29 | if (pud_none(*pud)) | ||
| 30 | return NULL; | ||
| 31 | pmd = pmd_offset(pud, address); | ||
| 32 | if (pmd_none(*pmd)) | ||
| 33 | return NULL; | ||
| 34 | if (pmd_large(*pmd)) | ||
| 35 | return (pte_t *)pmd; | ||
| 36 | return pte_offset_kernel(pmd, address); | ||
| 37 | } | ||
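lookup_address() returns either a real pte pointer or, for a 2/4 MB mapping, the pmd entry cast to pte_t *. A hedged, illustrative caller (not part of the patch) that relies only on helpers already used in this file might look like:

/* Hedged sketch: probe how a kernel virtual address is currently mapped. */
static int example_mapped_by_large_page(unsigned long vaddr)
{
	pte_t *pte = lookup_address(vaddr);

	if (!pte)
		return 0;	/* not mapped at all */
	/* for 2/4 MB pages this is the pmd cast to a pte pointer */
	return pte_huge(*pte);
}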
| 38 | |||
| 39 | static struct page *split_large_page(unsigned long address, pgprot_t prot, | ||
| 40 | pgprot_t ref_prot) | ||
| 41 | { | ||
| 42 | int i; | ||
| 43 | unsigned long addr; | ||
| 44 | struct page *base; | ||
| 45 | pte_t *pbase; | ||
| 46 | |||
| 47 | spin_unlock_irq(&cpa_lock); | ||
| 48 | base = alloc_pages(GFP_KERNEL, 0); | ||
| 49 | spin_lock_irq(&cpa_lock); | ||
| 50 | if (!base) | ||
| 51 | return NULL; | ||
| 52 | |||
| 53 | /* | ||
| 54 | * page_private is used to track the number of entries in | ||
| 55 | * the page table page that have non-standard attributes. | ||
| 56 | */ | ||
| 57 | SetPagePrivate(base); | ||
| 58 | page_private(base) = 0; | ||
| 59 | |||
| 60 | address = __pa(address); | ||
| 61 | addr = address & LARGE_PAGE_MASK; | ||
| 62 | pbase = (pte_t *)page_address(base); | ||
| 63 | paravirt_alloc_pt(&init_mm, page_to_pfn(base)); | ||
| 64 | for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { | ||
| 65 | set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, | ||
| 66 | addr == address ? prot : ref_prot)); | ||
| 67 | } | ||
| 68 | return base; | ||
| 69 | } | ||
| 70 | |||
| 71 | static void cache_flush_page(struct page *p) | ||
| 72 | { | ||
| 73 | unsigned long adr = (unsigned long)page_address(p); | ||
| 74 | int i; | ||
| 75 | for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) | ||
| 76 | asm volatile("clflush (%0)" :: "r" (adr + i)); | ||
| 77 | } | ||
| 78 | |||
| 79 | static void flush_kernel_map(void *arg) | ||
| 80 | { | ||
| 81 | struct list_head *lh = (struct list_head *)arg; | ||
| 82 | struct page *p; | ||
| 83 | |||
| 84 | /* High level code is not ready for clflush yet */ | ||
| 85 | if (0 && cpu_has_clflush) { | ||
| 86 | list_for_each_entry (p, lh, lru) | ||
| 87 | cache_flush_page(p); | ||
| 88 | } else if (boot_cpu_data.x86_model >= 4) | ||
| 89 | wbinvd(); | ||
| 90 | |||
| 91 | /* Flush all TLB entries to work around errata in early Athlons regarding | ||
| 92 | * large page flushing. | ||
| 93 | */ | ||
| 94 | __flush_tlb_all(); | ||
| 95 | } | ||
| 96 | |||
| 97 | static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) | ||
| 98 | { | ||
| 99 | struct page *page; | ||
| 100 | unsigned long flags; | ||
| 101 | |||
| 102 | set_pte_atomic(kpte, pte); /* change init_mm */ | ||
| 103 | if (SHARED_KERNEL_PMD) | ||
| 104 | return; | ||
| 105 | |||
| 106 | spin_lock_irqsave(&pgd_lock, flags); | ||
| 107 | for (page = pgd_list; page; page = (struct page *)page->index) { | ||
| 108 | pgd_t *pgd; | ||
| 109 | pud_t *pud; | ||
| 110 | pmd_t *pmd; | ||
| 111 | pgd = (pgd_t *)page_address(page) + pgd_index(address); | ||
| 112 | pud = pud_offset(pgd, address); | ||
| 113 | pmd = pmd_offset(pud, address); | ||
| 114 | set_pte_atomic((pte_t *)pmd, pte); | ||
| 115 | } | ||
| 116 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
| 117 | } | ||
| 118 | |||
| 119 | /* | ||
| 120 | * No more special protections in this 2/4MB area - revert to a | ||
| 121 | * large page again. | ||
| 122 | */ | ||
| 123 | static inline void revert_page(struct page *kpte_page, unsigned long address) | ||
| 124 | { | ||
| 125 | pgprot_t ref_prot; | ||
| 126 | pte_t *linear; | ||
| 127 | |||
| 128 | ref_prot = | ||
| 129 | ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext) | ||
| 130 | ? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE; | ||
| 131 | |||
| 132 | linear = (pte_t *) | ||
| 133 | pmd_offset(pud_offset(pgd_offset_k(address), address), address); | ||
| 134 | set_pmd_pte(linear, address, | ||
| 135 | pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, | ||
| 136 | ref_prot)); | ||
| 137 | } | ||
| 138 | |||
| 139 | static inline void save_page(struct page *kpte_page) | ||
| 140 | { | ||
| 141 | if (!test_and_set_bit(PG_arch_1, &kpte_page->flags)) | ||
| 142 | list_add(&kpte_page->lru, &df_list); | ||
| 143 | } | ||
| 144 | |||
| 145 | static int | ||
| 146 | __change_page_attr(struct page *page, pgprot_t prot) | ||
| 147 | { | ||
| 148 | pte_t *kpte; | ||
| 149 | unsigned long address; | ||
| 150 | struct page *kpte_page; | ||
| 151 | |||
| 152 | BUG_ON(PageHighMem(page)); | ||
| 153 | address = (unsigned long)page_address(page); | ||
| 154 | |||
| 155 | kpte = lookup_address(address); | ||
| 156 | if (!kpte) | ||
| 157 | return -EINVAL; | ||
| 158 | kpte_page = virt_to_page(kpte); | ||
| 159 | BUG_ON(PageLRU(kpte_page)); | ||
| 160 | BUG_ON(PageCompound(kpte_page)); | ||
| 161 | |||
| 162 | if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { | ||
| 163 | if (!pte_huge(*kpte)) { | ||
| 164 | set_pte_atomic(kpte, mk_pte(page, prot)); | ||
| 165 | } else { | ||
| 166 | pgprot_t ref_prot; | ||
| 167 | struct page *split; | ||
| 168 | |||
| 169 | ref_prot = | ||
| 170 | ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext) | ||
| 171 | ? PAGE_KERNEL_EXEC : PAGE_KERNEL; | ||
| 172 | split = split_large_page(address, prot, ref_prot); | ||
| 173 | if (!split) | ||
| 174 | return -ENOMEM; | ||
| 175 | set_pmd_pte(kpte, address, mk_pte(split, ref_prot)); | ||
| 176 | kpte_page = split; | ||
| 177 | } | ||
| 178 | page_private(kpte_page)++; | ||
| 179 | } else if (!pte_huge(*kpte)) { | ||
| 180 | set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); | ||
| 181 | BUG_ON(page_private(kpte_page) == 0); | ||
| 182 | page_private(kpte_page)--; | ||
| 183 | } else | ||
| 184 | BUG(); | ||
| 185 | |||
| 186 | /* | ||
| 187 | * If the pte was reserved, it means it was created at boot | ||
| 188 | * time (not via split_large_page), so we must not | ||
| 189 | * replace it with a large page. | ||
| 190 | */ | ||
| 191 | |||
| 192 | save_page(kpte_page); | ||
| 193 | if (!PageReserved(kpte_page)) { | ||
| 194 | if (cpu_has_pse && (page_private(kpte_page) == 0)) { | ||
| 195 | paravirt_release_pt(page_to_pfn(kpte_page)); | ||
| 196 | revert_page(kpte_page, address); | ||
| 197 | } | ||
| 198 | } | ||
| 199 | return 0; | ||
| 200 | } | ||
| 201 | |||
| 202 | static inline void flush_map(struct list_head *l) | ||
| 203 | { | ||
| 204 | on_each_cpu(flush_kernel_map, l, 1, 1); | ||
| 205 | } | ||
| 206 | |||
| 207 | /* | ||
| 208 | * Change the page attributes of a page in the linear mapping. | ||
| 209 | * | ||
| 210 | * This should be used when a page is mapped with a different caching policy | ||
| 211 | * than write-back somewhere - some CPUs do not like it when mappings with | ||
| 212 | * different caching policies exist. This changes the page attributes of the | ||
| 213 | * kernel linear mapping too. | ||
| 214 | * | ||
| 215 | * The caller needs to ensure that there are no conflicting mappings elsewhere. | ||
| 216 | * This function only deals with the kernel linear map. | ||
| 217 | * | ||
| 218 | * Caller must call global_flush_tlb() after this. | ||
| 219 | */ | ||
| 220 | int change_page_attr(struct page *page, int numpages, pgprot_t prot) | ||
| 221 | { | ||
| 222 | int err = 0; | ||
| 223 | int i; | ||
| 224 | unsigned long flags; | ||
| 225 | |||
| 226 | spin_lock_irqsave(&cpa_lock, flags); | ||
| 227 | for (i = 0; i < numpages; i++, page++) { | ||
| 228 | err = __change_page_attr(page, prot); | ||
| 229 | if (err) | ||
| 230 | break; | ||
| 231 | } | ||
| 232 | spin_unlock_irqrestore(&cpa_lock, flags); | ||
| 233 | return err; | ||
| 234 | } | ||
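A hedged usage sketch of the API documented above (illustrative only, not part of the patch). It assumes PAGE_KERNEL_NOCACHE as the protection, the usual i386 choice for an uncached kernel mapping, and pairs the call with the mandatory global_flush_tlb().

/* Hedged sketch: make one page uncacheable in the kernel linear mapping. */
static int example_make_page_uncached(struct page *page)
{
	int err = change_page_attr(page, 1, PAGE_KERNEL_NOCACHE);

	if (!err)
		global_flush_tlb();	/* required after change_page_attr() */
	return err;
}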
| 235 | |||
| 236 | void global_flush_tlb(void) | ||
| 237 | { | ||
| 238 | struct list_head l; | ||
| 239 | struct page *pg, *next; | ||
| 240 | |||
| 241 | BUG_ON(irqs_disabled()); | ||
| 242 | |||
| 243 | spin_lock_irq(&cpa_lock); | ||
| 244 | list_replace_init(&df_list, &l); | ||
| 245 | spin_unlock_irq(&cpa_lock); | ||
| 246 | flush_map(&l); | ||
| 247 | list_for_each_entry_safe(pg, next, &l, lru) { | ||
| 248 | list_del(&pg->lru); | ||
| 249 | clear_bit(PG_arch_1, &pg->flags); | ||
| 250 | if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0) | ||
| 251 | continue; | ||
| 252 | ClearPagePrivate(pg); | ||
| 253 | __free_page(pg); | ||
| 254 | } | ||
| 255 | } | ||
| 256 | |||
| 257 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
| 258 | void kernel_map_pages(struct page *page, int numpages, int enable) | ||
| 259 | { | ||
| 260 | if (PageHighMem(page)) | ||
| 261 | return; | ||
| 262 | if (!enable) | ||
| 263 | debug_check_no_locks_freed(page_address(page), | ||
| 264 | numpages * PAGE_SIZE); | ||
| 265 | |||
| 266 | /* The return value is ignored - the calls cannot fail because | ||
| 267 | * large pages are disabled at boot time. | ||
| 268 | */ | ||
| 269 | change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); | ||
| 270 | /* We should perform an IPI and flush all TLBs, | ||
| 271 | * but that can deadlock, so flush only the current CPU. | ||
| 272 | */ | ||
| 273 | __flush_tlb_all(); | ||
| 274 | } | ||
| 275 | #endif | ||
| 276 | |||
| 277 | EXPORT_SYMBOL(change_page_attr); | ||
| 278 | EXPORT_SYMBOL(global_flush_tlb); | ||
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c new file mode 100644 index 000000000000..01437c46baae --- /dev/null +++ b/arch/x86/mm/pgtable_32.c | |||
| @@ -0,0 +1,373 @@ | |||
| 1 | /* | ||
| 2 | * linux/arch/i386/mm/pgtable.c | ||
| 3 | */ | ||
| 4 | |||
| 5 | #include <linux/sched.h> | ||
| 6 | #include <linux/kernel.h> | ||
| 7 | #include <linux/errno.h> | ||
| 8 | #include <linux/mm.h> | ||
| 9 | #include <linux/swap.h> | ||
| 10 | #include <linux/smp.h> | ||
| 11 | #include <linux/highmem.h> | ||
| 12 | #include <linux/slab.h> | ||
| 13 | #include <linux/pagemap.h> | ||
| 14 | #include <linux/spinlock.h> | ||
| 15 | #include <linux/module.h> | ||
| 16 | #include <linux/quicklist.h> | ||
| 17 | |||
| 18 | #include <asm/system.h> | ||
| 19 | #include <asm/pgtable.h> | ||
| 20 | #include <asm/pgalloc.h> | ||
| 21 | #include <asm/fixmap.h> | ||
| 22 | #include <asm/e820.h> | ||
| 23 | #include <asm/tlb.h> | ||
| 24 | #include <asm/tlbflush.h> | ||
| 25 | |||
| 26 | void show_mem(void) | ||
| 27 | { | ||
| 28 | int total = 0, reserved = 0; | ||
| 29 | int shared = 0, cached = 0; | ||
| 30 | int highmem = 0; | ||
| 31 | struct page *page; | ||
| 32 | pg_data_t *pgdat; | ||
| 33 | unsigned long i; | ||
| 34 | unsigned long flags; | ||
| 35 | |||
| 36 | printk(KERN_INFO "Mem-info:\n"); | ||
| 37 | show_free_areas(); | ||
| 38 | printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); | ||
| 39 | for_each_online_pgdat(pgdat) { | ||
| 40 | pgdat_resize_lock(pgdat, &flags); | ||
| 41 | for (i = 0; i < pgdat->node_spanned_pages; ++i) { | ||
| 42 | page = pgdat_page_nr(pgdat, i); | ||
| 43 | total++; | ||
| 44 | if (PageHighMem(page)) | ||
| 45 | highmem++; | ||
| 46 | if (PageReserved(page)) | ||
| 47 | reserved++; | ||
| 48 | else if (PageSwapCache(page)) | ||
| 49 | cached++; | ||
| 50 | else if (page_count(page)) | ||
| 51 | shared += page_count(page) - 1; | ||
| 52 | } | ||
| 53 | pgdat_resize_unlock(pgdat, &flags); | ||
| 54 | } | ||
| 55 | printk(KERN_INFO "%d pages of RAM\n", total); | ||
| 56 | printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); | ||
| 57 | printk(KERN_INFO "%d reserved pages\n", reserved); | ||
| 58 | printk(KERN_INFO "%d pages shared\n", shared); | ||
| 59 | printk(KERN_INFO "%d pages swap cached\n", cached); | ||
| 60 | |||
| 61 | printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY)); | ||
| 62 | printk(KERN_INFO "%lu pages writeback\n", | ||
| 63 | global_page_state(NR_WRITEBACK)); | ||
| 64 | printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED)); | ||
| 65 | printk(KERN_INFO "%lu pages slab\n", | ||
| 66 | global_page_state(NR_SLAB_RECLAIMABLE) + | ||
| 67 | global_page_state(NR_SLAB_UNRECLAIMABLE)); | ||
| 68 | printk(KERN_INFO "%lu pages pagetables\n", | ||
| 69 | global_page_state(NR_PAGETABLE)); | ||
| 70 | } | ||
| 71 | |||
| 72 | /* | ||
| 73 | * Associate a virtual page frame with a given physical page frame | ||
| 74 | * and protection flags for that frame. | ||
| 75 | */ | ||
| 76 | static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) | ||
| 77 | { | ||
| 78 | pgd_t *pgd; | ||
| 79 | pud_t *pud; | ||
| 80 | pmd_t *pmd; | ||
| 81 | pte_t *pte; | ||
| 82 | |||
| 83 | pgd = swapper_pg_dir + pgd_index(vaddr); | ||
| 84 | if (pgd_none(*pgd)) { | ||
| 85 | BUG(); | ||
| 86 | return; | ||
| 87 | } | ||
| 88 | pud = pud_offset(pgd, vaddr); | ||
| 89 | if (pud_none(*pud)) { | ||
| 90 | BUG(); | ||
| 91 | return; | ||
| 92 | } | ||
| 93 | pmd = pmd_offset(pud, vaddr); | ||
| 94 | if (pmd_none(*pmd)) { | ||
| 95 | BUG(); | ||
| 96 | return; | ||
| 97 | } | ||
| 98 | pte = pte_offset_kernel(pmd, vaddr); | ||
| 99 | if (pgprot_val(flags)) | ||
| 100 | /* <pfn,flags> stored as-is, to permit clearing entries */ | ||
| 101 | set_pte(pte, pfn_pte(pfn, flags)); | ||
| 102 | else | ||
| 103 | pte_clear(&init_mm, vaddr, pte); | ||
| 104 | |||
| 105 | /* | ||
| 106 | * It's enough to flush this one mapping. | ||
| 107 | * (PGE mappings get flushed as well) | ||
| 108 | */ | ||
| 109 | __flush_tlb_one(vaddr); | ||
| 110 | } | ||
| 111 | |||
| 112 | /* | ||
| 113 | * Associate a large virtual page frame with a given physical page frame | ||
| 114 | * and protection flags for that frame. pfn is for the base of the page, | ||
| 115 | * vaddr is what the page gets mapped to - both must be properly aligned. | ||
| 116 | * The pmd must already be instantiated. Assumes PAE mode. | ||
| 117 | */ | ||
| 118 | void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) | ||
| 119 | { | ||
| 120 | pgd_t *pgd; | ||
| 121 | pud_t *pud; | ||
| 122 | pmd_t *pmd; | ||
| 123 | |||
| 124 | if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ | ||
| 125 | printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n"); | ||
| 126 | return; /* BUG(); */ | ||
| 127 | } | ||
| 128 | if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ | ||
| 129 | printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n"); | ||
| 130 | return; /* BUG(); */ | ||
| 131 | } | ||
| 132 | pgd = swapper_pg_dir + pgd_index(vaddr); | ||
| 133 | if (pgd_none(*pgd)) { | ||
| 134 | printk(KERN_WARNING "set_pmd_pfn: pgd_none\n"); | ||
| 135 | return; /* BUG(); */ | ||
| 136 | } | ||
| 137 | pud = pud_offset(pgd, vaddr); | ||
| 138 | pmd = pmd_offset(pud, vaddr); | ||
| 139 | set_pmd(pmd, pfn_pmd(pfn, flags)); | ||
| 140 | /* | ||
| 141 | * It's enough to flush this one mapping. | ||
| 142 | * (PGE mappings get flushed as well) | ||
| 143 | */ | ||
| 144 | __flush_tlb_one(vaddr); | ||
| 145 | } | ||
| 146 | |||
| 147 | static int fixmaps; | ||
| 148 | unsigned long __FIXADDR_TOP = 0xfffff000; | ||
| 149 | EXPORT_SYMBOL(__FIXADDR_TOP); | ||
| 150 | |||
| 151 | void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) | ||
| 152 | { | ||
| 153 | unsigned long address = __fix_to_virt(idx); | ||
| 154 | |||
| 155 | if (idx >= __end_of_fixed_addresses) { | ||
| 156 | BUG(); | ||
| 157 | return; | ||
| 158 | } | ||
| 159 | set_pte_pfn(address, phys >> PAGE_SHIFT, flags); | ||
| 160 | fixmaps++; | ||
| 161 | } | ||
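__set_fixmap() is normally reached through the set_fixmap()/set_fixmap_nocache() wrappers in <asm/fixmap.h>. The sketch below is illustrative only: FIX_EXAMPLE is a hypothetical enum fixed_addresses slot, not one defined by this patch.

/* Hedged sketch: FIX_EXAMPLE is a made-up fixmap slot for illustration. */
static void __iomem *example_map_mmio_fixed(unsigned long phys)
{
	set_fixmap_nocache(FIX_EXAMPLE, phys);	/* expands to __set_fixmap() above */
	return (void __iomem *)fix_to_virt(FIX_EXAMPLE);
}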
| 162 | |||
| 163 | /** | ||
| 164 | * reserve_top_address - reserves a hole in the top of kernel address space | ||
| 165 | * @reserve: size of hole to reserve | ||
| 166 | * | ||
| 167 | * Can be used to relocate the fixmap area and poke a hole in the top | ||
| 168 | * of kernel address space to make room for a hypervisor. | ||
| 169 | */ | ||
| 170 | void reserve_top_address(unsigned long reserve) | ||
| 171 | { | ||
| 172 | BUG_ON(fixmaps > 0); | ||
| 173 | printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", | ||
| 174 | (int)-reserve); | ||
| 175 | __FIXADDR_TOP = -reserve - PAGE_SIZE; | ||
| 176 | __VMALLOC_RESERVE += reserve; | ||
| 177 | } | ||
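reserve_top_address() is meant for paravirtual guests that need a hole at the top of the kernel address space, and it must run before the first fixmap is established or the BUG_ON(fixmaps > 0) above fires. A hedged sketch follows; the 64 MB figure is an arbitrary example, not taken from any real hypervisor.

/* Hedged sketch: carve out the top 64 MB before any fixmap is set up. */
static void __init example_reserve_hypervisor_hole(void)
{
	reserve_top_address(64 * 1024 * 1024);
}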
| 178 | |||
| 179 | pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) | ||
| 180 | { | ||
| 181 | return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); | ||
| 182 | } | ||
| 183 | |||
| 184 | struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) | ||
| 185 | { | ||
| 186 | struct page *pte; | ||
| 187 | |||
| 188 | #ifdef CONFIG_HIGHPTE | ||
| 189 | pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); | ||
| 190 | #else | ||
| 191 | pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); | ||
| 192 | #endif | ||
| 193 | return pte; | ||
| 194 | } | ||
| 195 | |||
| 196 | void pmd_ctor(void *pmd, struct kmem_cache *cache, unsigned long flags) | ||
| 197 | { | ||
| 198 | memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); | ||
| 199 | } | ||
| 200 | |||
| 201 | /* | ||
| 202 | * List of all pgd's needed for non-PAE so it can invalidate entries | ||
| 203 | * in both cached and uncached pgd's; not needed for PAE since the | ||
| 204 | * kernel pmd is shared. If PAE were not to share the pmd a similar | ||
| 205 | * tactic would be needed. This is essentially codepath-based locking | ||
| 206 | * against pageattr.c; it is the unique case in which a valid change | ||
| 207 | * of kernel pagetables can't be lazily synchronized by vmalloc faults. | ||
| 208 | * vmalloc faults work because attached pagetables are never freed. | ||
| 209 | * -- wli | ||
| 210 | */ | ||
| 211 | DEFINE_SPINLOCK(pgd_lock); | ||
| 212 | struct page *pgd_list; | ||
| 213 | |||
| 214 | static inline void pgd_list_add(pgd_t *pgd) | ||
| 215 | { | ||
| 216 | struct page *page = virt_to_page(pgd); | ||
| 217 | page->index = (unsigned long)pgd_list; | ||
| 218 | if (pgd_list) | ||
| 219 | set_page_private(pgd_list, (unsigned long)&page->index); | ||
| 220 | pgd_list = page; | ||
| 221 | set_page_private(page, (unsigned long)&pgd_list); | ||
| 222 | } | ||
| 223 | |||
| 224 | static inline void pgd_list_del(pgd_t *pgd) | ||
| 225 | { | ||
| 226 | struct page *next, **pprev, *page = virt_to_page(pgd); | ||
| 227 | next = (struct page *)page->index; | ||
| 228 | pprev = (struct page **)page_private(page); | ||
| 229 | *pprev = next; | ||
| 230 | if (next) | ||
| 231 | set_page_private(next, (unsigned long)pprev); | ||
| 232 | } | ||
| 233 | |||
| 234 | |||
| 235 | |||
| 236 | #if (PTRS_PER_PMD == 1) | ||
| 237 | /* Non-PAE pgd constructor */ | ||
| 238 | static void pgd_ctor(void *pgd) | ||
| 239 | { | ||
| 240 | unsigned long flags; | ||
| 241 | |||
| 242 | /* !PAE, no pagetable sharing */ | ||
| 243 | memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); | ||
| 244 | |||
| 245 | spin_lock_irqsave(&pgd_lock, flags); | ||
| 246 | |||
| 247 | /* must happen under lock */ | ||
| 248 | clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, | ||
| 249 | swapper_pg_dir + USER_PTRS_PER_PGD, | ||
| 250 | KERNEL_PGD_PTRS); | ||
| 251 | paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, | ||
| 252 | __pa(swapper_pg_dir) >> PAGE_SHIFT, | ||
| 253 | USER_PTRS_PER_PGD, | ||
| 254 | KERNEL_PGD_PTRS); | ||
| 255 | pgd_list_add(pgd); | ||
| 256 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
| 257 | } | ||
| 258 | #else /* PTRS_PER_PMD > 1 */ | ||
| 259 | /* PAE pgd constructor */ | ||
| 260 | static void pgd_ctor(void *pgd) | ||
| 261 | { | ||
| 262 | /* PAE, kernel PMD may be shared */ | ||
| 263 | |||
| 264 | if (SHARED_KERNEL_PMD) { | ||
| 265 | clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, | ||
| 266 | swapper_pg_dir + USER_PTRS_PER_PGD, | ||
| 267 | KERNEL_PGD_PTRS); | ||
| 268 | } else { | ||
| 269 | unsigned long flags; | ||
| 270 | |||
| 271 | memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); | ||
| 272 | spin_lock_irqsave(&pgd_lock, flags); | ||
| 273 | pgd_list_add(pgd); | ||
| 274 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
| 275 | } | ||
| 276 | } | ||
| 277 | #endif /* PTRS_PER_PMD */ | ||
| 278 | |||
| 279 | static void pgd_dtor(void *pgd) | ||
| 280 | { | ||
| 281 | unsigned long flags; /* can be called from interrupt context */ | ||
| 282 | |||
| 283 | if (SHARED_KERNEL_PMD) | ||
| 284 | return; | ||
| 285 | |||
| 286 | paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); | ||
| 287 | spin_lock_irqsave(&pgd_lock, flags); | ||
| 288 | pgd_list_del(pgd); | ||
| 289 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
| 290 | } | ||
| 291 | |||
| 292 | #define UNSHARED_PTRS_PER_PGD \ | ||
| 293 | (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) | ||
| 294 | |||
| 295 | /* If we allocate a pmd for part of the kernel address space, then | ||
| 296 | * make sure it's initialized with the appropriate kernel mappings. | ||
| 297 | * Otherwise use a cached zeroed pmd. */ | ||
| 298 | static pmd_t *pmd_cache_alloc(int idx) | ||
| 299 | { | ||
| 300 | pmd_t *pmd; | ||
| 301 | |||
| 302 | if (idx >= USER_PTRS_PER_PGD) { | ||
| 303 | pmd = (pmd_t *)__get_free_page(GFP_KERNEL); | ||
| 304 | |||
| 305 | if (pmd) | ||
| 306 | memcpy(pmd, | ||
| 307 | (void *)pgd_page_vaddr(swapper_pg_dir[idx]), | ||
| 308 | sizeof(pmd_t) * PTRS_PER_PMD); | ||
| 309 | } else | ||
| 310 | pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); | ||
| 311 | |||
| 312 | return pmd; | ||
| 313 | } | ||
| 314 | |||
| 315 | static void pmd_cache_free(pmd_t *pmd, int idx) | ||
| 316 | { | ||
| 317 | if (idx >= USER_PTRS_PER_PGD) | ||
| 318 | free_page((unsigned long)pmd); | ||
| 319 | else | ||
| 320 | kmem_cache_free(pmd_cache, pmd); | ||
| 321 | } | ||
| 322 | |||
| 323 | pgd_t *pgd_alloc(struct mm_struct *mm) | ||
| 324 | { | ||
| 325 | int i; | ||
| 326 | pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor); | ||
| 327 | |||
| 328 | if (PTRS_PER_PMD == 1 || !pgd) | ||
| 329 | return pgd; | ||
| 330 | |||
| 331 | for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { | ||
| 332 | pmd_t *pmd = pmd_cache_alloc(i); | ||
| 333 | |||
| 334 | if (!pmd) | ||
| 335 | goto out_oom; | ||
| 336 | |||
| 337 | paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); | ||
| 338 | set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); | ||
| 339 | } | ||
| 340 | return pgd; | ||
| 341 | |||
| 342 | out_oom: | ||
| 343 | for (i--; i >= 0; i--) { | ||
| 344 | pgd_t pgdent = pgd[i]; | ||
| 345 | void* pmd = (void *)__va(pgd_val(pgdent)-1); | ||
| 346 | paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); | ||
| 347 | pmd_cache_free(pmd, i); | ||
| 348 | } | ||
| 349 | quicklist_free(0, pgd_dtor, pgd); | ||
| 350 | return NULL; | ||
| 351 | } | ||
| 352 | |||
| 353 | void pgd_free(pgd_t *pgd) | ||
| 354 | { | ||
| 355 | int i; | ||
| 356 | |||
| 357 | /* in the PAE case user pgd entries are overwritten before usage */ | ||
| 358 | if (PTRS_PER_PMD > 1) | ||
| 359 | for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { | ||
| 360 | pgd_t pgdent = pgd[i]; | ||
| 361 | void* pmd = (void *)__va(pgd_val(pgdent)-1); | ||
| 362 | paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); | ||
| 363 | pmd_cache_free(pmd, i); | ||
| 364 | } | ||
| 365 | /* in the non-PAE case, free_pgtables() clears user pgd entries */ | ||
| 366 | quicklist_free(0, pgd_dtor, pgd); | ||
| 367 | } | ||
| 368 | |||
| 369 | void check_pgt_cache(void) | ||
| 370 | { | ||
| 371 | quicklist_trim(0, pgd_dtor, 25, 16); | ||
| 372 | } | ||
| 373 | |||
