Diffstat (limited to 'arch/x86/mm')
 arch/x86/mm/Makefile          |  11
 arch/x86/mm/discontig_32.c    | 288
 arch/x86/mm/dump_pagetables.c |  12
 arch/x86/mm/fault.c           | 123
 arch/x86/mm/gup.c             | 298
 arch/x86/mm/hugetlbpage.c     |  78
 arch/x86/mm/init_32.c         | 600
 arch/x86/mm/init_64.c         | 817
 arch/x86/mm/ioremap.c         | 103
 arch/x86/mm/k8topology_64.c   |  21
 arch/x86/mm/kmmio.c           | 510
 arch/x86/mm/memtest.c         | 123
 arch/x86/mm/mmio-mod.c        | 517
 arch/x86/mm/numa_64.c         | 109
 arch/x86/mm/pageattr-test.c   |  27
 arch/x86/mm/pageattr.c        | 541
 arch/x86/mm/pat.c             | 612
 arch/x86/mm/pf_in.c           | 489
 arch/x86/mm/pf_in.h           |  39
 arch/x86/mm/pgtable.c         | 199
 arch/x86/mm/pgtable_32.c      | 104
 arch/x86/mm/srat_32.c         | 283
 arch/x86/mm/srat_64.c         |  21
 arch/x86/mm/testmmiotrace.c   |  71
24 files changed, 4612 insertions(+), 1384 deletions(-)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index b7b3e4c7cfc9..dfb932dcf136 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
 obj-y	:= init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-	    pat.o pgtable.o
+	    pat.o pgtable.o gup.o
 
 obj-$(CONFIG_X86_32)		+= pgtable_32.o
 
@@ -8,10 +8,17 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
 
 obj-$(CONFIG_HIGHMEM)		+= highmem_32.o
 
+obj-$(CONFIG_MMIOTRACE_HOOKS)	+= kmmio.o
+obj-$(CONFIG_MMIOTRACE)		+= mmiotrace.o
+mmiotrace-y			:= pf_in.o mmio-mod.o
+obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
+
 ifeq ($(CONFIG_X86_32),y)
 obj-$(CONFIG_NUMA)		+= discontig_32.o
 else
 obj-$(CONFIG_NUMA)		+= numa_64.o
 obj-$(CONFIG_K8_NUMA)		+= k8topology_64.o
-obj-$(CONFIG_ACPI_NUMA)		+= srat_64.o
 endif
+obj-$(CONFIG_ACPI_NUMA)		+= srat_$(BITS).o
+
+obj-$(CONFIG_MEMTEST)		+= memtest.o
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 914ccf983687..847c164725f4 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -38,10 +38,10 @@
 #include <asm/setup.h>
 #include <asm/mmzone.h>
 #include <asm/bios_ebda.h>
+#include <asm/proto.h>
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
-static bootmem_data_t node0_bdata;
 
 /*
  * numa interface - we expect the numa architecture specific code to have
@@ -59,14 +59,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
 /*
  * 4) physnode_map     - the mapping between a pfn and owning node
  * physnode_map keeps track of the physical memory layout of a generic
- * numa node on a 256Mb break (each element of the array will
- * represent 256Mb of memory and will be marked by the node id.  so,
+ * numa node on a 64Mb break (each element of the array will
+ * represent 64Mb of memory and will be marked by the node id.  so,
  * if the first gig is on node 0, and the second gig is on node 1
  * physnode_map will contain:
  *
- *     physnode_map[0-3] = 0;
- *     physnode_map[4-7] = 1;
- *     physnode_map[8- ] = -1;
+ *     physnode_map[0-15] = 0;
+ *     physnode_map[16-31] = 1;
+ *     physnode_map[32- ] = -1;
  */
 s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
 EXPORT_SYMBOL(physnode_map);
@@ -75,15 +75,15 @@ void memory_present(int nid, unsigned long start, unsigned long end)
 {
 	unsigned long pfn;
 
-	printk(KERN_INFO "Node: %d, start_pfn: %ld, end_pfn: %ld\n",
+	printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
 			nid, start, end);
 	printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);
 	printk(KERN_DEBUG "  ");
 	for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
 		physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
-		printk("%ld ", pfn);
+		printk(KERN_CONT "%lx ", pfn);
 	}
-	printk("\n");
+	printk(KERN_CONT "\n");
 }
 
 unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
@@ -99,7 +99,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
 #endif
 
 extern unsigned long find_max_low_pfn(void);
-extern void add_one_highpage_init(struct page *, int, int);
 extern unsigned long highend_pfn, highstart_pfn;
 
 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -117,13 +116,13 @@ static unsigned long kva_pages;
  */
 int __init get_memcfg_numa_flat(void)
 {
-	printk("NUMA - single node, flat memory mode\n");
+	printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
 
-	/* Run the memory configuration and find the top of memory. */
-	propagate_e820_map();
 	node_start_pfn[0] = 0;
 	node_end_pfn[0] = max_pfn;
+	e820_register_active_regions(0, 0, max_pfn);
 	memory_present(0, 0, max_pfn);
+	node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
 
 	/* Indicate there is one node available. */
 	nodes_clear(node_online_map);
@@ -156,24 +155,32 @@ static void __init propagate_e820_map_node(int nid)
  */
 static void __init allocate_pgdat(int nid)
 {
-	if (nid && node_has_online_mem(nid))
+	char buf[16];
+
+	if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
 		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
 	else {
-		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
-		min_low_pfn += PFN_UP(sizeof(pg_data_t));
+		unsigned long pgdat_phys;
+		pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+				 max_pfn_mapped<<PAGE_SHIFT,
+				 sizeof(pg_data_t),
+				 PAGE_SIZE);
+		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
+		memset(buf, 0, sizeof(buf));
+		sprintf(buf, "NODE_DATA %d", nid);
+		reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
 	}
+	printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
+		nid, (unsigned long)NODE_DATA(nid));
 }
 
-#ifdef CONFIG_DISCONTIGMEM
 /*
- * In the discontig memory model, a portion of the kernel virtual area (KVA)
- * is reserved and portions of nodes are mapped using it. This is to allow
- * node-local memory to be allocated for structures that would normally require
- * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers
- * should be prepared to allocate from the bootmem allocator instead. This KVA
- * mechanism is incompatible with SPARSEMEM as it makes assumptions about the
- * layout of memory that are broken if alloc_remap() succeeds for some of the
- * map and fails for others
+ * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel
+ * virtual address space (KVA) is reserved and portions of nodes are mapped
+ * using it. This is to allow node-local memory to be allocated for
+ * structures that would normally require ZONE_NORMAL. The memory is
+ * allocated with alloc_remap() and callers should be prepared to allocate
+ * from the bootmem allocator instead.
  */
 static unsigned long node_remap_start_pfn[MAX_NUMNODES];
 static void *node_remap_end_vaddr[MAX_NUMNODES];
@@ -195,15 +202,19 @@ void *alloc_remap(int nid, unsigned long size)
 	return allocation;
 }
 
-void __init remap_numa_kva(void)
+static void __init remap_numa_kva(void)
 {
 	void *vaddr;
 	unsigned long pfn;
 	int node;
 
 	for_each_online_node(node) {
+		printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
 		for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
 			vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+			printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
+				(unsigned long)vaddr,
+				node_remap_start_pfn[node] + pfn);
 			set_pmd_pfn((ulong) vaddr,
 				node_remap_start_pfn[node] + pfn,
 				PAGE_KERNEL_LARGE);
@@ -215,17 +226,21 @@ static unsigned long calculate_numa_remap_pages(void)
 {
 	int nid;
 	unsigned long size, reserve_pages = 0;
-	unsigned long pfn;
 
 	for_each_online_node(nid) {
-		unsigned old_end_pfn = node_end_pfn[nid];
+		u64 node_kva_target;
+		u64 node_kva_final;
 
 		/*
 		 * The acpi/srat node info can show hot-add memroy zones
 		 * where memory could be added but not currently present.
 		 */
+		printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
+			nid, node_start_pfn[nid], node_end_pfn[nid]);
 		if (node_start_pfn[nid] > max_pfn)
 			continue;
+		if (!node_end_pfn[nid])
+			continue;
 		if (node_end_pfn[nid] > max_pfn)
 			node_end_pfn[nid] = max_pfn;
 
@@ -237,41 +252,48 @@
 		/* now the roundup is correct, convert to PAGE_SIZE pages */
 		size = size * PTRS_PER_PTE;
 
-		/*
-		 * Validate the region we are allocating only contains valid
-		 * pages.
-		 */
-		for (pfn = node_end_pfn[nid] - size;
-		     pfn < node_end_pfn[nid]; pfn++)
-			if (!page_is_ram(pfn))
-				break;
-
-		if (pfn != node_end_pfn[nid])
-			size = 0;
+		node_kva_target = round_down(node_end_pfn[nid] - size,
+						 PTRS_PER_PTE);
+		node_kva_target <<= PAGE_SHIFT;
+		do {
+			node_kva_final = find_e820_area(node_kva_target,
+					((u64)node_end_pfn[nid])<<PAGE_SHIFT,
+					((u64)size)<<PAGE_SHIFT,
+					LARGE_PAGE_BYTES);
+			node_kva_target -= LARGE_PAGE_BYTES;
+		} while (node_kva_final == -1ULL &&
+			 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
+
+		if (node_kva_final == -1ULL)
+			panic("Can not get kva ram\n");
 
-		printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
-				size, nid);
 		node_remap_size[nid] = size;
 		node_remap_offset[nid] = reserve_pages;
 		reserve_pages += size;
-		printk("Shrinking node %d from %ld pages to %ld pages\n",
-			nid, node_end_pfn[nid], node_end_pfn[nid] - size);
-
-		if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
-			/*
-			 * Align node_end_pfn[] and node_remap_start_pfn[] to
-			 * pmd boundary. remap_numa_kva will barf otherwise.
-			 */
-			printk("Shrinking node %d further by %ld pages for proper alignment\n",
-				nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
-			size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
-		}
+		printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
+				  " node %d at %llx\n",
+				size, nid, node_kva_final>>PAGE_SHIFT);
+
+		/*
+		 *  prevent kva address below max_low_pfn want it on system
+		 *  with less memory later.
+		 *  layout will be: KVA address , KVA RAM
+		 *
+		 *  we are supposed to only record the one less then max_low_pfn
+		 *  but we could have some hole in high memory, and it will only
+		 *  check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
+		 *  to use it as free.
+		 *  So reserve_early here, hope we don't run out of that array
+		 */
+		reserve_early(node_kva_final,
+			      node_kva_final+(((u64)size)<<PAGE_SHIFT),
+			      "KVA RAM");
 
-		node_end_pfn[nid] -= size;
-		node_remap_start_pfn[nid] = node_end_pfn[nid];
-		shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
+		node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
+		remove_active_range(nid, node_remap_start_pfn[nid],
+					 node_remap_start_pfn[nid] + size);
 	}
-	printk("Reserving total of %ld pages for numa KVA remap\n",
+	printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
 			reserve_pages);
 	return reserve_pages;
 }
@@ -285,37 +307,16 @@ static void init_remap_allocator(int nid)
 	node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
 		ALIGN(sizeof(pg_data_t), PAGE_SIZE);
 
-	printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
+	printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
 		(ulong) node_remap_start_vaddr[nid],
-		(ulong) pfn_to_kaddr(highstart_pfn
-		   + node_remap_offset[nid] + node_remap_size[nid]));
-}
-#else
-void *alloc_remap(int nid, unsigned long size)
-{
-	return NULL;
-}
-
-static unsigned long calculate_numa_remap_pages(void)
-{
-	return 0;
-}
-
-static void init_remap_allocator(int nid)
-{
-}
-
-void __init remap_numa_kva(void)
-{
+		(ulong) node_remap_end_vaddr[nid]);
 }
-#endif /* CONFIG_DISCONTIGMEM */
 
-extern void setup_bootmem_allocator(void);
-unsigned long __init setup_memory(void)
+void __init initmem_init(unsigned long start_pfn,
+				  unsigned long end_pfn)
 {
 	int nid;
-	unsigned long system_start_pfn, system_max_low_pfn;
-	unsigned long wasted_pages;
+	long kva_target_pfn;
 
 	/*
 	 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -324,109 +325,77 @@ unsigned long __init setup_memory(void)
 	 * this space and use it to adjust the boundary between ZONE_NORMAL
 	 * and ZONE_HIGHMEM.
 	 */
-	get_memcfg_numa();
 
-	kva_pages = calculate_numa_remap_pages();
+	get_memcfg_numa();
 
-	/* partially used pages are not usable - thus round upwards */
-	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
+	kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
 
-	kva_start_pfn = find_max_low_pfn() - kva_pages;
+	kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
+	do {
+		kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
+					max_low_pfn<<PAGE_SHIFT,
+					kva_pages<<PAGE_SHIFT,
+					PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
+		kva_target_pfn -= PTRS_PER_PTE;
+	} while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
 
-#ifdef CONFIG_BLK_DEV_INITRD
-	/* Numa kva area is below the initrd */
-	if (initrd_start)
-		kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
-			- kva_pages;
-#endif
+	if (kva_start_pfn == -1UL)
+		panic("Can not get kva space\n");
 
-	/*
-	 * We waste pages past at the end of the KVA for no good reason other
-	 * than how it is located. This is bad.
-	 */
-	wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
-	kva_start_pfn -= wasted_pages;
-	kva_pages += wasted_pages;
-
-	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
-	printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
+	printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
 		kva_start_pfn, max_low_pfn);
-	printk("max_pfn = %ld\n", max_pfn);
+	printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
+
+	/* avoid clash with initrd */
+	reserve_early(kva_start_pfn<<PAGE_SHIFT,
+		      (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
+		      "KVA PG");
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
-	if (max_pfn > system_max_low_pfn)
-		highstart_pfn = system_max_low_pfn;
+	if (max_pfn > max_low_pfn)
+		highstart_pfn = max_low_pfn;
 	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
 	       pages_to_mb(highend_pfn - highstart_pfn));
 	num_physpages = highend_pfn;
 	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
-	num_physpages = system_max_low_pfn;
-	high_memory = (void *) __va(system_max_low_pfn * PAGE_SIZE - 1) + 1;
+	num_physpages = max_low_pfn;
+	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
 	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
-			pages_to_mb(system_max_low_pfn));
-	printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
-			min_low_pfn, max_low_pfn, highstart_pfn);
+			pages_to_mb(max_low_pfn));
+	printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
+			max_low_pfn, highstart_pfn);
 
-	printk("Low memory ends at vaddr %08lx\n",
+	printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
 			(ulong) pfn_to_kaddr(max_low_pfn));
 	for_each_online_node(nid) {
 		init_remap_allocator(nid);
 
 		allocate_pgdat(nid);
 	}
-	printk("High memory starts at vaddr %08lx\n",
+	remap_numa_kva();
+
+	printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
 			(ulong) pfn_to_kaddr(highstart_pfn));
 	for_each_online_node(nid)
 		propagate_e820_map_node(nid);
 
-	memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
-	NODE_DATA(0)->bdata = &node0_bdata;
-	setup_bootmem_allocator();
-	return max_low_pfn;
-}
-
-void __init numa_kva_reserve(void)
-{
-	if (kva_pages)
-		reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
-				BOOTMEM_DEFAULT);
-}
-
-void __init zone_sizes_init(void)
-{
-	int nid;
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-	max_zone_pfns[ZONE_DMA] =
-		virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-	max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
-	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-#endif
-
-	/* If SRAT has not registered memory, register it now */
-	if (find_max_pfn_with_active_regions() == 0) {
-		for_each_online_node(nid) {
-			if (node_has_online_mem(nid))
-				add_active_range(nid, node_start_pfn[nid],
-						node_end_pfn[nid]);
-		}
-	}
+	for_each_online_node(nid)
+		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 
-	free_area_init_nodes(max_zone_pfns);
-	return;
+	NODE_DATA(0)->bdata = &bootmem_node_data[0];
+	setup_bootmem_allocator();
 }
 
-void __init set_highmem_pages_init(int bad_ppro)
+void __init set_highmem_pages_init(void)
 {
 #ifdef CONFIG_HIGHMEM
 	struct zone *zone;
-	struct page *page;
+	int nid;
 
 	for_each_zone(zone) {
-		unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
+		unsigned long zone_start_pfn, zone_end_pfn;
 
 		if (!is_highmem(zone))
 			continue;
@@ -434,16 +403,12 @@ void __init set_highmem_pages_init(int bad_ppro)
 		zone_start_pfn = zone->zone_start_pfn;
 		zone_end_pfn = zone_start_pfn + zone->spanned_pages;
 
-		printk("Initializing %s for node %d (%08lx:%08lx)\n",
-				zone->name, zone_to_nid(zone),
-				zone_start_pfn, zone_end_pfn);
+		nid = zone_to_nid(zone);
+		printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
+				zone->name, nid, zone_start_pfn, zone_end_pfn);
 
-		for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
-			if (!pfn_valid(node_pfn))
-				continue;
-			page = pfn_to_page(node_pfn);
-			add_one_highpage_init(page, node_pfn, bad_ppro);
-		}
+		add_highpages_with_active_regions(nid, zone_start_pfn,
+				 zone_end_pfn);
 	}
 	totalram_pages += totalhigh_pages;
 #endif
@@ -476,3 +441,4 @@ int memory_add_physaddr_to_nid(u64 addr)
 
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #endif
+
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 2c24bea92c66..e7277cbcfb40 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -42,7 +42,7 @@ static struct addr_marker address_markers[] = {
 	{ 0, "User Space" },
 #ifdef CONFIG_X86_64
 	{ 0x8000000000000000UL, "Kernel Space" },
-	{ 0xffff810000000000UL, "Low Kernel Mapping" },
+	{ PAGE_OFFSET,		"Low Kernel Mapping" },
 	{ VMALLOC_START,        "vmalloc() Area" },
 	{ VMEMMAP_START,        "Vmemmap" },
 	{ __START_KERNEL_map,   "High Kernel Mapping" },
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 	 * we have now. "break" is either changing perms, levels or
 	 * address space marker.
 	 */
-	prot = pgprot_val(new_prot) & ~(PTE_MASK);
-	cur = pgprot_val(st->current_prot) & ~(PTE_MASK);
+	prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
+	cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
 
 	if (!st->level) {
 		/* First entry */
@@ -221,7 +221,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
 	for (i = 0; i < PTRS_PER_PMD; i++) {
 		st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
 		if (!pmd_none(*start)) {
-			pgprotval_t prot = pmd_val(*start) & ~PTE_MASK;
+			pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK;
 
 			if (pmd_large(*start) || !pmd_present(*start))
 				note_page(m, st, __pgprot(prot), 3);
@@ -253,7 +253,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
 	for (i = 0; i < PTRS_PER_PUD; i++) {
 		st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
 		if (!pud_none(*start)) {
-			pgprotval_t prot = pud_val(*start) & ~PTE_MASK;
+			pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK;
 
 			if (pud_large(*start) || !pud_present(*start))
 				note_page(m, st, __pgprot(prot), 2);
@@ -288,7 +288,7 @@ static void walk_pgd_level(struct seq_file *m)
 	for (i = 0; i < PTRS_PER_PGD; i++) {
 		st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
 		if (!pgd_none(*start)) {
-			pgprotval_t prot = pgd_val(*start) & ~PTE_MASK;
+			pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK;
 
 			if (pgd_large(*start) || !pgd_present(*start))
 				note_page(m, &st, __pgprot(prot), 1);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8bcb6f40ccb6..a742d753d5b0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -10,6 +10,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/ptrace.h>
+#include <linux/mmiotrace.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
@@ -34,6 +35,7 @@
 #include <asm/tlbflush.h>
 #include <asm/proto.h>
 #include <asm-generic/sections.h>
+#include <asm/traps.h>
 
 /*
  * Page fault error code bits
@@ -49,17 +51,23 @@
 #define PF_RSVD		(1<<3)
 #define PF_INSTR	(1<<4)
 
+static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
+{
+#ifdef CONFIG_MMIOTRACE_HOOKS
+	if (unlikely(is_kmmio_active()))
+		if (kmmio_handler(regs, addr) == 1)
+			return -1;
+#endif
+	return 0;
+}
+
 static inline int notify_page_fault(struct pt_regs *regs)
 {
 #ifdef CONFIG_KPROBES
 	int ret = 0;
 
 	/* kprobe_running() needs smp_processor_id() */
-#ifdef CONFIG_X86_32
 	if (!user_mode_vm(regs)) {
-#else
-	if (!user_mode(regs)) {
-#endif
 		preempt_disable();
 		if (kprobe_running() && kprobe_fault_handler(regs, 14))
 			ret = 1;
@@ -350,8 +358,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
 	return 0;
 }
 
-void do_invalid_op(struct pt_regs *, unsigned long);
-
 static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
 {
 #ifdef CONFIG_X86_F00F_BUG
@@ -396,11 +402,7 @@ static void show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		printk(KERN_CONT "NULL pointer dereference");
 	else
 		printk(KERN_CONT "paging request");
-#ifdef CONFIG_X86_32
-	printk(KERN_CONT " at %08lx\n", address);
-#else
-	printk(KERN_CONT " at %016lx\n", address);
-#endif
+	printk(KERN_CONT " at %p\n", (void *) address);
 	printk(KERN_ALERT "IP:");
 	printk_address(regs->ip, 1);
 	dump_pagetable(address);
@@ -606,6 +608,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 
 	if (notify_page_fault(regs))
 		return;
+	if (unlikely(kmmio_fault(regs, address)))
+		return;
 
 	/*
 	 * We fault-in kernel-space virtual memory on-demand. The
@@ -800,14 +804,10 @@ bad_area_nosemaphore:
 	if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
 	    printk_ratelimit()) {
 		printk(
-#ifdef CONFIG_X86_32
-		"%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx",
-#else
-		"%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx",
-#endif
+		"%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
 		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
-		tsk->comm, task_pid_nr(tsk), address, regs->ip,
-		regs->sp, error_code);
+		tsk->comm, task_pid_nr(tsk), address,
+		(void *) regs->ip, (void *) regs->sp, error_code);
 		print_vma_addr(" in ", regs->ip);
 		printk("\n");
 	}
@@ -914,72 +914,45 @@ LIST_HEAD(pgd_list);
 
 void vmalloc_sync_all(void)
 {
-#ifdef CONFIG_X86_32
-	/*
-	 * Note that races in the updates of insync and start aren't
-	 * problematic: insync can only get set bits added, and updates to
-	 * start are only improving performance (without affecting correctness
-	 * if undone).
-	 */
-	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-	static unsigned long start = TASK_SIZE;
 	unsigned long address;
 
+#ifdef CONFIG_X86_32
 	if (SHARED_KERNEL_PMD)
 		return;
 
-	BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
-	for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
-		if (!test_bit(pgd_index(address), insync)) {
-			unsigned long flags;
-			struct page *page;
-
-			spin_lock_irqsave(&pgd_lock, flags);
-			list_for_each_entry(page, &pgd_list, lru) {
-				if (!vmalloc_sync_one(page_address(page),
-						      address))
-					break;
-			}
-			spin_unlock_irqrestore(&pgd_lock, flags);
-			if (!page)
-				set_bit(pgd_index(address), insync);
-		}
-		if (address == start && test_bit(pgd_index(address), insync))
-			start = address + PGDIR_SIZE;
+	for (address = VMALLOC_START & PMD_MASK;
+	     address >= TASK_SIZE && address < FIXADDR_TOP;
+	     address += PMD_SIZE) {
+		unsigned long flags;
+		struct page *page;
+
+		spin_lock_irqsave(&pgd_lock, flags);
+		list_for_each_entry(page, &pgd_list, lru) {
+			if (!vmalloc_sync_one(page_address(page),
+					      address))
+				break;
+		}
+		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
 #else /* CONFIG_X86_64 */
-	/*
-	 * Note that races in the updates of insync and start aren't
-	 * problematic: insync can only get set bits added, and updates to
-	 * start are only improving performance (without affecting correctness
-	 * if undone).
-	 */
-	static DECLARE_BITMAP(insync, PTRS_PER_PGD);
-	static unsigned long start = VMALLOC_START & PGDIR_MASK;
-	unsigned long address;
-
-	for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
-		if (!test_bit(pgd_index(address), insync)) {
-			const pgd_t *pgd_ref = pgd_offset_k(address);
-			unsigned long flags;
-			struct page *page;
-
-			if (pgd_none(*pgd_ref))
-				continue;
-			spin_lock_irqsave(&pgd_lock, flags);
-			list_for_each_entry(page, &pgd_list, lru) {
-				pgd_t *pgd;
-				pgd = (pgd_t *)page_address(page) + pgd_index(address);
-				if (pgd_none(*pgd))
-					set_pgd(pgd, *pgd_ref);
-				else
-					BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-			}
-			spin_unlock_irqrestore(&pgd_lock, flags);
-			set_bit(pgd_index(address), insync);
-		}
-		if (address == start)
-			start = address + PGDIR_SIZE;
+	for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
+	     address += PGDIR_SIZE) {
+		const pgd_t *pgd_ref = pgd_offset_k(address);
+		unsigned long flags;
+		struct page *page;
+
+		if (pgd_none(*pgd_ref))
+			continue;
+		spin_lock_irqsave(&pgd_lock, flags);
+		list_for_each_entry(page, &pgd_list, lru) {
+			pgd_t *pgd;
+			pgd = (pgd_t *)page_address(page) + pgd_index(address);
+			if (pgd_none(*pgd))
+				set_pgd(pgd, *pgd_ref);
+			else
+				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
+		}
+		spin_unlock_irqrestore(&pgd_lock, flags);
 	}
 #endif
 }
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..007bb06c7504
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,298 @@
+/*
+ * Lockless get_user_pages_fast for x86
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/highmem.h>
+
+#include <asm/pgtable.h>
+
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+#ifndef CONFIG_X86_PAE
+	return *ptep;
+#else
+	/*
+	 * With get_user_pages_fast, we walk down the pagetables without taking
+	 * any locks.  For this we would like to load the pointers atoimcally,
+	 * but that is not possible (without expensive cmpxchg8b) on PAE.  What
+	 * we do have is the guarantee that a pte will only either go from not
+	 * present to present, or present to not present or both -- it will not
+	 * switch to a completely different present page without a TLB flush in
+	 * between; something that we are blocking by holding interrupts off.
+	 *
+	 * Setting ptes from not present to present goes:
+	 * ptep->pte_high = h;
+	 * smp_wmb();
+	 * ptep->pte_low = l;
+	 *
+	 * And present to not present goes:
+	 * ptep->pte_low = 0;
+	 * smp_wmb();
+	 * ptep->pte_high = 0;
+	 *
+	 * We must ensure here that the load of pte_low sees l iff pte_high
+	 * sees h. We load pte_high *after* loading pte_low, which ensures we
+	 * don't see an older value of pte_high.  *Then* we recheck pte_low,
+	 * which ensures that we haven't picked up a changed pte high. We might
+	 * have got rubbish values from pte_low and pte_high, but we are
+	 * guaranteed that pte_low will not have the present bit set *unless*
+	 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
+	 * we're safe.
+	 *
+	 * gup_get_pte should not be used or copied outside gup.c without being
+	 * very careful -- it does not atomically load the pte or anything that
+	 * is likely to be useful for you.
+	 */
+	pte_t pte;
+
+retry:
+	pte.pte_low = ptep->pte_low;
+	smp_rmb();
+	pte.pte_high = ptep->pte_high;
+	smp_rmb();
+	if (unlikely(pte.pte_low != ptep->pte_low))
+		goto retry;
+
+	return pte;
+#endif
+}
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t *ptep;
+
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+
+	ptep = pte_offset_map(&pmd, addr);
+	do {
+		pte_t pte = gup_get_pte(ptep);
+		struct page *page;
+
+		if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+			pte_unmap(ptep);
+			return 0;
+		}
+		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
+		get_page(page);
+		pages[*nr] = page;
+		(*nr)++;
+
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+	pte_unmap(ptep - 1);
+
+	return 1;
+}
+
+static inline void get_head_page_multiple(struct page *page, int nr)
+{
+	VM_BUG_ON(page != compound_head(page));
+	VM_BUG_ON(page_count(page) == 0);
+	atomic_add(nr, &page->_count);
+}
+
+static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t pte = *(pte_t *)&pmd;
+	struct page *head, *page;
+	int refs;
+
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+	if ((pte_val(pte) & mask) != mask)
+		return 0;
+	/* hugepages are never "special" */
+	VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+	get_head_page_multiple(head, refs);
+
+	return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+		int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pmd_t *pmdp;
+
+	pmdp = pmd_offset(&pud, addr);
+	do {
+		pmd_t pmd = *pmdp;
+
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(pmd))
+			return 0;
+		if (unlikely(pmd_large(pmd))) {
+			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+				return 0;
+		} else {
+			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pmdp++, addr = next, addr != end);
+
+	return 1;
+}
+
+static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
+		unsigned long end, int write, struct page **pages, int *nr)
+{
+	unsigned long mask;
+	pte_t pte = *(pte_t *)&pud;
+	struct page *head, *page;
+	int refs;
+
+	mask = _PAGE_PRESENT|_PAGE_USER;
+	if (write)
+		mask |= _PAGE_RW;
+	if ((pte_val(pte) & mask) != mask)
+		return 0;
+	/* hugepages are never "special" */
+	VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+	refs = 0;
+	head = pte_page(pte);
+	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+	do {
+		VM_BUG_ON(compound_head(page) != head);
+		pages[*nr] = page;
+		(*nr)++;
+		page++;
+		refs++;
+	} while (addr += PAGE_SIZE, addr != end);
+	get_head_page_multiple(head, refs);
+
+	return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+			int write, struct page **pages, int *nr)
+{
+	unsigned long next;
+	pud_t *pudp;
+
+	pudp = pud_offset(&pgd, addr);
+	do {
+		pud_t pud = *pudp;
+
+		next = pud_addr_end(addr, end);
+		if (pud_none(pud))
+			return 0;
+		if (unlikely(pud_large(pud))) {
+			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
+				return 0;
+		} else {
+			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+				return 0;
+		}
+	} while (pudp++, addr = next, addr != end);
+
+	return 1;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+			struct page **pages)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long addr, len, end;
+	unsigned long next;
+	pgd_t *pgdp;
+	int nr = 0;
+
+	start &= PAGE_MASK;
+	addr = start;
+	len = (unsigned long) nr_pages << PAGE_SHIFT;
+	end = start + len;
+	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+					start, len)))
+		goto slow_irqon;
+
+	/*
+	 * XXX: batch / limit 'nr', to avoid large irq off latency
+	 * needs some instrumenting to determine the common sizes used by
+	 * important workloads (eg. DB2), and whether limiting the batch size
+	 * will decrease performance.
+	 *
+	 * It seems like we're in the clear for the moment. Direct-IO is
+	 * the main guy that batches up lots of get_user_pages, and even
+	 * they are limited to 64-at-a-time which is not so many.
+	 */
+	/*
+	 * This doesn't prevent pagetable teardown, but does prevent
+	 * the pagetables and pages from being freed on x86.
+	 *
+	 * So long as we atomically load page table pointers versus teardown
+	 * (which we do on x86, with the above PAE exception), we can follow the
+	 * address down to the the page and take a ref on it.
+	 */
+	local_irq_disable();
+	pgdp = pgd_offset(mm, addr);
+	do {
+		pgd_t pgd = *pgdp;
+
+		next = pgd_addr_end(addr, end);
+		if (pgd_none(pgd))
+			goto slow;
+		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+			goto slow;
+	} while (pgdp++, addr = next, addr != end);
+	local_irq_enable();
+
+	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+	return nr;
+
+	{
+		int ret;
+
+slow:
+		local_irq_enable();
+slow_irqon:
+		/* Try to get the remaining pages with get_user_pages */
+		start += nr << PAGE_SHIFT;
+		pages += nr;
+
+		down_read(&mm->mmap_sem);
+		ret = get_user_pages(current, mm, start,
+			(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+		up_read(&mm->mmap_sem);
+
+		/* Have to be a bit careful with return values */
+		if (nr > 0) {
+			if (ret < 0)
+				ret = nr;
+			else
+				ret += nr;
+		}
+
+		return ret;
+	}
+}
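
Note: the new gup.c above provides the get_user_pages_fast() entry point, which pins present user pages with interrupts disabled and falls back to the classic get_user_pages() path when a page is not already mapped. As orientation only (not part of the patch), a minimal sketch of a hypothetical in-kernel caller is shown below; the helper name and error handling are illustrative, but the pin/put_page pattern is how the API is consumed:

	/* Illustrative caller sketch, assuming a kernel context that may sleep. */
	#include <linux/mm.h>
	#include <linux/slab.h>

	static int example_pin_user_buffer(unsigned long uaddr, int nr_pages, int write)
	{
		struct page **pages;
		int i, pinned;

		pages = kmalloc(nr_pages * sizeof(*pages), GFP_KERNEL);
		if (!pages)
			return -ENOMEM;

		/* May internally fall back to the slow get_user_pages() path. */
		pinned = get_user_pages_fast(uaddr, nr_pages, write, pages);

		for (i = 0; i < pinned; i++) {
			/* ... hand pages[i] to DMA, a bio, etc. ... */
			put_page(pages[i]);	/* drop the reference taken above */
		}

		kfree(pages);
		return pinned < 0 ? pinned : 0;
	}
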
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 0b3d567e686d..8f307d914c2e 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c | |||
@@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | |||
124 | return 1; | 124 | return 1; |
125 | } | 125 | } |
126 | 126 | ||
127 | pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) | 127 | pte_t *huge_pte_alloc(struct mm_struct *mm, |
128 | unsigned long addr, unsigned long sz) | ||
128 | { | 129 | { |
129 | pgd_t *pgd; | 130 | pgd_t *pgd; |
130 | pud_t *pud; | 131 | pud_t *pud; |
@@ -133,9 +134,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) | |||
133 | pgd = pgd_offset(mm, addr); | 134 | pgd = pgd_offset(mm, addr); |
134 | pud = pud_alloc(mm, pgd, addr); | 135 | pud = pud_alloc(mm, pgd, addr); |
135 | if (pud) { | 136 | if (pud) { |
136 | if (pud_none(*pud)) | 137 | if (sz == PUD_SIZE) { |
137 | huge_pmd_share(mm, addr, pud); | 138 | pte = (pte_t *)pud; |
138 | pte = (pte_t *) pmd_alloc(mm, pud, addr); | 139 | } else { |
140 | BUG_ON(sz != PMD_SIZE); | ||
141 | if (pud_none(*pud)) | ||
142 | huge_pmd_share(mm, addr, pud); | ||
143 | pte = (pte_t *) pmd_alloc(mm, pud, addr); | ||
144 | } | ||
139 | } | 145 | } |
140 | BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); | 146 | BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); |
141 | 147 | ||
@@ -151,8 +157,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | |||
151 | pgd = pgd_offset(mm, addr); | 157 | pgd = pgd_offset(mm, addr); |
152 | if (pgd_present(*pgd)) { | 158 | if (pgd_present(*pgd)) { |
153 | pud = pud_offset(pgd, addr); | 159 | pud = pud_offset(pgd, addr); |
154 | if (pud_present(*pud)) | 160 | if (pud_present(*pud)) { |
161 | if (pud_large(*pud)) | ||
162 | return (pte_t *)pud; | ||
155 | pmd = pmd_offset(pud, addr); | 163 | pmd = pmd_offset(pud, addr); |
164 | } | ||
156 | } | 165 | } |
157 | return (pte_t *) pmd; | 166 | return (pte_t *) pmd; |
158 | } | 167 | } |
@@ -188,6 +197,11 @@ int pmd_huge(pmd_t pmd) | |||
188 | return 0; | 197 | return 0; |
189 | } | 198 | } |
190 | 199 | ||
200 | int pud_huge(pud_t pud) | ||
201 | { | ||
202 | return 0; | ||
203 | } | ||
204 | |||
191 | struct page * | 205 | struct page * |
192 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | 206 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, |
193 | pmd_t *pmd, int write) | 207 | pmd_t *pmd, int write) |
@@ -208,6 +222,11 @@ int pmd_huge(pmd_t pmd) | |||
208 | return !!(pmd_val(pmd) & _PAGE_PSE); | 222 | return !!(pmd_val(pmd) & _PAGE_PSE); |
209 | } | 223 | } |
210 | 224 | ||
225 | int pud_huge(pud_t pud) | ||
226 | { | ||
227 | return !!(pud_val(pud) & _PAGE_PSE); | ||
228 | } | ||
229 | |||
211 | struct page * | 230 | struct page * |
212 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | 231 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, |
213 | pmd_t *pmd, int write) | 232 | pmd_t *pmd, int write) |
@@ -216,9 +235,22 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, | |||
216 | 235 | ||
217 | page = pte_page(*(pte_t *)pmd); | 236 | page = pte_page(*(pte_t *)pmd); |
218 | if (page) | 237 | if (page) |
219 | page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); | 238 | page += ((address & ~PMD_MASK) >> PAGE_SHIFT); |
239 | return page; | ||
240 | } | ||
241 | |||
242 | struct page * | ||
243 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | ||
244 | pud_t *pud, int write) | ||
245 | { | ||
246 | struct page *page; | ||
247 | |||
248 | page = pte_page(*(pte_t *)pud); | ||
249 | if (page) | ||
250 | page += ((address & ~PUD_MASK) >> PAGE_SHIFT); | ||
220 | return page; | 251 | return page; |
221 | } | 252 | } |
253 | |||
222 | #endif | 254 | #endif |
223 | 255 | ||
224 | /* x86_64 also uses this file */ | 256 | /* x86_64 also uses this file */ |
@@ -228,6 +260,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, | |||
228 | unsigned long addr, unsigned long len, | 260 | unsigned long addr, unsigned long len, |
229 | unsigned long pgoff, unsigned long flags) | 261 | unsigned long pgoff, unsigned long flags) |
230 | { | 262 | { |
263 | struct hstate *h = hstate_file(file); | ||
231 | struct mm_struct *mm = current->mm; | 264 | struct mm_struct *mm = current->mm; |
232 | struct vm_area_struct *vma; | 265 | struct vm_area_struct *vma; |
233 | unsigned long start_addr; | 266 | unsigned long start_addr; |
@@ -240,7 +273,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, | |||
240 | } | 273 | } |
241 | 274 | ||
242 | full_search: | 275 | full_search: |
243 | addr = ALIGN(start_addr, HPAGE_SIZE); | 276 | addr = ALIGN(start_addr, huge_page_size(h)); |
244 | 277 | ||
245 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | 278 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { |
246 | /* At this point: (!vma || addr < vma->vm_end). */ | 279 | /* At this point: (!vma || addr < vma->vm_end). */ |
@@ -262,7 +295,7 @@ full_search: | |||
262 | } | 295 | } |
263 | if (addr + mm->cached_hole_size < vma->vm_start) | 296 | if (addr + mm->cached_hole_size < vma->vm_start) |
264 | mm->cached_hole_size = vma->vm_start - addr; | 297 | mm->cached_hole_size = vma->vm_start - addr; |
265 | addr = ALIGN(vma->vm_end, HPAGE_SIZE); | 298 | addr = ALIGN(vma->vm_end, huge_page_size(h)); |
266 | } | 299 | } |
267 | } | 300 | } |
268 | 301 | ||
@@ -270,6 +303,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, | |||
270 | unsigned long addr0, unsigned long len, | 303 | unsigned long addr0, unsigned long len, |
271 | unsigned long pgoff, unsigned long flags) | 304 | unsigned long pgoff, unsigned long flags) |
272 | { | 305 | { |
306 | struct hstate *h = hstate_file(file); | ||
273 | struct mm_struct *mm = current->mm; | 307 | struct mm_struct *mm = current->mm; |
274 | struct vm_area_struct *vma, *prev_vma; | 308 | struct vm_area_struct *vma, *prev_vma; |
275 | unsigned long base = mm->mmap_base, addr = addr0; | 309 | unsigned long base = mm->mmap_base, addr = addr0; |
@@ -290,7 +324,7 @@ try_again: | |||
290 | goto fail; | 324 | goto fail; |
291 | 325 | ||
292 | /* either no address requested or cant fit in requested address hole */ | 326 | /* either no address requested or cant fit in requested address hole */ |
293 | addr = (mm->free_area_cache - len) & HPAGE_MASK; | 327 | addr = (mm->free_area_cache - len) & huge_page_mask(h); |
294 | do { | 328 | do { |
295 | /* | 329 | /* |
296 | * Lookup failure means no vma is above this address, | 330 | * Lookup failure means no vma is above this address, |
@@ -321,7 +355,7 @@ try_again: | |||
321 | largest_hole = vma->vm_start - addr; | 355 | largest_hole = vma->vm_start - addr; |
322 | 356 | ||
323 | /* try just below the current vma->vm_start */ | 357 | /* try just below the current vma->vm_start */ |
324 | addr = (vma->vm_start - len) & HPAGE_MASK; | 358 | addr = (vma->vm_start - len) & huge_page_mask(h); |
325 | } while (len <= vma->vm_start); | 359 | } while (len <= vma->vm_start); |
326 | 360 | ||
327 | fail: | 361 | fail: |
@@ -359,22 +393,23 @@ unsigned long | |||
359 | hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | 393 | hugetlb_get_unmapped_area(struct file *file, unsigned long addr, |
360 | unsigned long len, unsigned long pgoff, unsigned long flags) | 394 | unsigned long len, unsigned long pgoff, unsigned long flags) |
361 | { | 395 | { |
396 | struct hstate *h = hstate_file(file); | ||
362 | struct mm_struct *mm = current->mm; | 397 | struct mm_struct *mm = current->mm; |
363 | struct vm_area_struct *vma; | 398 | struct vm_area_struct *vma; |
364 | 399 | ||
365 | if (len & ~HPAGE_MASK) | 400 | if (len & ~huge_page_mask(h)) |
366 | return -EINVAL; | 401 | return -EINVAL; |
367 | if (len > TASK_SIZE) | 402 | if (len > TASK_SIZE) |
368 | return -ENOMEM; | 403 | return -ENOMEM; |
369 | 404 | ||
370 | if (flags & MAP_FIXED) { | 405 | if (flags & MAP_FIXED) { |
371 | if (prepare_hugepage_range(addr, len)) | 406 | if (prepare_hugepage_range(file, addr, len)) |
372 | return -EINVAL; | 407 | return -EINVAL; |
373 | return addr; | 408 | return addr; |
374 | } | 409 | } |
375 | 410 | ||
376 | if (addr) { | 411 | if (addr) { |
377 | addr = ALIGN(addr, HPAGE_SIZE); | 412 | addr = ALIGN(addr, huge_page_size(h)); |
378 | vma = find_vma(mm, addr); | 413 | vma = find_vma(mm, addr); |
379 | if (TASK_SIZE - len >= addr && | 414 | if (TASK_SIZE - len >= addr && |
380 | (!vma || addr + len <= vma->vm_start)) | 415 | (!vma || addr + len <= vma->vm_start)) |
@@ -390,3 +425,20 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | |||
390 | 425 | ||
391 | #endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ | 426 | #endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ |
392 | 427 | ||
428 | #ifdef CONFIG_X86_64 | ||
429 | static __init int setup_hugepagesz(char *opt) | ||
430 | { | ||
431 | unsigned long ps = memparse(opt, &opt); | ||
432 | if (ps == PMD_SIZE) { | ||
433 | hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); | ||
434 | } else if (ps == PUD_SIZE && cpu_has_gbpages) { | ||
435 | hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); | ||
436 | } else { | ||
437 | printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", | ||
438 | ps >> 20); | ||
439 | return 0; | ||
440 | } | ||
441 | return 1; | ||
442 | } | ||
443 | __setup("hugepagesz=", setup_hugepagesz); | ||
444 | #endif | ||
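
The setup_hugepagesz() handler added above only has to turn a size string into bytes and check it against the two sizes the hardware can offer: 2 MB PMD pages always, and 1 GB PUD pages only when the CPU has the GB-pages feature. A minimal userspace sketch of that logic follows; parse_size() is a simplified stand-in for the kernel's memparse(), and the size constants are assumed for an x86-64 layout with 4 KB base pages, so treat this as an illustration rather than kernel code.

/* Userspace sketch of the hugepagesz= parsing and validation logic. */
#include <stdio.h>
#include <stdlib.h>

#define PMD_SIZE (2UL << 20)   /* 2 MB huge page */
#define PUD_SIZE (1UL << 30)   /* 1 GB huge page, needs the GB-pages feature */

/* Simplified stand-in for the kernel's memparse(): number + optional suffix. */
static unsigned long parse_size(const char *s)
{
        char *end;
        unsigned long val = strtoul(s, &end, 0);

        switch (*end) {
        case 'G': case 'g': val <<= 30; break;
        case 'M': case 'm': val <<= 20; break;
        case 'K': case 'k': val <<= 10; break;
        }
        return val;
}

static int hugepagesz_supported(unsigned long ps, int cpu_has_gbpages)
{
        if (ps == PMD_SIZE)
                return 1;
        if (ps == PUD_SIZE && cpu_has_gbpages)
                return 1;
        return 0;
}

int main(void)
{
        const char *opts[] = { "2M", "1G", "4M" };

        for (int i = 0; i < 3; i++) {
                unsigned long ps = parse_size(opts[i]);
                printf("hugepagesz=%s -> %lu bytes, %s\n", opts[i], ps,
                       hugepagesz_supported(ps, 1) ? "accepted" : "rejected");
        }
        return 0;
}

Booting with hugepagesz=1G on a CPU without GB pages would take the "rejected" branch above, which corresponds to the printk error path in the patch.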
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ec30d10154b6..bbe044dbe014 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/cpumask.h> | 31 | #include <linux/cpumask.h> |
32 | 32 | ||
33 | #include <asm/asm.h> | 33 | #include <asm/asm.h> |
34 | #include <asm/bios_ebda.h> | ||
34 | #include <asm/processor.h> | 35 | #include <asm/processor.h> |
35 | #include <asm/system.h> | 36 | #include <asm/system.h> |
36 | #include <asm/uaccess.h> | 37 | #include <asm/uaccess.h> |
@@ -47,9 +48,11 @@ | |||
47 | #include <asm/paravirt.h> | 48 | #include <asm/paravirt.h> |
48 | #include <asm/setup.h> | 49 | #include <asm/setup.h> |
49 | #include <asm/cacheflush.h> | 50 | #include <asm/cacheflush.h> |
51 | #include <asm/smp.h> | ||
50 | 52 | ||
51 | unsigned int __VMALLOC_RESERVE = 128 << 20; | 53 | unsigned int __VMALLOC_RESERVE = 128 << 20; |
52 | 54 | ||
55 | unsigned long max_low_pfn_mapped; | ||
53 | unsigned long max_pfn_mapped; | 56 | unsigned long max_pfn_mapped; |
54 | 57 | ||
55 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | 58 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); |
@@ -57,6 +60,27 @@ unsigned long highstart_pfn, highend_pfn; | |||
57 | 60 | ||
58 | static noinline int do_test_wp_bit(void); | 61 | static noinline int do_test_wp_bit(void); |
59 | 62 | ||
63 | |||
64 | static unsigned long __initdata table_start; | ||
65 | static unsigned long __meminitdata table_end; | ||
66 | static unsigned long __meminitdata table_top; | ||
67 | |||
68 | static int __initdata after_init_bootmem; | ||
69 | |||
70 | static __init void *alloc_low_page(unsigned long *phys) | ||
71 | { | ||
72 | unsigned long pfn = table_end++; | ||
73 | void *adr; | ||
74 | |||
75 | if (pfn >= table_top) | ||
76 | panic("alloc_low_page: ran out of memory"); | ||
77 | |||
78 | adr = __va(pfn * PAGE_SIZE); | ||
79 | memset(adr, 0, PAGE_SIZE); | ||
80 | *phys = pfn * PAGE_SIZE; | ||
81 | return adr; | ||
82 | } | ||
83 | |||
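
The 32-bit alloc_low_page() introduced above is a plain bump allocator over a physical window reserved before bootmem is up: take the next pfn, panic if the window is exhausted, zero the page, return its virtual address. A rough userspace model of that bookkeeping follows; the table_start/table_end/table_top names mirror the patch, but the backing store here is just heap memory standing in for the already-mapped lowmem, so this is a sketch of the idea only.

/* Userspace model of a bump allocator over a fixed page window. */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096UL

static unsigned char *pool;                               /* stands in for the direct mapping */
static unsigned long table_start, table_end, table_top;   /* window bounds, in "pfns" */

static void *alloc_low_page(unsigned long *phys)
{
        unsigned long pfn = table_end++;
        void *adr;

        /* Out of the pre-reserved window: nothing to fall back to this early. */
        assert(pfn < table_top && "alloc_low_page: ran out of memory");

        adr = pool + (pfn - table_start) * PAGE_SIZE;
        memset(adr, 0, PAGE_SIZE);
        *phys = pfn * PAGE_SIZE;          /* "physical" address of the page */
        return adr;
}

int main(void)
{
        unsigned long phys;

        table_start = 0x100;              /* pretend the window starts at 1 MB */
        table_end = table_start;
        table_top = table_start + 4;      /* room for four page-table pages */
        pool = calloc(4, PAGE_SIZE);

        for (int i = 0; i < 4; i++) {
                void *p = alloc_low_page(&phys);
                printf("page %d at virt %p, phys 0x%lx\n", i, p, phys);
        }
        free(pool);
        return 0;
}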
60 | /* | 84 | /* |
61 | * Creates a middle page table and puts a pointer to it in the | 85 | * Creates a middle page table and puts a pointer to it in the |
62 | * given global directory entry. This only returns the gd entry | 86 | * given global directory entry. This only returns the gd entry |
@@ -68,9 +92,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) | |||
68 | pmd_t *pmd_table; | 92 | pmd_t *pmd_table; |
69 | 93 | ||
70 | #ifdef CONFIG_X86_PAE | 94 | #ifdef CONFIG_X86_PAE |
95 | unsigned long phys; | ||
71 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { | 96 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { |
72 | pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); | 97 | if (after_init_bootmem) |
73 | 98 | pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); | |
99 | else | ||
100 | pmd_table = (pmd_t *)alloc_low_page(&phys); | ||
74 | paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); | 101 | paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); |
75 | set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | 102 | set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); |
76 | pud = pud_offset(pgd, 0); | 103 | pud = pud_offset(pgd, 0); |
@@ -92,12 +119,16 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) | |||
92 | if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { | 119 | if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { |
93 | pte_t *page_table = NULL; | 120 | pte_t *page_table = NULL; |
94 | 121 | ||
122 | if (after_init_bootmem) { | ||
95 | #ifdef CONFIG_DEBUG_PAGEALLOC | 123 | #ifdef CONFIG_DEBUG_PAGEALLOC |
96 | page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); | 124 | page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); |
97 | #endif | 125 | #endif |
98 | if (!page_table) { | 126 | if (!page_table) |
99 | page_table = | 127 | page_table = |
100 | (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); | 128 | (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); |
129 | } else { | ||
130 | unsigned long phys; | ||
131 | page_table = (pte_t *)alloc_low_page(&phys); | ||
101 | } | 132 | } |
102 | 133 | ||
103 | paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); | 134 | paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); |
@@ -155,40 +186,72 @@ static inline int is_kernel_text(unsigned long addr) | |||
155 | * of max_low_pfn pages, by creating page tables starting from address | 186 | * of max_low_pfn pages, by creating page tables starting from address |
156 | * PAGE_OFFSET: | 187 | * PAGE_OFFSET: |
157 | */ | 188 | */ |
158 | static void __init kernel_physical_mapping_init(pgd_t *pgd_base) | 189 | static void __init kernel_physical_mapping_init(pgd_t *pgd_base, |
190 | unsigned long start_pfn, | ||
191 | unsigned long end_pfn, | ||
192 | int use_pse) | ||
159 | { | 193 | { |
160 | int pgd_idx, pmd_idx, pte_ofs; | 194 | int pgd_idx, pmd_idx, pte_ofs; |
161 | unsigned long pfn; | 195 | unsigned long pfn; |
162 | pgd_t *pgd; | 196 | pgd_t *pgd; |
163 | pmd_t *pmd; | 197 | pmd_t *pmd; |
164 | pte_t *pte; | 198 | pte_t *pte; |
199 | unsigned pages_2m, pages_4k; | ||
200 | int mapping_iter; | ||
165 | 201 | ||
166 | pgd_idx = pgd_index(PAGE_OFFSET); | 202 | /* |
167 | pgd = pgd_base + pgd_idx; | 203 | * First iteration will setup identity mapping using large/small pages |
168 | pfn = 0; | 204 | * based on use_pse, with other attributes same as set by |
205 | * the early code in head_32.S | ||
206 | * | ||
207 | * Second iteration will setup the appropriate attributes (NX, GLOBAL..) | ||
208 | * as desired for the kernel identity mapping. | ||
209 | * | ||
210 | * This two pass mechanism conforms to the TLB app note which says: | ||
211 | * | ||
212 | * "Software should not write to a paging-structure entry in a way | ||
213 | * that would change, for any linear address, both the page size | ||
214 | * and either the page frame or attributes." | ||
215 | */ | ||
216 | mapping_iter = 1; | ||
169 | 217 | ||
218 | if (!cpu_has_pse) | ||
219 | use_pse = 0; | ||
220 | |||
221 | repeat: | ||
222 | pages_2m = pages_4k = 0; | ||
223 | pfn = start_pfn; | ||
224 | pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); | ||
225 | pgd = pgd_base + pgd_idx; | ||
170 | for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { | 226 | for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { |
171 | pmd = one_md_table_init(pgd); | 227 | pmd = one_md_table_init(pgd); |
172 | if (pfn >= max_low_pfn) | ||
173 | continue; | ||
174 | 228 | ||
175 | for (pmd_idx = 0; | 229 | if (pfn >= end_pfn) |
176 | pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; | 230 | continue; |
231 | #ifdef CONFIG_X86_PAE | ||
232 | pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); | ||
233 | pmd += pmd_idx; | ||
234 | #else | ||
235 | pmd_idx = 0; | ||
236 | #endif | ||
237 | for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn; | ||
177 | pmd++, pmd_idx++) { | 238 | pmd++, pmd_idx++) { |
178 | unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; | 239 | unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; |
179 | 240 | ||
180 | /* | 241 | /* |
181 | * Map with big pages if possible, otherwise | 242 | * Map with big pages if possible, otherwise |
182 | * create normal page tables: | 243 | * create normal page tables: |
183 | * | ||
184 | * Don't use a large page for the first 2/4MB of memory | ||
185 | * because there are often fixed size MTRRs in there | ||
186 | * and overlapping MTRRs into large pages can cause | ||
187 | * slowdowns. | ||
188 | */ | 244 | */ |
189 | if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) { | 245 | if (use_pse) { |
190 | unsigned int addr2; | 246 | unsigned int addr2; |
191 | pgprot_t prot = PAGE_KERNEL_LARGE; | 247 | pgprot_t prot = PAGE_KERNEL_LARGE; |
248 | /* | ||
249 | * first pass will use the same initial | ||
250 | * identity mapping attribute + _PAGE_PSE. | ||
251 | */ | ||
252 | pgprot_t init_prot = | ||
253 | __pgprot(PTE_IDENT_ATTR | | ||
254 | _PAGE_PSE); | ||
192 | 255 | ||
193 | addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + | 256 | addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + |
194 | PAGE_OFFSET + PAGE_SIZE-1; | 257 | PAGE_OFFSET + PAGE_SIZE-1; |
@@ -197,34 +260,59 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) | |||
197 | is_kernel_text(addr2)) | 260 | is_kernel_text(addr2)) |
198 | prot = PAGE_KERNEL_LARGE_EXEC; | 261 | prot = PAGE_KERNEL_LARGE_EXEC; |
199 | 262 | ||
200 | set_pmd(pmd, pfn_pmd(pfn, prot)); | 263 | pages_2m++; |
264 | if (mapping_iter == 1) | ||
265 | set_pmd(pmd, pfn_pmd(pfn, init_prot)); | ||
266 | else | ||
267 | set_pmd(pmd, pfn_pmd(pfn, prot)); | ||
201 | 268 | ||
202 | pfn += PTRS_PER_PTE; | 269 | pfn += PTRS_PER_PTE; |
203 | max_pfn_mapped = pfn; | ||
204 | continue; | 270 | continue; |
205 | } | 271 | } |
206 | pte = one_page_table_init(pmd); | 272 | pte = one_page_table_init(pmd); |
207 | 273 | ||
208 | for (pte_ofs = 0; | 274 | pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); |
209 | pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; | 275 | pte += pte_ofs; |
276 | for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn; | ||
210 | pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { | 277 | pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { |
211 | pgprot_t prot = PAGE_KERNEL; | 278 | pgprot_t prot = PAGE_KERNEL; |
279 | /* | ||
280 | * first pass will use the same initial | ||
281 | * identity mapping attribute. | ||
282 | */ | ||
283 | pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR); | ||
212 | 284 | ||
213 | if (is_kernel_text(addr)) | 285 | if (is_kernel_text(addr)) |
214 | prot = PAGE_KERNEL_EXEC; | 286 | prot = PAGE_KERNEL_EXEC; |
215 | 287 | ||
216 | set_pte(pte, pfn_pte(pfn, prot)); | 288 | pages_4k++; |
289 | if (mapping_iter == 1) | ||
290 | set_pte(pte, pfn_pte(pfn, init_prot)); | ||
291 | else | ||
292 | set_pte(pte, pfn_pte(pfn, prot)); | ||
217 | } | 293 | } |
218 | max_pfn_mapped = pfn; | ||
219 | } | 294 | } |
220 | } | 295 | } |
221 | } | 296 | if (mapping_iter == 1) { |
297 | /* | ||
298 | * update direct mapping page count only in the first | ||
299 | * iteration. | ||
300 | */ | ||
301 | update_page_count(PG_LEVEL_2M, pages_2m); | ||
302 | update_page_count(PG_LEVEL_4K, pages_4k); | ||
222 | 303 | ||
223 | static inline int page_kills_ppro(unsigned long pagenr) | 304 | /* |
224 | { | 305 | * local global flush tlb, which will flush the previous |
225 | if (pagenr >= 0x70000 && pagenr <= 0x7003F) | 306 | * mappings present in both small and large page TLB's. |
226 | return 1; | 307 | */ |
227 | return 0; | 308 | __flush_tlb_all(); |
309 | |||
310 | /* | ||
311 | * Second iteration will set the actual desired PTE attributes. | ||
312 | */ | ||
313 | mapping_iter = 2; | ||
314 | goto repeat; | ||
315 | } | ||
228 | } | 316 | } |
229 | 317 | ||
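
Because the rewritten mapper above can start at an arbitrary pfn rather than at pfn 0, it derives the pgd/pmd/pte indices from the first address of each range instead of beginning every index at zero. That arithmetic is easy to check in isolation; the sketch below assumes the classic 32-bit non-PAE layout (two-level paging, PAGE_OFFSET at 0xC0000000), with the constants restated purely for illustration. The surrounding two-pass repeat loop is a separate concern: it exists so that no entry ever changes page size and frame/attributes in one write, per the TLB application note quoted in the comment.

/* Index arithmetic for mapping a pfn at PAGE_OFFSET on 32-bit non-PAE x86. */
#include <stdio.h>

#define PAGE_SHIFT   12
#define PGDIR_SHIFT  22              /* 4 MB per page-directory entry */
#define PTRS_PER_PTE 1024
#define PAGE_OFFSET  0xC0000000UL    /* where lowmem is mapped */

int main(void)
{
        unsigned long start_pfn = 0x200;                    /* physical 2 MB */
        unsigned long vaddr = (start_pfn << PAGE_SHIFT) + PAGE_OFFSET;
        unsigned long pgd_idx = vaddr >> PGDIR_SHIFT;
        unsigned long pte_ofs = (vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);

        printf("pfn 0x%lx maps at 0x%lx: pgd index %lu, pte index %lu\n",
               start_pfn, vaddr, pgd_idx, pte_ofs);
        return 0;
}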
230 | /* | 318 | /* |
@@ -287,29 +375,62 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) | |||
287 | pkmap_page_table = pte; | 375 | pkmap_page_table = pte; |
288 | } | 376 | } |
289 | 377 | ||
290 | void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) | 378 | static void __init add_one_highpage_init(struct page *page, int pfn) |
291 | { | 379 | { |
292 | if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { | 380 | ClearPageReserved(page); |
293 | ClearPageReserved(page); | 381 | init_page_count(page); |
294 | init_page_count(page); | 382 | __free_page(page); |
295 | __free_page(page); | 383 | totalhigh_pages++; |
296 | totalhigh_pages++; | ||
297 | } else | ||
298 | SetPageReserved(page); | ||
299 | } | 384 | } |
300 | 385 | ||
301 | #ifndef CONFIG_NUMA | 386 | struct add_highpages_data { |
302 | static void __init set_highmem_pages_init(int bad_ppro) | 387 | unsigned long start_pfn; |
388 | unsigned long end_pfn; | ||
389 | }; | ||
390 | |||
391 | static int __init add_highpages_work_fn(unsigned long start_pfn, | ||
392 | unsigned long end_pfn, void *datax) | ||
303 | { | 393 | { |
304 | int pfn; | 394 | int node_pfn; |
395 | struct page *page; | ||
396 | unsigned long final_start_pfn, final_end_pfn; | ||
397 | struct add_highpages_data *data; | ||
305 | 398 | ||
306 | for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { | 399 | data = (struct add_highpages_data *)datax; |
307 | /* | 400 | |
308 | * Holes under sparsemem might not have no mem_map[]: | 401 | final_start_pfn = max(start_pfn, data->start_pfn); |
309 | */ | 402 | final_end_pfn = min(end_pfn, data->end_pfn); |
310 | if (pfn_valid(pfn)) | 403 | if (final_start_pfn >= final_end_pfn) |
311 | add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); | 404 | return 0; |
405 | |||
406 | for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; | ||
407 | node_pfn++) { | ||
408 | if (!pfn_valid(node_pfn)) | ||
409 | continue; | ||
410 | page = pfn_to_page(node_pfn); | ||
411 | add_one_highpage_init(page, node_pfn); | ||
312 | } | 412 | } |
413 | |||
414 | return 0; | ||
415 | |||
416 | } | ||
417 | |||
418 | void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, | ||
419 | unsigned long end_pfn) | ||
420 | { | ||
421 | struct add_highpages_data data; | ||
422 | |||
423 | data.start_pfn = start_pfn; | ||
424 | data.end_pfn = end_pfn; | ||
425 | |||
426 | work_with_active_regions(nid, add_highpages_work_fn, &data); | ||
427 | } | ||
428 | |||
429 | #ifndef CONFIG_NUMA | ||
430 | static void __init set_highmem_pages_init(void) | ||
431 | { | ||
432 | add_highpages_with_active_regions(0, highstart_pfn, highend_pfn); | ||
433 | |||
313 | totalram_pages += totalhigh_pages; | 434 | totalram_pages += totalhigh_pages; |
314 | } | 435 | } |
315 | #endif /* !CONFIG_NUMA */ | 436 | #endif /* !CONFIG_NUMA */ |
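
add_highpages_work_fn() above runs once per active memory region and must clamp each region to the highmem window it was asked to populate before freeing pages into the allocator. The clamp-and-count pattern, as a small userspace sketch with made-up regions (the window models highstart_pfn..highend_pfn):

/* Clamp e820-style regions to a [start_pfn, end_pfn) window, as the
 * add_highpages_work_fn() callback does for highmem pages. Userspace sketch. */
#include <stdio.h>

struct region { unsigned long start_pfn, end_pfn; };

static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }
static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

/* Returns how many pfns of this region fall inside the window. */
static unsigned long clamp_region(struct region r,
                                  unsigned long win_start, unsigned long win_end)
{
        unsigned long s = max_ul(r.start_pfn, win_start);
        unsigned long e = min_ul(r.end_pfn, win_end);

        return (s < e) ? e - s : 0;     /* empty intersection: nothing to free */
}

int main(void)
{
        /* Made-up active regions; the window stands in for highstart..highend. */
        struct region regions[] = { { 0x100, 0x38000 }, { 0x40000, 0x60000 } };
        unsigned long highstart_pfn = 0x38000, highend_pfn = 0x50000;
        unsigned long total = 0;

        for (int i = 0; i < 2; i++)
                total += clamp_region(regions[i], highstart_pfn, highend_pfn);

        printf("high pages to hand to the allocator: %lu\n", total);
        return 0;
}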
@@ -317,14 +438,9 @@ static void __init set_highmem_pages_init(int bad_ppro) | |||
317 | #else | 438 | #else |
318 | # define kmap_init() do { } while (0) | 439 | # define kmap_init() do { } while (0) |
319 | # define permanent_kmaps_init(pgd_base) do { } while (0) | 440 | # define permanent_kmaps_init(pgd_base) do { } while (0) |
320 | # define set_highmem_pages_init(bad_ppro) do { } while (0) | 441 | # define set_highmem_pages_init() do { } while (0) |
321 | #endif /* CONFIG_HIGHMEM */ | 442 | #endif /* CONFIG_HIGHMEM */ |
322 | 443 | ||
323 | pteval_t __PAGE_KERNEL = _PAGE_KERNEL; | ||
324 | EXPORT_SYMBOL(__PAGE_KERNEL); | ||
325 | |||
326 | pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; | ||
327 | |||
328 | void __init native_pagetable_setup_start(pgd_t *base) | 444 | void __init native_pagetable_setup_start(pgd_t *base) |
329 | { | 445 | { |
330 | unsigned long pfn, va; | 446 | unsigned long pfn, va; |
@@ -380,27 +496,10 @@ void __init native_pagetable_setup_done(pgd_t *base) | |||
380 | * be partially populated, and so it avoids stomping on any existing | 496 | * be partially populated, and so it avoids stomping on any existing |
381 | * mappings. | 497 | * mappings. |
382 | */ | 498 | */ |
383 | static void __init pagetable_init(void) | 499 | static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base) |
384 | { | 500 | { |
385 | pgd_t *pgd_base = swapper_pg_dir; | ||
386 | unsigned long vaddr, end; | 501 | unsigned long vaddr, end; |
387 | 502 | ||
388 | paravirt_pagetable_setup_start(pgd_base); | ||
389 | |||
390 | /* Enable PSE if available */ | ||
391 | if (cpu_has_pse) | ||
392 | set_in_cr4(X86_CR4_PSE); | ||
393 | |||
394 | /* Enable PGE if available */ | ||
395 | if (cpu_has_pge) { | ||
396 | set_in_cr4(X86_CR4_PGE); | ||
397 | __PAGE_KERNEL |= _PAGE_GLOBAL; | ||
398 | __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; | ||
399 | } | ||
400 | |||
401 | kernel_physical_mapping_init(pgd_base); | ||
402 | remap_numa_kva(); | ||
403 | |||
404 | /* | 503 | /* |
405 | * Fixed mappings, only the page table structure has to be | 504 | * Fixed mappings, only the page table structure has to be |
406 | * created - mappings will be set by set_fixmap(): | 505 | * created - mappings will be set by set_fixmap(): |
@@ -410,10 +509,13 @@ static void __init pagetable_init(void) | |||
410 | end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; | 509 | end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; |
411 | page_table_range_init(vaddr, end, pgd_base); | 510 | page_table_range_init(vaddr, end, pgd_base); |
412 | early_ioremap_reset(); | 511 | early_ioremap_reset(); |
512 | } | ||
413 | 513 | ||
414 | permanent_kmaps_init(pgd_base); | 514 | static void __init pagetable_init(void) |
515 | { | ||
516 | pgd_t *pgd_base = swapper_pg_dir; | ||
415 | 517 | ||
416 | paravirt_pagetable_setup_done(pgd_base); | 518 | permanent_kmaps_init(pgd_base); |
417 | } | 519 | } |
418 | 520 | ||
419 | #ifdef CONFIG_ACPI_SLEEP | 521 | #ifdef CONFIG_ACPI_SLEEP |
@@ -456,7 +558,7 @@ void zap_low_mappings(void) | |||
456 | 558 | ||
457 | int nx_enabled; | 559 | int nx_enabled; |
458 | 560 | ||
459 | pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; | 561 | pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); |
460 | EXPORT_SYMBOL_GPL(__supported_pte_mask); | 562 | EXPORT_SYMBOL_GPL(__supported_pte_mask); |
461 | 563 | ||
462 | #ifdef CONFIG_X86_PAE | 564 | #ifdef CONFIG_X86_PAE |
@@ -509,27 +611,329 @@ static void __init set_nx(void) | |||
509 | } | 611 | } |
510 | #endif | 612 | #endif |
511 | 613 | ||
614 | /* user-defined highmem size */ | ||
615 | static unsigned int highmem_pages = -1; | ||
616 | |||
512 | /* | 617 | /* |
513 | * paging_init() sets up the page tables - note that the first 8MB are | 618 | * highmem=size forces highmem to be exactly 'size' bytes. |
514 | * already mapped by head.S. | 619 | * This works even on boxes that have no highmem otherwise. |
515 | * | 620 | * This also works to reduce highmem size on bigger boxes. |
516 | * This routine also unmaps the page at virtual kernel address 0, so | ||
517 | * that we can trap those pesky NULL-reference errors in the kernel. | ||
518 | */ | 621 | */ |
519 | void __init paging_init(void) | 622 | static int __init parse_highmem(char *arg) |
623 | { | ||
624 | if (!arg) | ||
625 | return -EINVAL; | ||
626 | |||
627 | highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT; | ||
628 | return 0; | ||
629 | } | ||
630 | early_param("highmem", parse_highmem); | ||
631 | |||
632 | /* | ||
633 | * Determine low and high memory ranges: | ||
634 | */ | ||
635 | void __init find_low_pfn_range(void) | ||
520 | { | 636 | { |
637 | /* it could update max_pfn */ | ||
638 | |||
639 | /* max_low_pfn is 0, we already have early_res support */ | ||
640 | |||
641 | max_low_pfn = max_pfn; | ||
642 | if (max_low_pfn > MAXMEM_PFN) { | ||
643 | if (highmem_pages == -1) | ||
644 | highmem_pages = max_pfn - MAXMEM_PFN; | ||
645 | if (highmem_pages + MAXMEM_PFN < max_pfn) | ||
646 | max_pfn = MAXMEM_PFN + highmem_pages; | ||
647 | if (highmem_pages + MAXMEM_PFN > max_pfn) { | ||
648 | printk(KERN_WARNING "only %luMB highmem pages " | ||
649 | "available, ignoring highmem size of %uMB.\n", | ||
650 | pages_to_mb(max_pfn - MAXMEM_PFN), | ||
651 | pages_to_mb(highmem_pages)); | ||
652 | highmem_pages = 0; | ||
653 | } | ||
654 | max_low_pfn = MAXMEM_PFN; | ||
655 | #ifndef CONFIG_HIGHMEM | ||
656 | /* Maximum memory usable is what is directly addressable */ | ||
657 | printk(KERN_WARNING "Warning only %ldMB will be used.\n", | ||
658 | MAXMEM>>20); | ||
659 | if (max_pfn > MAX_NONPAE_PFN) | ||
660 | printk(KERN_WARNING | ||
661 | "Use a HIGHMEM64G enabled kernel.\n"); | ||
662 | else | ||
663 | printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); | ||
664 | max_pfn = MAXMEM_PFN; | ||
665 | #else /* !CONFIG_HIGHMEM */ | ||
666 | #ifndef CONFIG_HIGHMEM64G | ||
667 | if (max_pfn > MAX_NONPAE_PFN) { | ||
668 | max_pfn = MAX_NONPAE_PFN; | ||
669 | printk(KERN_WARNING "Warning only 4GB will be used. " | ||
670 | "Use a HIGHMEM64G enabled kernel.\n"); | ||
671 | } | ||
672 | #endif /* !CONFIG_HIGHMEM64G */ | ||
673 | #endif /* !CONFIG_HIGHMEM */ | ||
674 | } else { | ||
675 | if (highmem_pages == -1) | ||
676 | highmem_pages = 0; | ||
677 | #ifdef CONFIG_HIGHMEM | ||
678 | if (highmem_pages >= max_pfn) { | ||
679 | printk(KERN_ERR "highmem size specified (%uMB) is " | ||
680 | "bigger than pages available (%luMB)!\n", | ||
681 | pages_to_mb(highmem_pages), | ||
682 | pages_to_mb(max_pfn)); | ||
683 | highmem_pages = 0; | ||
684 | } | ||
685 | if (highmem_pages) { | ||
686 | if (max_low_pfn - highmem_pages < | ||
687 | 64*1024*1024/PAGE_SIZE){ | ||
688 | printk(KERN_ERR "highmem size %uMB results in " | ||
689 | "smaller than 64MB lowmem, ignoring it.\n" | ||
690 | , pages_to_mb(highmem_pages)); | ||
691 | highmem_pages = 0; | ||
692 | } | ||
693 | max_low_pfn -= highmem_pages; | ||
694 | } | ||
695 | #else | ||
696 | if (highmem_pages) | ||
697 | printk(KERN_ERR "ignoring highmem size on non-highmem" | ||
698 | " kernel!\n"); | ||
699 | #endif | ||
700 | } | ||
701 | } | ||
702 | |||
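
Most of find_low_pfn_range() above is warnings; the core decision is a comparison of max_pfn against MAXMEM_PFN, with everything above lowmem turned into highmem. A userspace sketch of the common case, assuming the usual 3G/1G split that leaves roughly 896 MB of lowmem (a figure assumed here for illustration, not taken from the patch):

/* Rough model of the lowmem/highmem split decision on 32-bit x86. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define MAXMEM_PFN ((896UL << 20) >> PAGE_SHIFT)   /* ~896 MB of lowmem */

int main(void)
{
        unsigned long max_pfn = (2048UL << 20) >> PAGE_SHIFT;  /* 2 GB of RAM */
        unsigned long max_low_pfn = max_pfn;
        unsigned long highmem_pages = 0;

        if (max_low_pfn > MAXMEM_PFN) {
                /* No highmem= override: everything above lowmem becomes highmem. */
                highmem_pages = max_pfn - MAXMEM_PFN;
                max_low_pfn = MAXMEM_PFN;
        }

        printf("lowmem:  %lu MB\n", max_low_pfn >> (20 - PAGE_SHIFT));
        printf("highmem: %lu MB\n", highmem_pages >> (20 - PAGE_SHIFT));
        return 0;
}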
703 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
704 | void __init initmem_init(unsigned long start_pfn, | ||
705 | unsigned long end_pfn) | ||
706 | { | ||
707 | #ifdef CONFIG_HIGHMEM | ||
708 | highstart_pfn = highend_pfn = max_pfn; | ||
709 | if (max_pfn > max_low_pfn) | ||
710 | highstart_pfn = max_low_pfn; | ||
711 | memory_present(0, 0, highend_pfn); | ||
712 | e820_register_active_regions(0, 0, highend_pfn); | ||
713 | printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", | ||
714 | pages_to_mb(highend_pfn - highstart_pfn)); | ||
715 | num_physpages = highend_pfn; | ||
716 | high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; | ||
717 | #else | ||
718 | memory_present(0, 0, max_low_pfn); | ||
719 | e820_register_active_regions(0, 0, max_low_pfn); | ||
720 | num_physpages = max_low_pfn; | ||
721 | high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; | ||
722 | #endif | ||
723 | #ifdef CONFIG_FLATMEM | ||
724 | max_mapnr = num_physpages; | ||
725 | #endif | ||
726 | printk(KERN_NOTICE "%ldMB LOWMEM available.\n", | ||
727 | pages_to_mb(max_low_pfn)); | ||
728 | |||
729 | setup_bootmem_allocator(); | ||
730 | } | ||
731 | #endif /* !CONFIG_NEED_MULTIPLE_NODES */ | ||
732 | |||
733 | static void __init zone_sizes_init(void) | ||
734 | { | ||
735 | unsigned long max_zone_pfns[MAX_NR_ZONES]; | ||
736 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); | ||
737 | max_zone_pfns[ZONE_DMA] = | ||
738 | virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; | ||
739 | max_zone_pfns[ZONE_NORMAL] = max_low_pfn; | ||
740 | #ifdef CONFIG_HIGHMEM | ||
741 | max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; | ||
742 | #endif | ||
743 | |||
744 | free_area_init_nodes(max_zone_pfns); | ||
745 | } | ||
746 | |||
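
zone_sizes_init() above only records the top pfn of each zone and hands the array to free_area_init_nodes(). The bookkeeping itself, as a sketch with assumed 32-bit zone boundaries (16 MB ISA DMA limit, 896 MB of lowmem, 2 GB of RAM):

/* Fill a max_zone_pfns-style array: each entry is the zone's top pfn. */
#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT 12
enum zone { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };

int main(void)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES];
        unsigned long max_low_pfn = (896UL << 20) >> PAGE_SHIFT;   /* end of lowmem */
        unsigned long highend_pfn = (2048UL << 20) >> PAGE_SHIFT;  /* end of RAM    */
        const char *name[] = { "DMA", "Normal", "HighMem" };

        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA]     = (16UL << 20) >> PAGE_SHIFT;  /* ISA DMA cap */
        max_zone_pfns[ZONE_NORMAL]  = max_low_pfn;
        max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;

        for (int i = 0; i < MAX_NR_ZONES; i++)
                printf("%-8s up to pfn %lu\n", name[i], max_zone_pfns[i]);
        return 0;
}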
747 | void __init setup_bootmem_allocator(void) | ||
748 | { | ||
749 | int i; | ||
750 | unsigned long bootmap_size, bootmap; | ||
751 | /* | ||
752 | * Initialize the boot-time allocator (with low memory only): | ||
753 | */ | ||
754 | bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; | ||
755 | bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT, | ||
756 | max_pfn_mapped<<PAGE_SHIFT, bootmap_size, | ||
757 | PAGE_SIZE); | ||
758 | if (bootmap == -1L) | ||
759 | panic("Cannot find bootmem map of size %ld\n", bootmap_size); | ||
760 | reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); | ||
761 | |||
762 | /* don't touch min_low_pfn */ | ||
763 | bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, | ||
764 | min_low_pfn, max_low_pfn); | ||
765 | printk(KERN_INFO " mapped low ram: 0 - %08lx\n", | ||
766 | max_pfn_mapped<<PAGE_SHIFT); | ||
767 | printk(KERN_INFO " low ram: %08lx - %08lx\n", | ||
768 | min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT); | ||
769 | printk(KERN_INFO " bootmap %08lx - %08lx\n", | ||
770 | bootmap, bootmap + bootmap_size); | ||
771 | for_each_online_node(i) | ||
772 | free_bootmem_with_active_regions(i, max_low_pfn); | ||
773 | early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT); | ||
774 | |||
775 | after_init_bootmem = 1; | ||
776 | } | ||
777 | |||
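
The bootmap that setup_bootmem_allocator() reserves is a bitmap with one bit per lowmem page, rounded up to whole pages. The sizing arithmetic, reconstructed here as a standalone sketch rather than copied from bootmem_bootmap_pages():

/* How big is a boot-time page bitmap for N lowmem pages? Userspace sketch. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static unsigned long bootmap_bytes(unsigned long pages)
{
        unsigned long bytes = (pages + 7) / 8;                  /* one bit per page */

        return (bytes + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);      /* page-align */
}

int main(void)
{
        unsigned long max_low_pfn = (896UL << 20) >> PAGE_SHIFT;
        unsigned long size = bootmap_bytes(max_low_pfn);

        printf("%lu lowmem pages need a %lu KB bootmap (%lu pages)\n",
               max_low_pfn, size >> 10, size >> PAGE_SHIFT);
        return 0;
}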
778 | static void __init find_early_table_space(unsigned long end, int use_pse) | ||
779 | { | ||
780 | unsigned long puds, pmds, ptes, tables, start; | ||
781 | |||
782 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | ||
783 | tables = PAGE_ALIGN(puds * sizeof(pud_t)); | ||
784 | |||
785 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | ||
786 | tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); | ||
787 | |||
788 | if (use_pse) { | ||
789 | unsigned long extra; | ||
790 | |||
791 | extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); | ||
792 | extra += PMD_SIZE; | ||
793 | ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
794 | } else | ||
795 | ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
796 | |||
797 | tables += PAGE_ALIGN(ptes * sizeof(pte_t)); | ||
798 | |||
799 | /* for fixmap */ | ||
800 | tables += PAGE_SIZE * 2; | ||
801 | |||
802 | /* | ||
803 | * RED-PEN putting page tables only on node 0 could | ||
804 | * cause a hotspot and fill up ZONE_DMA. The page tables | ||
805 | * need roughly 0.5KB per GB. | ||
806 | */ | ||
807 | start = 0x7000; | ||
808 | table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, | ||
809 | tables, PAGE_SIZE); | ||
810 | if (table_start == -1UL) | ||
811 | panic("Cannot find space for the kernel page tables"); | ||
812 | |||
813 | table_start >>= PAGE_SHIFT; | ||
814 | table_end = table_start; | ||
815 | table_top = table_start + (tables>>PAGE_SHIFT); | ||
816 | |||
817 | printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", | ||
818 | end, table_start << PAGE_SHIFT, | ||
819 | (table_start << PAGE_SHIFT) + tables); | ||
820 | } | ||
821 | |||
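
find_early_table_space() above sizes the early window by counting the worst-case number of pud, pmd and pte pages the direct mapping can need; with PSE only the ragged head and tail of the range fall back to 4 KB ptes, which is what the "extra" term accounts for. The same arithmetic as a standalone program, assuming 8-byte (PAE-style) entries and the usual shift values:

/* Estimate how much memory the direct-mapping page tables need. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PMD_SHIFT  21                     /* 2 MB with PAE */
#define PMD_SIZE   (1UL << PMD_SHIFT)
#define PUD_SHIFT  30
#define PUD_SIZE   (1UL << PUD_SHIFT)

#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

static unsigned long table_space(unsigned long end, int use_pse)
{
        unsigned long puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        unsigned long pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        unsigned long ptes, tables;

        tables  = PAGE_ALIGN(puds * 8);   /* 8 bytes per pud entry */
        tables += PAGE_ALIGN(pmds * 8);   /* 8 bytes per pmd entry */

        if (use_pse) {
                /* Only the unaligned head/tail of the range needs 4 KB ptes. */
                unsigned long extra = end - ((end >> PMD_SHIFT) << PMD_SHIFT);
                extra += PMD_SIZE;
                ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
        } else {
                ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
        }
        tables += PAGE_ALIGN(ptes * 8);   /* 8 bytes per pte entry */
        tables += PAGE_SIZE * 2;          /* room for the fixmap tables */

        return tables;
}

int main(void)
{
        unsigned long end = 896UL << 20;  /* map the first 896 MB */

        printf("with PSE:    %lu KB of page tables\n", table_space(end, 1) >> 10);
        printf("without PSE: %lu KB of page tables\n", table_space(end, 0) >> 10);
        return 0;
}

The estimate only reserves room up front; actual consumption is tracked by table_end as alloc_low_page() hands pages out.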
822 | unsigned long __init_refok init_memory_mapping(unsigned long start, | ||
823 | unsigned long end) | ||
824 | { | ||
825 | pgd_t *pgd_base = swapper_pg_dir; | ||
826 | unsigned long start_pfn, end_pfn; | ||
827 | unsigned long big_page_start; | ||
828 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
829 | /* | ||
830 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. | ||
831 | * This will simplify cpa(), which otherwise needs to support splitting | ||
832 | * large pages into small in interrupt context, etc. | ||
833 | */ | ||
834 | int use_pse = 0; | ||
835 | #else | ||
836 | int use_pse = cpu_has_pse; | ||
837 | #endif | ||
838 | |||
839 | /* | ||
840 | * Find space for the kernel direct mapping tables. | ||
841 | */ | ||
842 | if (!after_init_bootmem) | ||
843 | find_early_table_space(end, use_pse); | ||
844 | |||
521 | #ifdef CONFIG_X86_PAE | 845 | #ifdef CONFIG_X86_PAE |
522 | set_nx(); | 846 | set_nx(); |
523 | if (nx_enabled) | 847 | if (nx_enabled) |
524 | printk(KERN_INFO "NX (Execute Disable) protection: active\n"); | 848 | printk(KERN_INFO "NX (Execute Disable) protection: active\n"); |
525 | #endif | 849 | #endif |
526 | pagetable_init(); | 850 | |
851 | /* Enable PSE if available */ | ||
852 | if (cpu_has_pse) | ||
853 | set_in_cr4(X86_CR4_PSE); | ||
854 | |||
855 | /* Enable PGE if available */ | ||
856 | if (cpu_has_pge) { | ||
857 | set_in_cr4(X86_CR4_PGE); | ||
858 | __supported_pte_mask |= _PAGE_GLOBAL; | ||
859 | } | ||
860 | |||
861 | /* | ||
862 | * Don't use a large page for the first 2/4MB of memory | ||
863 | * because there are often fixed size MTRRs in there | ||
864 | * and overlapping MTRRs into large pages can cause | ||
865 | * slowdowns. | ||
866 | */ | ||
867 | big_page_start = PMD_SIZE; | ||
868 | |||
869 | if (start < big_page_start) { | ||
870 | start_pfn = start >> PAGE_SHIFT; | ||
871 | end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT); | ||
872 | } else { | ||
873 | /* head is not big-page aligned? */ | ||
874 | start_pfn = start >> PAGE_SHIFT; | ||
875 | end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
876 | << (PMD_SHIFT - PAGE_SHIFT); | ||
877 | } | ||
878 | if (start_pfn < end_pfn) | ||
879 | kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0); | ||
880 | |||
881 | /* big page range */ | ||
882 | start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
883 | << (PMD_SHIFT - PAGE_SHIFT); | ||
884 | if (start_pfn < (big_page_start >> PAGE_SHIFT)) | ||
885 | start_pfn = big_page_start >> PAGE_SHIFT; | ||
886 | end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); | ||
887 | if (start_pfn < end_pfn) | ||
888 | kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, | ||
889 | use_pse); | ||
890 | |||
891 | /* tail is not big-page aligned? */ | ||
892 | start_pfn = end_pfn; | ||
893 | if (start_pfn > (big_page_start>>PAGE_SHIFT)) { | ||
894 | end_pfn = end >> PAGE_SHIFT; | ||
895 | if (start_pfn < end_pfn) | ||
896 | kernel_physical_mapping_init(pgd_base, start_pfn, | ||
897 | end_pfn, 0); | ||
898 | } | ||
899 | |||
900 | early_ioremap_page_table_range_init(pgd_base); | ||
527 | 901 | ||
528 | load_cr3(swapper_pg_dir); | 902 | load_cr3(swapper_pg_dir); |
529 | 903 | ||
530 | __flush_tlb_all(); | 904 | __flush_tlb_all(); |
531 | 905 | ||
906 | if (!after_init_bootmem) | ||
907 | reserve_early(table_start << PAGE_SHIFT, | ||
908 | table_end << PAGE_SHIFT, "PGTABLE"); | ||
909 | |||
910 | if (!after_init_bootmem) | ||
911 | early_memtest(start, end); | ||
912 | |||
913 | return end >> PAGE_SHIFT; | ||
914 | } | ||
915 | |||
916 | |||
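
init_memory_mapping() above splits [start, end) into up to three pieces: a 4 KB-mapped head below big_page_start (the low 2/4 MB stays in small pages because of the fixed-size MTRRs that often sit there), a big-page middle, and a 4 KB tail above the last big-page boundary. The pfn arithmetic, simplified to the start == 0 case and assuming 2 MB big pages:

/* Split a physical range into 4 KB head, 2 MB middle, and 4 KB tail pieces. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PMD_SHIFT  21
#define PMD_SIZE   (1UL << PMD_SHIFT)

static void print_range(const char *what, unsigned long s, unsigned long e)
{
        if (s < e)
                printf("%-6s pfn 0x%lx - 0x%lx\n", what, s, e);
}

int main(void)
{
        unsigned long start = 0x0, end = 0x37f00000;  /* top is not 2 MB aligned */
        unsigned long big_page_start = PMD_SIZE;      /* keep the MTRR-laden low 2 MB in 4 KB pages */
        unsigned long start_pfn, end_pfn;

        /* head: 4 KB pages up to the first big-page boundary */
        start_pfn = start >> PAGE_SHIFT;
        end_pfn = big_page_start >> PAGE_SHIFT;
        print_range("head", start_pfn, end_pfn);

        /* middle: 2 MB pages for the aligned bulk of the range */
        start_pfn = end_pfn;
        end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
        print_range("middle", start_pfn, end_pfn);

        /* tail: whatever is left above the last 2 MB boundary, in 4 KB pages */
        start_pfn = end_pfn;
        end_pfn = end >> PAGE_SHIFT;
        print_range("tail", start_pfn, end_pfn);
        return 0;
}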
917 | /* | ||
918 | * paging_init() sets up the page tables - note that the first 8MB are | ||
919 | * already mapped by head.S. | ||
920 | * | ||
921 | * This routine also unmaps the page at virtual kernel address 0, so | ||
922 | * that we can trap those pesky NULL-reference errors in the kernel. | ||
923 | */ | ||
924 | void __init paging_init(void) | ||
925 | { | ||
926 | pagetable_init(); | ||
927 | |||
928 | __flush_tlb_all(); | ||
929 | |||
532 | kmap_init(); | 930 | kmap_init(); |
931 | |||
932 | /* | ||
933 | * NOTE: at this point the bootmem allocator is fully available. | ||
934 | */ | ||
935 | sparse_init(); | ||
936 | zone_sizes_init(); | ||
533 | } | 937 | } |
534 | 938 | ||
535 | /* | 939 | /* |
@@ -564,24 +968,13 @@ static struct kcore_list kcore_mem, kcore_vmalloc; | |||
564 | void __init mem_init(void) | 968 | void __init mem_init(void) |
565 | { | 969 | { |
566 | int codesize, reservedpages, datasize, initsize; | 970 | int codesize, reservedpages, datasize, initsize; |
567 | int tmp, bad_ppro; | 971 | int tmp; |
972 | |||
973 | start_periodic_check_for_corruption(); | ||
568 | 974 | ||
569 | #ifdef CONFIG_FLATMEM | 975 | #ifdef CONFIG_FLATMEM |
570 | BUG_ON(!mem_map); | 976 | BUG_ON(!mem_map); |
571 | #endif | 977 | #endif |
572 | bad_ppro = ppro_with_ram_bug(); | ||
573 | |||
574 | #ifdef CONFIG_HIGHMEM | ||
575 | /* check that fixmap and pkmap do not overlap */ | ||
576 | if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { | ||
577 | printk(KERN_ERR | ||
578 | "fixmap and kmap areas overlap - this will crash\n"); | ||
579 | printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", | ||
580 | PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE, | ||
581 | FIXADDR_START); | ||
582 | BUG(); | ||
583 | } | ||
584 | #endif | ||
585 | /* this will put all low memory onto the freelists */ | 978 | /* this will put all low memory onto the freelists */ |
586 | totalram_pages += free_all_bootmem(); | 979 | totalram_pages += free_all_bootmem(); |
587 | 980 | ||
@@ -593,7 +986,7 @@ void __init mem_init(void) | |||
593 | if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) | 986 | if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) |
594 | reservedpages++; | 987 | reservedpages++; |
595 | 988 | ||
596 | set_highmem_pages_init(bad_ppro); | 989 | set_highmem_pages_init(); |
597 | 990 | ||
598 | codesize = (unsigned long) &_etext - (unsigned long) &_text; | 991 | codesize = (unsigned long) &_etext - (unsigned long) &_text; |
599 | datasize = (unsigned long) &_edata - (unsigned long) &_etext; | 992 | datasize = (unsigned long) &_edata - (unsigned long) &_etext; |
@@ -614,7 +1007,6 @@ void __init mem_init(void) | |||
614 | (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) | 1007 | (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) |
615 | ); | 1008 | ); |
616 | 1009 | ||
617 | #if 1 /* double-sanity-check paranoia */ | ||
618 | printk(KERN_INFO "virtual kernel memory layout:\n" | 1010 | printk(KERN_INFO "virtual kernel memory layout:\n" |
619 | " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" | 1011 | " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" |
620 | #ifdef CONFIG_HIGHMEM | 1012 | #ifdef CONFIG_HIGHMEM |
@@ -655,12 +1047,10 @@ void __init mem_init(void) | |||
655 | #endif | 1047 | #endif |
656 | BUG_ON(VMALLOC_START > VMALLOC_END); | 1048 | BUG_ON(VMALLOC_START > VMALLOC_END); |
657 | BUG_ON((unsigned long)high_memory > VMALLOC_START); | 1049 | BUG_ON((unsigned long)high_memory > VMALLOC_START); |
658 | #endif /* double-sanity-check paranoia */ | ||
659 | 1050 | ||
660 | if (boot_cpu_data.wp_works_ok < 0) | 1051 | if (boot_cpu_data.wp_works_ok < 0) |
661 | test_wp_bit(); | 1052 | test_wp_bit(); |
662 | 1053 | ||
663 | cpa_init(); | ||
664 | save_pg_dir(); | 1054 | save_pg_dir(); |
665 | zap_low_mappings(); | 1055 | zap_low_mappings(); |
666 | } | 1056 | } |
@@ -710,6 +1100,8 @@ void mark_rodata_ro(void) | |||
710 | unsigned long start = PFN_ALIGN(_text); | 1100 | unsigned long start = PFN_ALIGN(_text); |
711 | unsigned long size = PFN_ALIGN(_etext) - start; | 1101 | unsigned long size = PFN_ALIGN(_etext) - start; |
712 | 1102 | ||
1103 | #ifndef CONFIG_DYNAMIC_FTRACE | ||
1104 | /* Dynamic tracing modifies the kernel text section */ | ||
713 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); | 1105 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); |
714 | printk(KERN_INFO "Write protecting the kernel text: %luk\n", | 1106 | printk(KERN_INFO "Write protecting the kernel text: %luk\n", |
715 | size >> 10); | 1107 | size >> 10); |
@@ -722,6 +1114,8 @@ void mark_rodata_ro(void) | |||
722 | printk(KERN_INFO "Testing CPA: write protecting again\n"); | 1114 | printk(KERN_INFO "Testing CPA: write protecting again\n"); |
723 | set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); | 1115 | set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); |
724 | #endif | 1116 | #endif |
1117 | #endif /* CONFIG_DYNAMIC_FTRACE */ | ||
1118 | |||
725 | start += size; | 1119 | start += size; |
726 | size = (unsigned long)__end_rodata - start; | 1120 | size = (unsigned long)__end_rodata - start; |
727 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); | 1121 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); |
@@ -784,3 +1178,9 @@ void free_initrd_mem(unsigned long start, unsigned long end) | |||
784 | free_init_pages("initrd memory", start, end); | 1178 | free_init_pages("initrd memory", start, end); |
785 | } | 1179 | } |
786 | #endif | 1180 | #endif |
1181 | |||
1182 | int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | ||
1183 | int flags) | ||
1184 | { | ||
1185 | return reserve_bootmem(phys, len, flags); | ||
1186 | } | ||
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 156e6d7b0e32..3e10054c5731 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/swap.h> | 18 | #include <linux/swap.h> |
19 | #include <linux/smp.h> | 19 | #include <linux/smp.h> |
20 | #include <linux/init.h> | 20 | #include <linux/init.h> |
21 | #include <linux/initrd.h> | ||
21 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
22 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
23 | #include <linux/proc_fs.h> | 24 | #include <linux/proc_fs.h> |
@@ -30,6 +31,7 @@ | |||
30 | #include <linux/nmi.h> | 31 | #include <linux/nmi.h> |
31 | 32 | ||
32 | #include <asm/processor.h> | 33 | #include <asm/processor.h> |
34 | #include <asm/bios_ebda.h> | ||
33 | #include <asm/system.h> | 35 | #include <asm/system.h> |
34 | #include <asm/uaccess.h> | 36 | #include <asm/uaccess.h> |
35 | #include <asm/pgtable.h> | 37 | #include <asm/pgtable.h> |
@@ -47,11 +49,19 @@ | |||
47 | #include <asm/numa.h> | 49 | #include <asm/numa.h> |
48 | #include <asm/cacheflush.h> | 50 | #include <asm/cacheflush.h> |
49 | 51 | ||
52 | /* | ||
53 | * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. | ||
54 | * The direct mapping extends to max_pfn_mapped, so that we can directly access | ||
55 | * apertures, ACPI and other tables without having to play with fixmaps. | ||
56 | */ | ||
57 | unsigned long max_low_pfn_mapped; | ||
58 | unsigned long max_pfn_mapped; | ||
59 | |||
50 | static unsigned long dma_reserve __initdata; | 60 | static unsigned long dma_reserve __initdata; |
51 | 61 | ||
52 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | 62 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); |
53 | 63 | ||
54 | int direct_gbpages __meminitdata | 64 | int direct_gbpages |
55 | #ifdef CONFIG_DIRECT_GBPAGES | 65 | #ifdef CONFIG_DIRECT_GBPAGES |
56 | = 1 | 66 | = 1 |
57 | #endif | 67 | #endif |
@@ -77,46 +87,69 @@ early_param("gbpages", parse_direct_gbpages_on); | |||
77 | * around without checking the pgd every time. | 87 | * around without checking the pgd every time. |
78 | */ | 88 | */ |
79 | 89 | ||
80 | void show_mem(void) | 90 | int after_bootmem; |
81 | { | ||
82 | long i, total = 0, reserved = 0; | ||
83 | long shared = 0, cached = 0; | ||
84 | struct page *page; | ||
85 | pg_data_t *pgdat; | ||
86 | 91 | ||
87 | printk(KERN_INFO "Mem-info:\n"); | 92 | unsigned long __supported_pte_mask __read_mostly = ~0UL; |
88 | show_free_areas(); | 93 | EXPORT_SYMBOL_GPL(__supported_pte_mask); |
89 | for_each_online_pgdat(pgdat) { | ||
90 | for (i = 0; i < pgdat->node_spanned_pages; ++i) { | ||
91 | /* | ||
92 | * This loop can take a while with 256 GB and | ||
93 | * 4k pages so defer the NMI watchdog: | ||
94 | */ | ||
95 | if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) | ||
96 | touch_nmi_watchdog(); | ||
97 | 94 | ||
98 | if (!pfn_valid(pgdat->node_start_pfn + i)) | 95 | static int do_not_nx __cpuinitdata; |
99 | continue; | ||
100 | 96 | ||
101 | page = pfn_to_page(pgdat->node_start_pfn + i); | 97 | /* |
102 | total++; | 98 | * noexec=on|off |
103 | if (PageReserved(page)) | 99 | * Control non-executable mappings for 64-bit processes. |
104 | reserved++; | 100 | * |
105 | else if (PageSwapCache(page)) | 101 | * on Enable (default) |
106 | cached++; | 102 | * off Disable |
107 | else if (page_count(page)) | 103 | */ |
108 | shared += page_count(page) - 1; | 104 | static int __init nonx_setup(char *str) |
109 | } | 105 | { |
106 | if (!str) | ||
107 | return -EINVAL; | ||
108 | if (!strncmp(str, "on", 2)) { | ||
109 | __supported_pte_mask |= _PAGE_NX; | ||
110 | do_not_nx = 0; | ||
111 | } else if (!strncmp(str, "off", 3)) { | ||
112 | do_not_nx = 1; | ||
113 | __supported_pte_mask &= ~_PAGE_NX; | ||
110 | } | 114 | } |
111 | printk(KERN_INFO "%lu pages of RAM\n", total); | 115 | return 0; |
112 | printk(KERN_INFO "%lu reserved pages\n", reserved); | ||
113 | printk(KERN_INFO "%lu pages shared\n", shared); | ||
114 | printk(KERN_INFO "%lu pages swap cached\n", cached); | ||
115 | } | 116 | } |
117 | early_param("noexec", nonx_setup); | ||
116 | 118 | ||
117 | int after_bootmem; | 119 | void __cpuinit check_efer(void) |
120 | { | ||
121 | unsigned long efer; | ||
122 | |||
123 | rdmsrl(MSR_EFER, efer); | ||
124 | if (!(efer & EFER_NX) || do_not_nx) | ||
125 | __supported_pte_mask &= ~_PAGE_NX; | ||
126 | } | ||
118 | 127 | ||
119 | static __init void *spp_getpage(void) | 128 | int force_personality32; |
129 | |||
130 | /* | ||
131 | * noexec32=on|off | ||
132 | * Control non executable heap for 32bit processes. | ||
133 | * To control the stack too use noexec=off | ||
134 | * | ||
135 | * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) | ||
136 | * off PROT_READ implies PROT_EXEC | ||
137 | */ | ||
138 | static int __init nonx32_setup(char *str) | ||
139 | { | ||
140 | if (!strcmp(str, "on")) | ||
141 | force_personality32 &= ~READ_IMPLIES_EXEC; | ||
142 | else if (!strcmp(str, "off")) | ||
143 | force_personality32 |= READ_IMPLIES_EXEC; | ||
144 | return 1; | ||
145 | } | ||
146 | __setup("noexec32=", nonx32_setup); | ||
147 | |||
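
The noexec= and noexec32= handlers added above do nothing more than match an on/off string and flip a bit in a global mask (or in the 32-bit personality word). A tiny userspace model of that pattern follows; the PAGE_NX constant here is illustrative, standing in for the kernel's _PAGE_NX.

/* Model of on/off boot-option parsing that toggles a feature bit. */
#include <stdio.h>
#include <string.h>

#define PAGE_NX (1ULL << 63)   /* illustrative: NX lives in the top pte bit */

static unsigned long long supported_pte_mask = ~0ULL;

static int nonx_setup(const char *str)
{
        if (!str)
                return -1;
        if (!strncmp(str, "on", 2))
                supported_pte_mask |= PAGE_NX;    /* allow NX in kernel ptes */
        else if (!strncmp(str, "off", 3))
                supported_pte_mask &= ~PAGE_NX;   /* strip NX from every pte */
        return 0;
}

int main(void)
{
        nonx_setup("off");
        printf("noexec=off -> NX %s\n",
               (supported_pte_mask & PAGE_NX) ? "allowed" : "masked out");
        nonx_setup("on");
        printf("noexec=on  -> NX %s\n",
               (supported_pte_mask & PAGE_NX) ? "allowed" : "masked out");
        return 0;
}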
148 | /* | ||
149 | * NOTE: This function is marked __ref because it calls __init function | ||
150 | * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. | ||
151 | */ | ||
152 | static __ref void *spp_getpage(void) | ||
120 | { | 153 | { |
121 | void *ptr; | 154 | void *ptr; |
122 | 155 | ||
@@ -135,26 +168,17 @@ static __init void *spp_getpage(void) | |||
135 | return ptr; | 168 | return ptr; |
136 | } | 169 | } |
137 | 170 | ||
138 | static void | 171 | void |
139 | set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) | 172 | set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) |
140 | { | 173 | { |
141 | pgd_t *pgd; | ||
142 | pud_t *pud; | 174 | pud_t *pud; |
143 | pmd_t *pmd; | 175 | pmd_t *pmd; |
144 | pte_t *pte, new_pte; | 176 | pte_t *pte; |
145 | |||
146 | pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys); | ||
147 | 177 | ||
148 | pgd = pgd_offset_k(vaddr); | 178 | pud = pud_page + pud_index(vaddr); |
149 | if (pgd_none(*pgd)) { | ||
150 | printk(KERN_ERR | ||
151 | "PGD FIXMAP MISSING, it should be setup in head.S!\n"); | ||
152 | return; | ||
153 | } | ||
154 | pud = pud_offset(pgd, vaddr); | ||
155 | if (pud_none(*pud)) { | 179 | if (pud_none(*pud)) { |
156 | pmd = (pmd_t *) spp_getpage(); | 180 | pmd = (pmd_t *) spp_getpage(); |
157 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); | 181 | pud_populate(&init_mm, pud, pmd); |
158 | if (pmd != pmd_offset(pud, 0)) { | 182 | if (pmd != pmd_offset(pud, 0)) { |
159 | printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", | 183 | printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", |
160 | pmd, pmd_offset(pud, 0)); | 184 | pmd, pmd_offset(pud, 0)); |
@@ -164,13 +188,12 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) | |||
164 | pmd = pmd_offset(pud, vaddr); | 188 | pmd = pmd_offset(pud, vaddr); |
165 | if (pmd_none(*pmd)) { | 189 | if (pmd_none(*pmd)) { |
166 | pte = (pte_t *) spp_getpage(); | 190 | pte = (pte_t *) spp_getpage(); |
167 | set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); | 191 | pmd_populate_kernel(&init_mm, pmd, pte); |
168 | if (pte != pte_offset_kernel(pmd, 0)) { | 192 | if (pte != pte_offset_kernel(pmd, 0)) { |
169 | printk(KERN_ERR "PAGETABLE BUG #02!\n"); | 193 | printk(KERN_ERR "PAGETABLE BUG #02!\n"); |
170 | return; | 194 | return; |
171 | } | 195 | } |
172 | } | 196 | } |
173 | new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); | ||
174 | 197 | ||
175 | pte = pte_offset_kernel(pmd, vaddr); | 198 | pte = pte_offset_kernel(pmd, vaddr); |
176 | if (!pte_none(*pte) && pte_val(new_pte) && | 199 | if (!pte_none(*pte) && pte_val(new_pte) && |
@@ -185,6 +208,64 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) | |||
185 | __flush_tlb_one(vaddr); | 208 | __flush_tlb_one(vaddr); |
186 | } | 209 | } |
187 | 210 | ||
211 | void | ||
212 | set_pte_vaddr(unsigned long vaddr, pte_t pteval) | ||
213 | { | ||
214 | pgd_t *pgd; | ||
215 | pud_t *pud_page; | ||
216 | |||
217 | pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval)); | ||
218 | |||
219 | pgd = pgd_offset_k(vaddr); | ||
220 | if (pgd_none(*pgd)) { | ||
221 | printk(KERN_ERR | ||
222 | "PGD FIXMAP MISSING, it should be setup in head.S!\n"); | ||
223 | return; | ||
224 | } | ||
225 | pud_page = (pud_t*)pgd_page_vaddr(*pgd); | ||
226 | set_pte_vaddr_pud(pud_page, vaddr, pteval); | ||
227 | } | ||
228 | |||
229 | /* | ||
230 | * Create large page table mappings for a range of physical addresses. | ||
231 | */ | ||
232 | static void __init __init_extra_mapping(unsigned long phys, unsigned long size, | ||
233 | pgprot_t prot) | ||
234 | { | ||
235 | pgd_t *pgd; | ||
236 | pud_t *pud; | ||
237 | pmd_t *pmd; | ||
238 | |||
239 | BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); | ||
240 | for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { | ||
241 | pgd = pgd_offset_k((unsigned long)__va(phys)); | ||
242 | if (pgd_none(*pgd)) { | ||
243 | pud = (pud_t *) spp_getpage(); | ||
244 | set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE | | ||
245 | _PAGE_USER)); | ||
246 | } | ||
247 | pud = pud_offset(pgd, (unsigned long)__va(phys)); | ||
248 | if (pud_none(*pud)) { | ||
249 | pmd = (pmd_t *) spp_getpage(); | ||
250 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | | ||
251 | _PAGE_USER)); | ||
252 | } | ||
253 | pmd = pmd_offset(pud, phys); | ||
254 | BUG_ON(!pmd_none(*pmd)); | ||
255 | set_pmd(pmd, __pmd(phys | pgprot_val(prot))); | ||
256 | } | ||
257 | } | ||
258 | |||
259 | void __init init_extra_mapping_wb(unsigned long phys, unsigned long size) | ||
260 | { | ||
261 | __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE); | ||
262 | } | ||
263 | |||
264 | void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) | ||
265 | { | ||
266 | __init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE); | ||
267 | } | ||
268 | |||
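
__init_extra_mapping() above refuses anything that is not PMD-aligned and then walks the range in PMD_SIZE steps, setting one large entry per 2 MB. The guard and the walk, modelled in userspace with assert() standing in for BUG_ON() and a printf standing in for set_pmd():

/* Alignment guard and PMD-sized walk, modelled in userspace. */
#include <assert.h>
#include <stdio.h>

#define PMD_SHIFT 21
#define PMD_SIZE  (1UL << PMD_SHIFT)
#define PMD_MASK  (~(PMD_SIZE - 1))

static void init_extra_mapping(unsigned long phys, unsigned long size)
{
        /* Mirrors BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)). */
        assert(!(phys & ~PMD_MASK) && !(size & ~PMD_MASK));

        for (; size; phys += PMD_SIZE, size -= PMD_SIZE)
                printf("  would map 2 MB page at phys 0x%lx\n", phys);
}

int main(void)
{
        printf("mapping 4 MB at 0xfe000000:\n");
        init_extra_mapping(0xfe000000UL, 4UL << 20);  /* an example 2 MB-aligned window */
        return 0;
}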
188 | /* | 269 | /* |
189 | * The head.S code sets up the kernel high mapping: | 270 | * The head.S code sets up the kernel high mapping: |
190 | * | 271 | * |
@@ -201,7 +282,7 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) | |||
201 | void __init cleanup_highmap(void) | 282 | void __init cleanup_highmap(void) |
202 | { | 283 | { |
203 | unsigned long vaddr = __START_KERNEL_map; | 284 | unsigned long vaddr = __START_KERNEL_map; |
204 | unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; | 285 | unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; |
205 | pmd_t *pmd = level2_kernel_pgt; | 286 | pmd_t *pmd = level2_kernel_pgt; |
206 | pmd_t *last_pmd = pmd + PTRS_PER_PMD; | 287 | pmd_t *last_pmd = pmd + PTRS_PER_PMD; |
207 | 288 | ||
@@ -213,22 +294,11 @@ void __init cleanup_highmap(void) | |||
213 | } | 294 | } |
214 | } | 295 | } |
215 | 296 | ||
216 | /* NOTE: this is meant to be run only at boot */ | ||
217 | void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) | ||
218 | { | ||
219 | unsigned long address = __fix_to_virt(idx); | ||
220 | |||
221 | if (idx >= __end_of_fixed_addresses) { | ||
222 | printk(KERN_ERR "Invalid __set_fixmap\n"); | ||
223 | return; | ||
224 | } | ||
225 | set_pte_phys(address, phys, prot); | ||
226 | } | ||
227 | |||
228 | static unsigned long __initdata table_start; | 297 | static unsigned long __initdata table_start; |
229 | static unsigned long __meminitdata table_end; | 298 | static unsigned long __meminitdata table_end; |
299 | static unsigned long __meminitdata table_top; | ||
230 | 300 | ||
231 | static __meminit void *alloc_low_page(unsigned long *phys) | 301 | static __ref void *alloc_low_page(unsigned long *phys) |
232 | { | 302 | { |
233 | unsigned long pfn = table_end++; | 303 | unsigned long pfn = table_end++; |
234 | void *adr; | 304 | void *adr; |
@@ -240,7 +310,7 @@ static __meminit void *alloc_low_page(unsigned long *phys) | |||
240 | return adr; | 310 | return adr; |
241 | } | 311 | } |
242 | 312 | ||
243 | if (pfn >= end_pfn) | 313 | if (pfn >= table_top) |
244 | panic("alloc_low_page: ran out of memory"); | 314 | panic("alloc_low_page: ran out of memory"); |
245 | 315 | ||
246 | adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); | 316 | adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); |
@@ -249,7 +319,7 @@ static __meminit void *alloc_low_page(unsigned long *phys) | |||
249 | return adr; | 319 | return adr; |
250 | } | 320 | } |
251 | 321 | ||
252 | static __meminit void unmap_low_page(void *adr) | 322 | static __ref void unmap_low_page(void *adr) |
253 | { | 323 | { |
254 | if (after_bootmem) | 324 | if (after_bootmem) |
255 | return; | 325 | return; |
@@ -257,65 +327,71 @@ static __meminit void unmap_low_page(void *adr) | |||
257 | early_iounmap(adr, PAGE_SIZE); | 327 | early_iounmap(adr, PAGE_SIZE); |
258 | } | 328 | } |
259 | 329 | ||
260 | /* Must run before zap_low_mappings */ | 330 | static unsigned long __meminit |
261 | __meminit void *early_ioremap(unsigned long addr, unsigned long size) | 331 | phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, |
332 | pgprot_t prot) | ||
262 | { | 333 | { |
263 | pmd_t *pmd, *last_pmd; | 334 | unsigned pages = 0; |
264 | unsigned long vaddr; | 335 | unsigned long last_map_addr = end; |
265 | int i, pmds; | 336 | int i; |
337 | |||
338 | pte_t *pte = pte_page + pte_index(addr); | ||
266 | 339 | ||
267 | pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; | 340 | for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) { |
268 | vaddr = __START_KERNEL_map; | ||
269 | pmd = level2_kernel_pgt; | ||
270 | last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; | ||
271 | 341 | ||
272 | for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { | 342 | if (addr >= end) { |
273 | for (i = 0; i < pmds; i++) { | 343 | if (!after_bootmem) { |
274 | if (pmd_present(pmd[i])) | 344 | for(; i < PTRS_PER_PTE; i++, pte++) |
275 | goto continue_outer_loop; | 345 | set_pte(pte, __pte(0)); |
346 | } | ||
347 | break; | ||
276 | } | 348 | } |
277 | vaddr += addr & ~PMD_MASK; | ||
278 | addr &= PMD_MASK; | ||
279 | 349 | ||
280 | for (i = 0; i < pmds; i++, addr += PMD_SIZE) | 350 | /* |
281 | set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); | 351 | * We will re-use the existing mapping. |
282 | __flush_tlb_all(); | 352 | * Xen for example has some special requirements, like mapping |
353 | * pagetable pages as RO. So assume someone who pre-setup | ||
354 | * these mappings are more intelligent. | ||
355 | */ | ||
356 | if (pte_val(*pte)) | ||
357 | continue; | ||
283 | 358 | ||
284 | return (void *)vaddr; | 359 | if (0) |
285 | continue_outer_loop: | 360 | printk(" pte=%p addr=%lx pte=%016lx\n", |
286 | ; | 361 | pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); |
362 | pages++; | ||
363 | set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot)); | ||
364 | last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; | ||
287 | } | 365 | } |
288 | printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size); | ||
289 | 366 | ||
290 | return NULL; | 367 | update_page_count(PG_LEVEL_4K, pages); |
368 | |||
369 | return last_map_addr; | ||
291 | } | 370 | } |
292 | 371 | ||
293 | /* | 372 | static unsigned long __meminit |
294 | * To avoid virtual aliases later: | 373 | phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, |
295 | */ | 374 | pgprot_t prot) |
296 | __meminit void early_iounmap(void *addr, unsigned long size) | ||
297 | { | 375 | { |
298 | unsigned long vaddr; | 376 | pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); |
299 | pmd_t *pmd; | ||
300 | int i, pmds; | ||
301 | 377 | ||
302 | vaddr = (unsigned long)addr; | 378 | return phys_pte_init(pte, address, end, prot); |
303 | pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; | ||
304 | pmd = level2_kernel_pgt + pmd_index(vaddr); | ||
305 | |||
306 | for (i = 0; i < pmds; i++) | ||
307 | pmd_clear(pmd + i); | ||
308 | |||
309 | __flush_tlb_all(); | ||
310 | } | 379 | } |
311 | 380 | ||
312 | static unsigned long __meminit | 381 | static unsigned long __meminit |
313 | phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) | 382 | phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, |
383 | unsigned long page_size_mask, pgprot_t prot) | ||
314 | { | 384 | { |
385 | unsigned long pages = 0; | ||
386 | unsigned long last_map_addr = end; | ||
387 | |||
315 | int i = pmd_index(address); | 388 | int i = pmd_index(address); |
316 | 389 | ||
317 | for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { | 390 | for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { |
391 | unsigned long pte_phys; | ||
318 | pmd_t *pmd = pmd_page + pmd_index(address); | 392 | pmd_t *pmd = pmd_page + pmd_index(address); |
393 | pte_t *pte; | ||
394 | pgprot_t new_prot = prot; | ||
319 | 395 | ||
320 | if (address >= end) { | 396 | if (address >= end) { |
321 | if (!after_bootmem) { | 397 | if (!after_bootmem) { |
@@ -325,31 +401,71 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) | |||
325 | break; | 401 | break; |
326 | } | 402 | } |
327 | 403 | ||
328 | if (pmd_val(*pmd)) | 404 | if (pmd_val(*pmd)) { |
405 | if (!pmd_large(*pmd)) { | ||
406 | spin_lock(&init_mm.page_table_lock); | ||
407 | last_map_addr = phys_pte_update(pmd, address, | ||
408 | end, prot); | ||
409 | spin_unlock(&init_mm.page_table_lock); | ||
410 | continue; | ||
411 | } | ||
412 | /* | ||
413 | * If we are ok with PG_LEVEL_2M mapping, then we will | ||
414 | * use the existing mapping, | ||
415 | * | ||
416 | * Otherwise, we will split the large page mapping but | ||
417 | * use the same existing protection bits except for | ||
418 | * large page, so that we don't violate Intel's TLB | ||
419 | * Application note (317080) which says, while changing | ||
420 | * the page sizes, new and old translations should | ||
421 | * not differ with respect to page frame and | ||
422 | * attributes. | ||
423 | */ | ||
424 | if (page_size_mask & (1 << PG_LEVEL_2M)) | ||
425 | continue; | ||
426 | new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); | ||
427 | } | ||
428 | |||
429 | if (page_size_mask & (1<<PG_LEVEL_2M)) { | ||
430 | pages++; | ||
431 | spin_lock(&init_mm.page_table_lock); | ||
432 | set_pte((pte_t *)pmd, | ||
433 | pfn_pte(address >> PAGE_SHIFT, | ||
434 | __pgprot(pgprot_val(prot) | _PAGE_PSE))); | ||
435 | spin_unlock(&init_mm.page_table_lock); | ||
436 | last_map_addr = (address & PMD_MASK) + PMD_SIZE; | ||
329 | continue; | 437 | continue; |
438 | } | ||
330 | 439 | ||
331 | set_pte((pte_t *)pmd, | 440 | pte = alloc_low_page(&pte_phys); |
332 | pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); | 441 | last_map_addr = phys_pte_init(pte, address, end, new_prot); |
442 | unmap_low_page(pte); | ||
443 | |||
444 | spin_lock(&init_mm.page_table_lock); | ||
445 | pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); | ||
446 | spin_unlock(&init_mm.page_table_lock); | ||
333 | } | 447 | } |
334 | return address; | 448 | update_page_count(PG_LEVEL_2M, pages); |
449 | return last_map_addr; | ||
335 | } | 450 | } |
336 | 451 | ||
337 | static unsigned long __meminit | 452 | static unsigned long __meminit |
338 | phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) | 453 | phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, |
454 | unsigned long page_size_mask, pgprot_t prot) | ||
339 | { | 455 | { |
340 | pmd_t *pmd = pmd_offset(pud, 0); | 456 | pmd_t *pmd = pmd_offset(pud, 0); |
341 | unsigned long last_map_addr; | 457 | unsigned long last_map_addr; |
342 | 458 | ||
343 | spin_lock(&init_mm.page_table_lock); | 459 | last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); |
344 | last_map_addr = phys_pmd_init(pmd, address, end); | ||
345 | spin_unlock(&init_mm.page_table_lock); | ||
346 | __flush_tlb_all(); | 460 | __flush_tlb_all(); |
347 | return last_map_addr; | 461 | return last_map_addr; |
348 | } | 462 | } |
349 | 463 | ||
350 | static unsigned long __meminit | 464 | static unsigned long __meminit |
351 | phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) | 465 | phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, |
466 | unsigned long page_size_mask) | ||
352 | { | 467 | { |
468 | unsigned long pages = 0; | ||
353 | unsigned long last_map_addr = end; | 469 | unsigned long last_map_addr = end; |
354 | int i = pud_index(addr); | 470 | int i = pud_index(addr); |
355 | 471 | ||
@@ -357,6 +473,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) | |||
357 | unsigned long pmd_phys; | 473 | unsigned long pmd_phys; |
358 | pud_t *pud = pud_page + pud_index(addr); | 474 | pud_t *pud = pud_page + pud_index(addr); |
359 | pmd_t *pmd; | 475 | pmd_t *pmd; |
476 | pgprot_t prot = PAGE_KERNEL; | ||
360 | 477 | ||
361 | if (addr >= end) | 478 | if (addr >= end) |
362 | break; | 479 | break; |
@@ -368,42 +485,87 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) | |||
368 | } | 485 | } |
369 | 486 | ||
370 | if (pud_val(*pud)) { | 487 | if (pud_val(*pud)) { |
371 | if (!pud_large(*pud)) | 488 | if (!pud_large(*pud)) { |
372 | last_map_addr = phys_pmd_update(pud, addr, end); | 489 | last_map_addr = phys_pmd_update(pud, addr, end, |
373 | continue; | 490 | page_size_mask, prot); |
491 | continue; | ||
492 | } | ||
493 | /* | ||
494 | * If we are ok with PG_LEVEL_1G mapping, then we will | ||
495 | * use the existing mapping. | ||
496 | * | ||
497 | * Otherwise, we will split the gbpage mapping but use | ||
498 | * the same existing protection bits except for large | ||
499 | * page, so that we don't violate Intel's TLB | ||
500 | * Application note (317080) which says, while changing | ||
501 | * the page sizes, new and old translations should | ||
502 | * not differ with respect to page frame and | ||
503 | * attributes. | ||
504 | */ | ||
505 | if (page_size_mask & (1 << PG_LEVEL_1G)) | ||
506 | continue; | ||
507 | prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); | ||
374 | } | 508 | } |
375 | 509 | ||
376 | if (direct_gbpages) { | 510 | if (page_size_mask & (1<<PG_LEVEL_1G)) { |
511 | pages++; | ||
512 | spin_lock(&init_mm.page_table_lock); | ||
377 | set_pte((pte_t *)pud, | 513 | set_pte((pte_t *)pud, |
378 | pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); | 514 | pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); |
515 | spin_unlock(&init_mm.page_table_lock); | ||
379 | last_map_addr = (addr & PUD_MASK) + PUD_SIZE; | 516 | last_map_addr = (addr & PUD_MASK) + PUD_SIZE; |
380 | continue; | 517 | continue; |
381 | } | 518 | } |
382 | 519 | ||
383 | pmd = alloc_low_page(&pmd_phys); | 520 | pmd = alloc_low_page(&pmd_phys); |
521 | last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, | ||
522 | prot); | ||
523 | unmap_low_page(pmd); | ||
384 | 524 | ||
385 | spin_lock(&init_mm.page_table_lock); | 525 | spin_lock(&init_mm.page_table_lock); |
386 | set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); | 526 | pud_populate(&init_mm, pud, __va(pmd_phys)); |
387 | last_map_addr = phys_pmd_init(pmd, addr, end); | ||
388 | spin_unlock(&init_mm.page_table_lock); | 527 | spin_unlock(&init_mm.page_table_lock); |
389 | |||
390 | unmap_low_page(pmd); | ||
391 | } | 528 | } |
392 | __flush_tlb_all(); | 529 | __flush_tlb_all(); |
393 | 530 | ||
394 | return last_map_addr >> PAGE_SHIFT; | 531 | update_page_count(PG_LEVEL_1G, pages); |
532 | |||
533 | return last_map_addr; | ||
395 | } | 534 | } |
396 | 535 | ||
397 | static void __init find_early_table_space(unsigned long end) | 536 | static unsigned long __meminit |
537 | phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, | ||
538 | unsigned long page_size_mask) | ||
398 | { | 539 | { |
399 | unsigned long puds, pmds, tables, start; | 540 | pud_t *pud; |
541 | |||
542 | pud = (pud_t *)pgd_page_vaddr(*pgd); | ||
543 | |||
544 | return phys_pud_init(pud, addr, end, page_size_mask); | ||
545 | } | ||
546 | |||
547 | static void __init find_early_table_space(unsigned long end, int use_pse, | ||
548 | int use_gbpages) | ||
549 | { | ||
550 | unsigned long puds, pmds, ptes, tables, start; | ||
400 | 551 | ||
401 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | 552 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; |
402 | tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); | 553 | tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); |
403 | if (!direct_gbpages) { | 554 | if (use_gbpages) { |
555 | unsigned long extra; | ||
556 | extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); | ||
557 | pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; | ||
558 | } else | ||
404 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; | 559 | pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; |
405 | tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); | 560 | tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); |
406 | } | 561 | |
562 | if (use_pse) { | ||
563 | unsigned long extra; | ||
564 | extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); | ||
565 | ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
566 | } else | ||
567 | ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
568 | tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); | ||
407 | 569 | ||
408 | /* | 570 | /* |
409 | * RED-PEN putting page tables only on node 0 could | 571 | * RED-PEN putting page tables only on node 0 could |
@@ -417,10 +579,10 @@ static void __init find_early_table_space(unsigned long end) | |||
417 | 579 | ||
418 | table_start >>= PAGE_SHIFT; | 580 | table_start >>= PAGE_SHIFT; |
419 | table_end = table_start; | 581 | table_end = table_start; |
582 | table_top = table_start + (tables >> PAGE_SHIFT); | ||
420 | 583 | ||
421 | early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", | 584 | printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", |
422 | end, table_start << PAGE_SHIFT, | 585 | end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT); |
423 | (table_start << PAGE_SHIFT) + tables); | ||
424 | } | 586 | } |
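The table-space estimate above can be checked with a quick userspace re-statement (assuming x86_64 shifts and 8-byte table entries; the kernel of course uses sizeof(pud_t) and friends). With PSE and gbpages enabled, only the unaligned head/tail of the range needs lower-level tables, which is what the "extra" terms express:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PMD_SHIFT  21
    #define PUD_SHIFT  30
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PMD_SIZE   (1UL << PMD_SHIFT)
    #define PUD_SIZE   (1UL << PUD_SHIFT)

    static unsigned long roundup_page(unsigned long x)
    {
            return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
    }

    /* Same arithmetic as the hunk above, with 8-byte entries assumed. */
    static unsigned long table_bytes(unsigned long end, int use_pse, int use_gbpages)
    {
            unsigned long puds, pmds, ptes, tables, extra;

            puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
            tables = roundup_page(puds * 8);

            if (use_gbpages) {
                    extra = end - ((end >> PUD_SHIFT) << PUD_SHIFT);
                    pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
            } else
                    pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
            tables += roundup_page(pmds * 8);

            if (use_pse) {
                    extra = end - ((end >> PMD_SHIFT) << PMD_SHIFT);
                    ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
            } else
                    ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
            tables += roundup_page(ptes * 8);

            return tables;
    }

    int main(void)
    {
            unsigned long end = 4UL << 30;   /* 4 GB of RAM, for example */

            printf("pse+gb: %lu KB, 4k only: %lu KB\n",
                   table_bytes(end, 1, 1) >> 10, table_bytes(end, 0, 0) >> 10);
            return 0;
    }

For a fully aligned 4 GB range this is a few KB with large pages versus roughly 8 MB of page tables when everything is mapped with 4k pages.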
425 | 587 | ||
426 | static void __init init_gbpages(void) | 588 | static void __init init_gbpages(void) |
@@ -431,125 +593,85 @@ static void __init init_gbpages(void) | |||
431 | direct_gbpages = 0; | 593 | direct_gbpages = 0; |
432 | } | 594 | } |
433 | 595 | ||
434 | #ifdef CONFIG_MEMTEST_BOOTPARAM | 596 | static unsigned long __init kernel_physical_mapping_init(unsigned long start, |
435 | 597 | unsigned long end, | |
436 | static void __init memtest(unsigned long start_phys, unsigned long size, | 598 | unsigned long page_size_mask) |
437 | unsigned pattern) | 599 | { |
438 | { | ||
439 | unsigned long i; | ||
440 | unsigned long *start; | ||
441 | unsigned long start_bad; | ||
442 | unsigned long last_bad; | ||
443 | unsigned long val; | ||
444 | unsigned long start_phys_aligned; | ||
445 | unsigned long count; | ||
446 | unsigned long incr; | ||
447 | |||
448 | switch (pattern) { | ||
449 | case 0: | ||
450 | val = 0UL; | ||
451 | break; | ||
452 | case 1: | ||
453 | val = -1UL; | ||
454 | break; | ||
455 | case 2: | ||
456 | val = 0x5555555555555555UL; | ||
457 | break; | ||
458 | case 3: | ||
459 | val = 0xaaaaaaaaaaaaaaaaUL; | ||
460 | break; | ||
461 | default: | ||
462 | return; | ||
463 | } | ||
464 | 600 | ||
465 | incr = sizeof(unsigned long); | 601 | unsigned long next, last_map_addr = end; |
466 | start_phys_aligned = ALIGN(start_phys, incr); | ||
467 | count = (size - (start_phys_aligned - start_phys))/incr; | ||
468 | start = __va(start_phys_aligned); | ||
469 | start_bad = 0; | ||
470 | last_bad = 0; | ||
471 | |||
472 | for (i = 0; i < count; i++) | ||
473 | start[i] = val; | ||
474 | for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { | ||
475 | if (*start != val) { | ||
476 | if (start_phys_aligned == last_bad + incr) { | ||
477 | last_bad += incr; | ||
478 | } else { | ||
479 | if (start_bad) { | ||
480 | printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved", | ||
481 | val, start_bad, last_bad + incr); | ||
482 | reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); | ||
483 | } | ||
484 | start_bad = last_bad = start_phys_aligned; | ||
485 | } | ||
486 | } | ||
487 | } | ||
488 | if (start_bad) { | ||
489 | printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved", | ||
490 | val, start_bad, last_bad + incr); | ||
491 | reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); | ||
492 | } | ||
493 | 602 | ||
494 | } | 603 | start = (unsigned long)__va(start); |
604 | end = (unsigned long)__va(end); | ||
495 | 605 | ||
496 | static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE; | 606 | for (; start < end; start = next) { |
607 | pgd_t *pgd = pgd_offset_k(start); | ||
608 | unsigned long pud_phys; | ||
609 | pud_t *pud; | ||
497 | 610 | ||
498 | static int __init parse_memtest(char *arg) | 611 | next = (start + PGDIR_SIZE) & PGDIR_MASK; |
499 | { | 612 | if (next > end) |
500 | if (arg) | 613 | next = end; |
501 | memtest_pattern = simple_strtoul(arg, NULL, 0); | ||
502 | return 0; | ||
503 | } | ||
504 | 614 | ||
505 | early_param("memtest", parse_memtest); | 615 | if (pgd_val(*pgd)) { |
616 | last_map_addr = phys_pud_update(pgd, __pa(start), | ||
617 | __pa(end), page_size_mask); | ||
618 | continue; | ||
619 | } | ||
506 | 620 | ||
507 | static void __init early_memtest(unsigned long start, unsigned long end) | 621 | pud = alloc_low_page(&pud_phys); |
508 | { | 622 | last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), |
509 | u64 t_start, t_size; | 623 | page_size_mask); |
510 | unsigned pattern; | 624 | unmap_low_page(pud); |
511 | 625 | ||
512 | if (!memtest_pattern) | 626 | spin_lock(&init_mm.page_table_lock); |
513 | return; | 627 | pgd_populate(&init_mm, pgd, __va(pud_phys)); |
628 | spin_unlock(&init_mm.page_table_lock); | ||
629 | } | ||
630 | __flush_tlb_all(); | ||
514 | 631 | ||
515 | printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); | 632 | return last_map_addr; |
516 | for (pattern = 0; pattern < memtest_pattern; pattern++) { | 633 | } |
517 | t_start = start; | ||
518 | t_size = 0; | ||
519 | while (t_start < end) { | ||
520 | t_start = find_e820_area_size(t_start, &t_size, 1); | ||
521 | 634 | ||
522 | /* done ? */ | 635 | struct map_range { |
523 | if (t_start >= end) | 636 | unsigned long start; |
524 | break; | 637 | unsigned long end; |
525 | if (t_start + t_size > end) | 638 | unsigned page_size_mask; |
526 | t_size = end - t_start; | 639 | }; |
527 | 640 | ||
528 | printk(KERN_CONT "\n %016llx - %016llx pattern %d", | 641 | #define NR_RANGE_MR 5 |
529 | t_start, t_start + t_size, pattern); | ||
530 | 642 | ||
531 | memtest(t_start, t_size, pattern); | 643 | static int save_mr(struct map_range *mr, int nr_range, |
644 | unsigned long start_pfn, unsigned long end_pfn, | ||
645 | unsigned long page_size_mask) | ||
646 | { | ||
532 | 647 | ||
533 | t_start += t_size; | 648 | if (start_pfn < end_pfn) { |
534 | } | 649 | if (nr_range >= NR_RANGE_MR) |
650 | panic("run out of range for init_memory_mapping\n"); | ||
651 | mr[nr_range].start = start_pfn<<PAGE_SHIFT; | ||
652 | mr[nr_range].end = end_pfn<<PAGE_SHIFT; | ||
653 | mr[nr_range].page_size_mask = page_size_mask; | ||
654 | nr_range++; | ||
535 | } | 655 | } |
536 | printk(KERN_CONT "\n"); | 656 | |
537 | } | 657 | return nr_range; |
538 | #else | ||
539 | static void __init early_memtest(unsigned long start, unsigned long end) | ||
540 | { | ||
541 | } | 658 | } |
542 | #endif | ||
543 | 659 | ||
544 | /* | 660 | /* |
545 | * Setup the direct mapping of the physical memory at PAGE_OFFSET. | 661 | * Setup the direct mapping of the physical memory at PAGE_OFFSET. |
546 | * This runs before bootmem is initialized and gets pages directly from | 662 | * This runs before bootmem is initialized and gets pages directly from |
547 | * the physical memory. To access them they are temporarily mapped. | 663 | * the physical memory. To access them they are temporarily mapped. |
548 | */ | 664 | */ |
549 | unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end) | 665 | unsigned long __init_refok init_memory_mapping(unsigned long start, |
666 | unsigned long end) | ||
550 | { | 667 | { |
551 | unsigned long next, last_map_addr = end; | 668 | unsigned long last_map_addr = 0; |
552 | unsigned long start_phys = start, end_phys = end; | 669 | unsigned long page_size_mask = 0; |
670 | unsigned long start_pfn, end_pfn; | ||
671 | |||
672 | struct map_range mr[NR_RANGE_MR]; | ||
673 | int nr_range, i; | ||
674 | int use_pse, use_gbpages; | ||
553 | 675 | ||
554 | printk(KERN_INFO "init_memory_mapping\n"); | 676 | printk(KERN_INFO "init_memory_mapping\n"); |
555 | 677 | ||
@@ -560,48 +682,127 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned lon | |||
560 | * memory mapped. Unfortunately this is done currently before the | 682 | * memory mapped. Unfortunately this is done currently before the |
561 | * nodes are discovered. | 683 | * nodes are discovered. |
562 | */ | 684 | */ |
563 | if (!after_bootmem) { | 685 | if (!after_bootmem) |
564 | init_gbpages(); | 686 | init_gbpages(); |
565 | find_early_table_space(end); | ||
566 | } | ||
567 | 687 | ||
568 | start = (unsigned long)__va(start); | 688 | #ifdef CONFIG_DEBUG_PAGEALLOC |
569 | end = (unsigned long)__va(end); | 689 | /* |
690 | * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. | ||
691 | * This will simplify cpa(), which otherwise needs to support splitting | ||
692 | * large pages into small in interrupt context, etc. | ||
693 | */ | ||
694 | use_pse = use_gbpages = 0; | ||
695 | #else | ||
696 | use_pse = cpu_has_pse; | ||
697 | use_gbpages = direct_gbpages; | ||
698 | #endif | ||
570 | 699 | ||
571 | for (; start < end; start = next) { | 700 | if (use_gbpages) |
572 | pgd_t *pgd = pgd_offset_k(start); | 701 | page_size_mask |= 1 << PG_LEVEL_1G; |
573 | unsigned long pud_phys; | 702 | if (use_pse) |
574 | pud_t *pud; | 703 | page_size_mask |= 1 << PG_LEVEL_2M; |
704 | |||
705 | memset(mr, 0, sizeof(mr)); | ||
706 | nr_range = 0; | ||
707 | |||
708 | /* head if not big page aligned? */ | ||
709 | start_pfn = start >> PAGE_SHIFT; | ||
710 | end_pfn = ((start + (PMD_SIZE - 1)) >> PMD_SHIFT) | ||
711 | << (PMD_SHIFT - PAGE_SHIFT); | ||
712 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); | ||
713 | |||
714 | /* big page (2M) range */ | ||
715 | start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT) | ||
716 | << (PMD_SHIFT - PAGE_SHIFT); | ||
717 | end_pfn = ((start + (PUD_SIZE - 1))>>PUD_SHIFT) | ||
718 | << (PUD_SHIFT - PAGE_SHIFT); | ||
719 | if (end_pfn > ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT))) | ||
720 | end_pfn = ((end>>PUD_SHIFT)<<(PUD_SHIFT - PAGE_SHIFT)); | ||
721 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, | ||
722 | page_size_mask & (1<<PG_LEVEL_2M)); | ||
723 | |||
724 | /* big page (1G) range */ | ||
725 | start_pfn = end_pfn; | ||
726 | end_pfn = (end>>PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT); | ||
727 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, | ||
728 | page_size_mask & | ||
729 | ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G))); | ||
730 | |||
731 | /* tail that is not big page (1G) aligned */ | ||
732 | start_pfn = end_pfn; | ||
733 | end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); | ||
734 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, | ||
735 | page_size_mask & (1<<PG_LEVEL_2M)); | ||
736 | |||
737 | /* tail that is not big page (2M) aligned */ | ||
738 | start_pfn = end_pfn; | ||
739 | end_pfn = end>>PAGE_SHIFT; | ||
740 | nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); | ||
741 | |||
742 | /* try to merge contiguous ranges with the same page size */ | ||
743 | for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { | ||
744 | unsigned long old_start; | ||
745 | if (mr[i].end != mr[i+1].start || | ||
746 | mr[i].page_size_mask != mr[i+1].page_size_mask) | ||
747 | continue; | ||
748 | /* move it */ | ||
749 | old_start = mr[i].start; | ||
750 | memmove(&mr[i], &mr[i+1], | ||
751 | (nr_range - 1 - i) * sizeof (struct map_range)); | ||
752 | mr[i].start = old_start; | ||
753 | nr_range--; | ||
754 | } | ||
575 | 755 | ||
576 | if (after_bootmem) | 756 | for (i = 0; i < nr_range; i++) |
577 | pud = pud_offset(pgd, start & PGDIR_MASK); | 757 | printk(KERN_DEBUG " %010lx - %010lx page %s\n", |
578 | else | 758 | mr[i].start, mr[i].end, |
579 | pud = alloc_low_page(&pud_phys); | 759 | (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":( |
760 | (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); | ||
580 | 761 | ||
581 | next = start + PGDIR_SIZE; | 762 | if (!after_bootmem) |
582 | if (next > end) | 763 | find_early_table_space(end, use_pse, use_gbpages); |
583 | next = end; | 764 | |
584 | last_map_addr = phys_pud_init(pud, __pa(start), __pa(next)); | 765 | for (i = 0; i < nr_range; i++) |
585 | if (!after_bootmem) | 766 | last_map_addr = kernel_physical_mapping_init( |
586 | set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); | 767 | mr[i].start, mr[i].end, |
587 | unmap_low_page(pud); | 768 | mr[i].page_size_mask); |
588 | } | ||
589 | 769 | ||
590 | if (!after_bootmem) | 770 | if (!after_bootmem) |
591 | mmu_cr4_features = read_cr4(); | 771 | mmu_cr4_features = read_cr4(); |
592 | __flush_tlb_all(); | 772 | __flush_tlb_all(); |
593 | 773 | ||
594 | if (!after_bootmem) | 774 | if (!after_bootmem && table_end > table_start) |
595 | reserve_early(table_start << PAGE_SHIFT, | 775 | reserve_early(table_start << PAGE_SHIFT, |
596 | table_end << PAGE_SHIFT, "PGTABLE"); | 776 | table_end << PAGE_SHIFT, "PGTABLE"); |
597 | 777 | ||
778 | printk(KERN_INFO "last_map_addr: %lx end: %lx\n", | ||
779 | last_map_addr, end); | ||
780 | |||
598 | if (!after_bootmem) | 781 | if (!after_bootmem) |
599 | early_memtest(start_phys, end_phys); | 782 | early_memtest(start, end); |
600 | 783 | ||
601 | return last_map_addr; | 784 | return last_map_addr >> PAGE_SHIFT; |
602 | } | 785 | } |
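The head/2M/1G/tail splitting done above is easier to see with concrete numbers. Below is a standalone sketch of the same pfn arithmetic; the MR_2M/MR_1G bits are placeholders for 1 << PG_LEVEL_2M and 1 << PG_LEVEL_1G, and the merge pass over adjacent same-size ranges is omitted:

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SHIFT 12
    #define PMD_SHIFT  21
    #define PUD_SHIFT  30

    struct map_range { unsigned long start, end; unsigned mask; };

    #define MR_2M (1u << 1)   /* stands in for 1 << PG_LEVEL_2M */
    #define MR_1G (1u << 2)   /* stands in for 1 << PG_LEVEL_1G */

    static int save(struct map_range *mr, int n,
                    unsigned long s_pfn, unsigned long e_pfn, unsigned mask)
    {
            if (s_pfn < e_pfn) {
                    mr[n].start = s_pfn << PAGE_SHIFT;
                    mr[n].end   = e_pfn << PAGE_SHIFT;
                    mr[n].mask  = mask;
                    n++;
            }
            return n;
    }

    int main(void)
    {
            unsigned long start = 0, end = 0x7ff00000UL; /* ~2 GB, unaligned top */
            struct map_range mr[5];
            unsigned long s_pfn, e_pfn;
            int n = 0, i;

            memset(mr, 0, sizeof(mr));

            /* head below the first 2M boundary: 4k pages */
            s_pfn = start >> PAGE_SHIFT;
            e_pfn = ((start + (1UL << PMD_SHIFT) - 1) >> PMD_SHIFT)
                    << (PMD_SHIFT - PAGE_SHIFT);
            n = save(mr, n, s_pfn, e_pfn, 0);

            /* 2M pages up to the first 1G boundary */
            s_pfn = e_pfn;
            e_pfn = ((start + (1UL << PUD_SHIFT) - 1) >> PUD_SHIFT)
                    << (PUD_SHIFT - PAGE_SHIFT);
            if (e_pfn > ((end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT)))
                    e_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
            n = save(mr, n, s_pfn, e_pfn, MR_2M);

            /* 1G pages for the aligned middle */
            s_pfn = e_pfn;
            e_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
            n = save(mr, n, s_pfn, e_pfn, MR_2M | MR_1G);

            /* 2M tail, then 4k tail */
            s_pfn = e_pfn;
            e_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
            n = save(mr, n, s_pfn, e_pfn, MR_2M);
            s_pfn = e_pfn;
            e_pfn = end >> PAGE_SHIFT;
            n = save(mr, n, s_pfn, e_pfn, 0);

            for (i = 0; i < n; i++)
                    printf(" %010lx - %010lx page %s\n", mr[i].start, mr[i].end,
                           (mr[i].mask & MR_1G) ? "1G" :
                           (mr[i].mask & MR_2M) ? "2M" : "4k");
            return 0;
    }

For this example it prints a 1G range covering [0, 0x40000000), a 2M range up to 0x7fe00000, and a 4k tail up to 0x7ff00000, which mirrors the KERN_DEBUG output added in the hunk above.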
603 | 786 | ||
604 | #ifndef CONFIG_NUMA | 787 | #ifndef CONFIG_NUMA |
788 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) | ||
789 | { | ||
790 | unsigned long bootmap_size, bootmap; | ||
791 | |||
792 | bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; | ||
793 | bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size, | ||
794 | PAGE_SIZE); | ||
795 | if (bootmap == -1L) | ||
796 | panic("Cannot find bootmem map of size %ld\n", bootmap_size); | ||
797 | /* don't touch min_low_pfn */ | ||
798 | bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, | ||
799 | 0, end_pfn); | ||
800 | e820_register_active_regions(0, start_pfn, end_pfn); | ||
801 | free_bootmem_with_active_regions(0, end_pfn); | ||
802 | early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); | ||
803 | reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); | ||
804 | } | ||
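For the non-NUMA path above, the bootmem bitmap is one bit per page frame, rounded up to whole pages. A quick sketch of that estimate (a simplification of what bootmem_bootmap_pages() computes):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* One bit per page, rounded up to whole pages. */
    static unsigned long bootmap_bytes(unsigned long end_pfn)
    {
            unsigned long bytes = (end_pfn + 7) / 8;
            unsigned long pages = (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;

            return pages << PAGE_SHIFT;
    }

    int main(void)
    {
            /* 4 GB of RAM is 1M page frames -> a 128 KB bitmap */
            printf("bootmap for 4GB: %lu KB\n", bootmap_bytes(1UL << 20) >> 10);
            return 0;
    }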
805 | |||
605 | void __init paging_init(void) | 806 | void __init paging_init(void) |
606 | { | 807 | { |
607 | unsigned long max_zone_pfns[MAX_NR_ZONES]; | 808 | unsigned long max_zone_pfns[MAX_NR_ZONES]; |
@@ -609,9 +810,9 @@ void __init paging_init(void) | |||
609 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); | 810 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); |
610 | max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; | 811 | max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; |
611 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; | 812 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; |
612 | max_zone_pfns[ZONE_NORMAL] = end_pfn; | 813 | max_zone_pfns[ZONE_NORMAL] = max_pfn; |
613 | 814 | ||
614 | memory_present(0, 0, end_pfn); | 815 | memory_present(0, 0, max_pfn); |
615 | sparse_init(); | 816 | sparse_init(); |
616 | free_area_init_nodes(max_zone_pfns); | 817 | free_area_init_nodes(max_zone_pfns); |
617 | } | 818 | } |
@@ -681,6 +882,8 @@ void __init mem_init(void) | |||
681 | { | 882 | { |
682 | long codesize, reservedpages, datasize, initsize; | 883 | long codesize, reservedpages, datasize, initsize; |
683 | 884 | ||
885 | start_periodic_check_for_corruption(); | ||
886 | |||
684 | pci_iommu_alloc(); | 887 | pci_iommu_alloc(); |
685 | 888 | ||
686 | /* clear_bss() already clear the empty_zero_page */ | 889 | /* clear_bss() already clear the empty_zero_page */ |
@@ -693,8 +896,8 @@ void __init mem_init(void) | |||
693 | #else | 896 | #else |
694 | totalram_pages = free_all_bootmem(); | 897 | totalram_pages = free_all_bootmem(); |
695 | #endif | 898 | #endif |
696 | reservedpages = end_pfn - totalram_pages - | 899 | reservedpages = max_pfn - totalram_pages - |
697 | absent_pages_in_range(0, end_pfn); | 900 | absent_pages_in_range(0, max_pfn); |
698 | after_bootmem = 1; | 901 | after_bootmem = 1; |
699 | 902 | ||
700 | codesize = (unsigned long) &_etext - (unsigned long) &_text; | 903 | codesize = (unsigned long) &_etext - (unsigned long) &_text; |
@@ -713,13 +916,11 @@ void __init mem_init(void) | |||
713 | printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " | 916 | printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " |
714 | "%ldk reserved, %ldk data, %ldk init)\n", | 917 | "%ldk reserved, %ldk data, %ldk init)\n", |
715 | (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), | 918 | (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), |
716 | end_pfn << (PAGE_SHIFT-10), | 919 | max_pfn << (PAGE_SHIFT-10), |
717 | codesize >> 10, | 920 | codesize >> 10, |
718 | reservedpages << (PAGE_SHIFT-10), | 921 | reservedpages << (PAGE_SHIFT-10), |
719 | datasize >> 10, | 922 | datasize >> 10, |
720 | initsize >> 10); | 923 | initsize >> 10); |
721 | |||
722 | cpa_init(); | ||
723 | } | 924 | } |
724 | 925 | ||
725 | void free_init_pages(char *what, unsigned long begin, unsigned long end) | 926 | void free_init_pages(char *what, unsigned long begin, unsigned long end) |
@@ -766,6 +967,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data); | |||
766 | void mark_rodata_ro(void) | 967 | void mark_rodata_ro(void) |
767 | { | 968 | { |
768 | unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); | 969 | unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); |
970 | unsigned long rodata_start = | ||
971 | ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; | ||
972 | |||
973 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
974 | /* Dynamic tracing modifies the kernel text section */ | ||
975 | start = rodata_start; | ||
976 | #endif | ||
769 | 977 | ||
770 | printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", | 978 | printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", |
771 | (end - start) >> 10); | 979 | (end - start) >> 10); |
@@ -775,8 +983,7 @@ void mark_rodata_ro(void) | |||
775 | * The rodata section (but not the kernel text!) should also be | 983 | * The rodata section (but not the kernel text!) should also be |
776 | * not-executable. | 984 | * not-executable. |
777 | */ | 985 | */ |
778 | start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; | 986 | set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); |
779 | set_memory_nx(start, (end - start) >> PAGE_SHIFT); | ||
780 | 987 | ||
781 | rodata_test(); | 988 | rodata_test(); |
782 | 989 | ||
@@ -798,24 +1005,26 @@ void free_initrd_mem(unsigned long start, unsigned long end) | |||
798 | } | 1005 | } |
799 | #endif | 1006 | #endif |
800 | 1007 | ||
801 | void __init reserve_bootmem_generic(unsigned long phys, unsigned len) | 1008 | int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, |
1009 | int flags) | ||
802 | { | 1010 | { |
803 | #ifdef CONFIG_NUMA | 1011 | #ifdef CONFIG_NUMA |
804 | int nid, next_nid; | 1012 | int nid, next_nid; |
1013 | int ret; | ||
805 | #endif | 1014 | #endif |
806 | unsigned long pfn = phys >> PAGE_SHIFT; | 1015 | unsigned long pfn = phys >> PAGE_SHIFT; |
807 | 1016 | ||
808 | if (pfn >= end_pfn) { | 1017 | if (pfn >= max_pfn) { |
809 | /* | 1018 | /* |
810 | * This can happen with kdump kernels when accessing | 1019 | * This can happen with kdump kernels when accessing |
811 | * firmware tables: | 1020 | * firmware tables: |
812 | */ | 1021 | */ |
813 | if (pfn < max_pfn_mapped) | 1022 | if (pfn < max_pfn_mapped) |
814 | return; | 1023 | return -EFAULT; |
815 | 1024 | ||
816 | printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", | 1025 | printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n", |
817 | phys, len); | 1026 | phys, len); |
818 | return; | 1027 | return -EFAULT; |
819 | } | 1028 | } |
820 | 1029 | ||
821 | /* Should check here against the e820 map to avoid double free */ | 1030 | /* Should check here against the e820 map to avoid double free */ |
@@ -823,9 +1032,13 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) | |||
823 | nid = phys_to_nid(phys); | 1032 | nid = phys_to_nid(phys); |
824 | next_nid = phys_to_nid(phys + len - 1); | 1033 | next_nid = phys_to_nid(phys + len - 1); |
825 | if (nid == next_nid) | 1034 | if (nid == next_nid) |
826 | reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); | 1035 | ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags); |
827 | else | 1036 | else |
828 | reserve_bootmem(phys, len, BOOTMEM_DEFAULT); | 1037 | ret = reserve_bootmem(phys, len, flags); |
1038 | |||
1039 | if (ret != 0) | ||
1040 | return ret; | ||
1041 | |||
829 | #else | 1042 | #else |
830 | reserve_bootmem(phys, len, BOOTMEM_DEFAULT); | 1043 | reserve_bootmem(phys, len, BOOTMEM_DEFAULT); |
831 | #endif | 1044 | #endif |
@@ -834,6 +1047,8 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) | |||
834 | dma_reserve += len / PAGE_SIZE; | 1047 | dma_reserve += len / PAGE_SIZE; |
835 | set_dma_reserve(dma_reserve); | 1048 | set_dma_reserve(dma_reserve); |
836 | } | 1049 | } |
1050 | |||
1051 | return 0; | ||
837 | } | 1052 | } |
838 | 1053 | ||
839 | int kern_addr_valid(unsigned long addr) | 1054 | int kern_addr_valid(unsigned long addr) |
@@ -938,7 +1153,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) | |||
938 | pmd_t *pmd; | 1153 | pmd_t *pmd; |
939 | 1154 | ||
940 | for (; addr < end; addr = next) { | 1155 | for (; addr < end; addr = next) { |
941 | next = pmd_addr_end(addr, end); | 1156 | void *p = NULL; |
942 | 1157 | ||
943 | pgd = vmemmap_pgd_populate(addr, node); | 1158 | pgd = vmemmap_pgd_populate(addr, node); |
944 | if (!pgd) | 1159 | if (!pgd) |
@@ -948,33 +1163,51 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) | |||
948 | if (!pud) | 1163 | if (!pud) |
949 | return -ENOMEM; | 1164 | return -ENOMEM; |
950 | 1165 | ||
951 | pmd = pmd_offset(pud, addr); | 1166 | if (!cpu_has_pse) { |
952 | if (pmd_none(*pmd)) { | 1167 | next = (addr + PAGE_SIZE) & PAGE_MASK; |
953 | pte_t entry; | 1168 | pmd = vmemmap_pmd_populate(pud, addr, node); |
954 | void *p; | 1169 | |
1170 | if (!pmd) | ||
1171 | return -ENOMEM; | ||
1172 | |||
1173 | p = vmemmap_pte_populate(pmd, addr, node); | ||
955 | 1174 | ||
956 | p = vmemmap_alloc_block(PMD_SIZE, node); | ||
957 | if (!p) | 1175 | if (!p) |
958 | return -ENOMEM; | 1176 | return -ENOMEM; |
959 | 1177 | ||
960 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, | 1178 | addr_end = addr + PAGE_SIZE; |
961 | PAGE_KERNEL_LARGE); | 1179 | p_end = p + PAGE_SIZE; |
962 | set_pmd(pmd, __pmd(pte_val(entry))); | ||
963 | |||
964 | /* check to see if we have contiguous blocks */ | ||
965 | if (p_end != p || node_start != node) { | ||
966 | if (p_start) | ||
967 | printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", | ||
968 | addr_start, addr_end-1, p_start, p_end-1, node_start); | ||
969 | addr_start = addr; | ||
970 | node_start = node; | ||
971 | p_start = p; | ||
972 | } | ||
973 | addr_end = addr + PMD_SIZE; | ||
974 | p_end = p + PMD_SIZE; | ||
975 | } else { | 1180 | } else { |
976 | vmemmap_verify((pte_t *)pmd, node, addr, next); | 1181 | next = pmd_addr_end(addr, end); |
1182 | |||
1183 | pmd = pmd_offset(pud, addr); | ||
1184 | if (pmd_none(*pmd)) { | ||
1185 | pte_t entry; | ||
1186 | |||
1187 | p = vmemmap_alloc_block(PMD_SIZE, node); | ||
1188 | if (!p) | ||
1189 | return -ENOMEM; | ||
1190 | |||
1191 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, | ||
1192 | PAGE_KERNEL_LARGE); | ||
1193 | set_pmd(pmd, __pmd(pte_val(entry))); | ||
1194 | |||
1195 | /* check to see if we have contiguous blocks */ | ||
1196 | if (p_end != p || node_start != node) { | ||
1197 | if (p_start) | ||
1198 | printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", | ||
1199 | addr_start, addr_end-1, p_start, p_end-1, node_start); | ||
1200 | addr_start = addr; | ||
1201 | node_start = node; | ||
1202 | p_start = p; | ||
1203 | } | ||
1204 | |||
1205 | addr_end = addr + PMD_SIZE; | ||
1206 | p_end = p + PMD_SIZE; | ||
1207 | } else | ||
1208 | vmemmap_verify((pte_t *)pmd, node, addr, next); | ||
977 | } | 1209 | } |
1210 | |||
978 | } | 1211 | } |
979 | return 0; | 1212 | return 0; |
980 | } | 1213 | } |
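When cpu_has_pse is set, each vmemmap PMD maps one 2M block of struct page entries. Back-of-the-envelope arithmetic for how much physical memory one such block describes (assuming sizeof(struct page) is 64 bytes, a typical x86_64 value; adjust if your config differs):

    #include <stdio.h>

    #define PMD_SIZE  (1UL << 21)
    #define PAGE_SIZE (1UL << 12)

    int main(void)
    {
            unsigned long sz = 64;                          /* assumed sizeof(struct page) */
            unsigned long pages_per_pmd = PMD_SIZE / sz;    /* struct pages per 2M block */
            unsigned long mem_per_pmd = pages_per_pmd * PAGE_SIZE;

            printf("one 2M vmemmap block describes %lu pages = %lu MB of RAM\n",
                   pages_per_pmd, mem_per_pmd >> 20);
            return 0;
    }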
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 9dd3cb905971..8cbeda15cd29 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
14 | #include <linux/vmalloc.h> | 14 | #include <linux/vmalloc.h> |
15 | #include <linux/mmiotrace.h> | ||
15 | 16 | ||
16 | #include <asm/cacheflush.h> | 17 | #include <asm/cacheflush.h> |
17 | #include <asm/e820.h> | 18 | #include <asm/e820.h> |
@@ -101,6 +102,25 @@ int page_is_ram(unsigned long pagenr) | |||
101 | return 0; | 102 | return 0; |
102 | } | 103 | } |
103 | 104 | ||
105 | int pagerange_is_ram(unsigned long start, unsigned long end) | ||
106 | { | ||
107 | int ram_page = 0, not_rampage = 0; | ||
108 | unsigned long page_nr; | ||
109 | |||
110 | for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); | ||
111 | ++page_nr) { | ||
112 | if (page_is_ram(page_nr)) | ||
113 | ram_page = 1; | ||
114 | else | ||
115 | not_rampage = 1; | ||
116 | |||
117 | if (ram_page == not_rampage) | ||
118 | return -1; | ||
119 | } | ||
120 | |||
121 | return ram_page; | ||
122 | } | ||
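The new pagerange_is_ram() is tri-state: 1 when every page in the range is RAM, 0 when none is, and -1 for a mixed range. A userspace sketch of the same logic, with a fake page_is_ram() standing in for the e820-backed kernel helper:

    #include <stdio.h>

    /* Stand-in for page_is_ram(): pretend only the first 1 MB is RAM. */
    static int fake_page_is_ram(unsigned long pfn)
    {
            return pfn < 0x100;
    }

    /* 1 = all RAM, 0 = no RAM, -1 = mixed, as in the hunk above. */
    static int range_is_ram(unsigned long start, unsigned long end)
    {
            int ram = 0, not_ram = 0;
            unsigned long pfn;

            for (pfn = start >> 12; pfn < (end >> 12); pfn++) {
                    if (fake_page_is_ram(pfn))
                            ram = 1;
                    else
                            not_ram = 1;
                    if (ram && not_ram)
                            return -1;
            }
            return ram;
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   range_is_ram(0x0, 0x10000),        /* all RAM -> 1  */
                   range_is_ram(0x200000, 0x210000),  /* no RAM  -> 0  */
                   range_is_ram(0xf0000, 0x110000));  /* mixed   -> -1 */
            return 0;
    }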
123 | |||
104 | /* | 124 | /* |
105 | * Fix up the linear direct mapping of the kernel to avoid cache attribute | 125 | * Fix up the linear direct mapping of the kernel to avoid cache attribute |
106 | * conflicts. | 126 | * conflicts. |
@@ -141,10 +161,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, | |||
141 | { | 161 | { |
142 | unsigned long pfn, offset, vaddr; | 162 | unsigned long pfn, offset, vaddr; |
143 | resource_size_t last_addr; | 163 | resource_size_t last_addr; |
164 | const resource_size_t unaligned_phys_addr = phys_addr; | ||
165 | const unsigned long unaligned_size = size; | ||
144 | struct vm_struct *area; | 166 | struct vm_struct *area; |
145 | unsigned long new_prot_val; | 167 | unsigned long new_prot_val; |
146 | pgprot_t prot; | 168 | pgprot_t prot; |
147 | int retval; | 169 | int retval; |
170 | void __iomem *ret_addr; | ||
148 | 171 | ||
149 | /* Don't allow wraparound or zero size */ | 172 | /* Don't allow wraparound or zero size */ |
150 | last_addr = phys_addr + size - 1; | 173 | last_addr = phys_addr + size - 1; |
@@ -161,7 +184,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, | |||
161 | /* | 184 | /* |
162 | * Don't remap the low PCI/ISA area, it's always mapped.. | 185 | * Don't remap the low PCI/ISA area, it's always mapped.. |
163 | */ | 186 | */ |
164 | if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) | 187 | if (is_ISA_range(phys_addr, last_addr)) |
165 | return (__force void __iomem *)phys_to_virt(phys_addr); | 188 | return (__force void __iomem *)phys_to_virt(phys_addr); |
166 | 189 | ||
167 | /* | 190 | /* |
@@ -185,7 +208,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, | |||
185 | phys_addr &= PAGE_MASK; | 208 | phys_addr &= PAGE_MASK; |
186 | size = PAGE_ALIGN(last_addr+1) - phys_addr; | 209 | size = PAGE_ALIGN(last_addr+1) - phys_addr; |
187 | 210 | ||
188 | retval = reserve_memtype(phys_addr, phys_addr + size, | 211 | retval = reserve_memtype(phys_addr, (u64)phys_addr + size, |
189 | prot_val, &new_prot_val); | 212 | prot_val, &new_prot_val); |
190 | if (retval) { | 213 | if (retval) { |
191 | pr_debug("Warning: reserve_memtype returned %d\n", retval); | 214 | pr_debug("Warning: reserve_memtype returned %d\n", retval); |
@@ -252,7 +275,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, | |||
252 | return NULL; | 275 | return NULL; |
253 | } | 276 | } |
254 | 277 | ||
255 | return (void __iomem *) (vaddr + offset); | 278 | ret_addr = (void __iomem *) (vaddr + offset); |
279 | mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); | ||
280 | |||
281 | return ret_addr; | ||
256 | } | 282 | } |
257 | 283 | ||
258 | /** | 284 | /** |
@@ -280,7 +306,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) | |||
280 | { | 306 | { |
281 | /* | 307 | /* |
282 | * Ideally, this should be: | 308 | * Ideally, this should be: |
283 | * pat_wc_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; | 309 | * pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS; |
284 | * | 310 | * |
285 | * Till we fix all X drivers to use ioremap_wc(), we will use | 311 | * Till we fix all X drivers to use ioremap_wc(), we will use |
286 | * UC MINUS. | 312 | * UC MINUS. |
@@ -304,7 +330,7 @@ EXPORT_SYMBOL(ioremap_nocache); | |||
304 | */ | 330 | */ |
305 | void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) | 331 | void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) |
306 | { | 332 | { |
307 | if (pat_wc_enabled) | 333 | if (pat_enabled) |
308 | return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, | 334 | return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, |
309 | __builtin_return_address(0)); | 335 | __builtin_return_address(0)); |
310 | else | 336 | else |
@@ -319,6 +345,37 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) | |||
319 | } | 345 | } |
320 | EXPORT_SYMBOL(ioremap_cache); | 346 | EXPORT_SYMBOL(ioremap_cache); |
321 | 347 | ||
348 | static void __iomem *ioremap_default(resource_size_t phys_addr, | ||
349 | unsigned long size) | ||
350 | { | ||
351 | unsigned long flags; | ||
352 | void *ret; | ||
353 | int err; | ||
354 | |||
355 | /* | ||
356 | * - WB for WB-able memory and no other conflicting mappings | ||
357 | * - UC_MINUS for non-WB-able memory with no other conflicting mappings | ||
358 | * - Inherit from conflicting mappings otherwise | ||
359 | */ | ||
360 | err = reserve_memtype(phys_addr, phys_addr + size, -1, &flags); | ||
361 | if (err < 0) | ||
362 | return NULL; | ||
363 | |||
364 | ret = (void *) __ioremap_caller(phys_addr, size, flags, | ||
365 | __builtin_return_address(0)); | ||
366 | |||
367 | free_memtype(phys_addr, phys_addr + size); | ||
368 | return (void __iomem *)ret; | ||
369 | } | ||
370 | |||
371 | void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, | ||
372 | unsigned long prot_val) | ||
373 | { | ||
374 | return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK), | ||
375 | __builtin_return_address(0)); | ||
376 | } | ||
377 | EXPORT_SYMBOL(ioremap_prot); | ||
378 | |||
322 | /** | 379 | /** |
323 | * iounmap - Free a IO remapping | 380 | * iounmap - Free a IO remapping |
324 | * @addr: virtual address from ioremap_* | 381 | * @addr: virtual address from ioremap_* |
@@ -337,13 +394,15 @@ void iounmap(volatile void __iomem *addr) | |||
337 | * vm_area and by simply returning an address into the kernel mapping | 394 | * vm_area and by simply returning an address into the kernel mapping |
338 | * of ISA space. So handle that here. | 395 | * of ISA space. So handle that here. |
339 | */ | 396 | */ |
340 | if (addr >= phys_to_virt(ISA_START_ADDRESS) && | 397 | if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && |
341 | addr < phys_to_virt(ISA_END_ADDRESS)) | 398 | (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) |
342 | return; | 399 | return; |
343 | 400 | ||
344 | addr = (volatile void __iomem *) | 401 | addr = (volatile void __iomem *) |
345 | (PAGE_MASK & (unsigned long __force)addr); | 402 | (PAGE_MASK & (unsigned long __force)addr); |
346 | 403 | ||
404 | mmiotrace_iounmap(addr); | ||
405 | |||
347 | /* Use the vm area unlocked, assuming the caller | 406 | /* Use the vm area unlocked, assuming the caller |
348 | ensures there isn't another iounmap for the same address | 407 | ensures there isn't another iounmap for the same address |
349 | in parallel. Reuse of the virtual address is prevented by | 408 | in parallel. Reuse of the virtual address is prevented by |
@@ -351,7 +410,7 @@ void iounmap(volatile void __iomem *addr) | |||
351 | cpa takes care of the direct mappings. */ | 410 | cpa takes care of the direct mappings. */ |
352 | read_lock(&vmlist_lock); | 411 | read_lock(&vmlist_lock); |
353 | for (p = vmlist; p; p = p->next) { | 412 | for (p = vmlist; p; p = p->next) { |
354 | if (p->addr == addr) | 413 | if (p->addr == (void __force *)addr) |
355 | break; | 414 | break; |
356 | } | 415 | } |
357 | read_unlock(&vmlist_lock); | 416 | read_unlock(&vmlist_lock); |
@@ -365,7 +424,7 @@ void iounmap(volatile void __iomem *addr) | |||
365 | free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); | 424 | free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p)); |
366 | 425 | ||
367 | /* Finally remove it */ | 426 | /* Finally remove it */ |
368 | o = remove_vm_area((void *)addr); | 427 | o = remove_vm_area((void __force *)addr); |
369 | BUG_ON(p != o || o == NULL); | 428 | BUG_ON(p != o || o == NULL); |
370 | kfree(p); | 429 | kfree(p); |
371 | } | 430 | } |
@@ -384,7 +443,7 @@ void *xlate_dev_mem_ptr(unsigned long phys) | |||
384 | if (page_is_ram(start >> PAGE_SHIFT)) | 443 | if (page_is_ram(start >> PAGE_SHIFT)) |
385 | return __va(phys); | 444 | return __va(phys); |
386 | 445 | ||
387 | addr = (void *)ioremap(start, PAGE_SIZE); | 446 | addr = (void __force *)ioremap_default(start, PAGE_SIZE); |
388 | if (addr) | 447 | if (addr) |
389 | addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); | 448 | addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); |
390 | 449 | ||
@@ -400,9 +459,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr) | |||
400 | return; | 459 | return; |
401 | } | 460 | } |
402 | 461 | ||
403 | #ifdef CONFIG_X86_32 | 462 | static int __initdata early_ioremap_debug; |
404 | |||
405 | int __initdata early_ioremap_debug; | ||
406 | 463 | ||
407 | static int __init early_ioremap_debug_setup(char *str) | 464 | static int __init early_ioremap_debug_setup(char *str) |
408 | { | 465 | { |
@@ -413,8 +470,7 @@ static int __init early_ioremap_debug_setup(char *str) | |||
413 | early_param("early_ioremap_debug", early_ioremap_debug_setup); | 470 | early_param("early_ioremap_debug", early_ioremap_debug_setup); |
414 | 471 | ||
415 | static __initdata int after_paging_init; | 472 | static __initdata int after_paging_init; |
416 | static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] | 473 | static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; |
417 | __section(.bss.page_aligned); | ||
418 | 474 | ||
419 | static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) | 475 | static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) |
420 | { | 476 | { |
@@ -503,10 +559,11 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, | |||
503 | return; | 559 | return; |
504 | } | 560 | } |
505 | pte = early_ioremap_pte(addr); | 561 | pte = early_ioremap_pte(addr); |
562 | |||
506 | if (pgprot_val(flags)) | 563 | if (pgprot_val(flags)) |
507 | set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); | 564 | set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); |
508 | else | 565 | else |
509 | pte_clear(NULL, addr, pte); | 566 | pte_clear(&init_mm, addr, pte); |
510 | __flush_tlb_one(addr); | 567 | __flush_tlb_one(addr); |
511 | } | 568 | } |
512 | 569 | ||
@@ -528,19 +585,17 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx) | |||
528 | } | 585 | } |
529 | 586 | ||
530 | 587 | ||
531 | int __initdata early_ioremap_nested; | 588 | static int __initdata early_ioremap_nested; |
532 | 589 | ||
533 | static int __init check_early_ioremap_leak(void) | 590 | static int __init check_early_ioremap_leak(void) |
534 | { | 591 | { |
535 | if (!early_ioremap_nested) | 592 | if (!early_ioremap_nested) |
536 | return 0; | 593 | return 0; |
537 | 594 | WARN(1, KERN_WARNING | |
538 | printk(KERN_WARNING | ||
539 | "Debug warning: early ioremap leak of %d areas detected.\n", | 595 | "Debug warning: early ioremap leak of %d areas detected.\n", |
540 | early_ioremap_nested); | 596 | early_ioremap_nested); |
541 | printk(KERN_WARNING | 597 | printk(KERN_WARNING |
542 | "please boot with early_ioremap_debug and report the dmesg.\n"); | 598 | "please boot with early_ioremap_debug and report the dmesg.\n"); |
543 | WARN_ON(1); | ||
544 | 599 | ||
545 | return 1; | 600 | return 1; |
546 | } | 601 | } |
@@ -578,7 +633,7 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) | |||
578 | */ | 633 | */ |
579 | offset = phys_addr & ~PAGE_MASK; | 634 | offset = phys_addr & ~PAGE_MASK; |
580 | phys_addr &= PAGE_MASK; | 635 | phys_addr &= PAGE_MASK; |
581 | size = PAGE_ALIGN(last_addr) - phys_addr; | 636 | size = PAGE_ALIGN(last_addr + 1) - phys_addr; |
582 | 637 | ||
583 | /* | 638 | /* |
584 | * Mappings have to fit in the FIX_BTMAP area. | 639 | * Mappings have to fit in the FIX_BTMAP area. |
@@ -644,5 +699,3 @@ void __this_fixmap_does_not_exist(void) | |||
644 | { | 699 | { |
645 | WARN_ON(1); | 700 | WARN_ON(1); |
646 | } | 701 | } |
647 | |||
648 | #endif /* CONFIG_X86_32 */ | ||
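One small but important fix in the ioremap.c hunk is in early_ioremap(): the mapped size is now PAGE_ALIGN(last_addr + 1) - phys_addr. The difference only shows up when last_addr lands exactly on a page boundary; a tiny sketch with numbers chosen to hit that case:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

    int main(void)
    {
            unsigned long phys_addr = 0x1000, size = 0x1001;
            unsigned long last_addr = phys_addr + size - 1;  /* 0x2000, page aligned */

            unsigned long old_size = PAGE_ALIGN(last_addr) - phys_addr;     /* one page short */
            unsigned long new_size = PAGE_ALIGN(last_addr + 1) - phys_addr; /* covers last byte */

            printf("old=%#lx new=%#lx\n", old_size, new_size);
            return 0;
    }

With the old expression only one page would have been mapped even though the requested range spills one byte into the next page; the corrected expression maps two.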
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index 1f476e477844..41f1b5c00a1d 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <asm/numa.h> | 22 | #include <asm/numa.h> |
23 | #include <asm/mpspec.h> | 23 | #include <asm/mpspec.h> |
24 | #include <asm/apic.h> | 24 | #include <asm/apic.h> |
25 | #include <asm/k8.h> | ||
25 | 26 | ||
26 | static __init int find_northbridge(void) | 27 | static __init int find_northbridge(void) |
27 | { | 28 | { |
@@ -56,34 +57,33 @@ static __init void early_get_boot_cpu_id(void) | |||
56 | /* | 57 | /* |
57 | * Find possible boot-time SMP configuration: | 58 | * Find possible boot-time SMP configuration: |
58 | */ | 59 | */ |
60 | #ifdef CONFIG_X86_MPPARSE | ||
59 | early_find_smp_config(); | 61 | early_find_smp_config(); |
62 | #endif | ||
60 | #ifdef CONFIG_ACPI | 63 | #ifdef CONFIG_ACPI |
61 | /* | 64 | /* |
62 | * Read APIC information from ACPI tables. | 65 | * Read APIC information from ACPI tables. |
63 | */ | 66 | */ |
64 | early_acpi_boot_init(); | 67 | early_acpi_boot_init(); |
65 | #endif | 68 | #endif |
69 | #ifdef CONFIG_X86_MPPARSE | ||
66 | /* | 70 | /* |
67 | * get boot-time SMP configuration: | 71 | * get boot-time SMP configuration: |
68 | */ | 72 | */ |
69 | if (smp_found_config) | 73 | if (smp_found_config) |
70 | early_get_smp_config(); | 74 | early_get_smp_config(); |
75 | #endif | ||
71 | early_init_lapic_mapping(); | 76 | early_init_lapic_mapping(); |
72 | } | 77 | } |
73 | 78 | ||
74 | int __init k8_scan_nodes(unsigned long start, unsigned long end) | 79 | int __init k8_scan_nodes(unsigned long start, unsigned long end) |
75 | { | 80 | { |
81 | unsigned numnodes, cores, bits, apicid_base; | ||
76 | unsigned long prevbase; | 82 | unsigned long prevbase; |
77 | struct bootnode nodes[8]; | 83 | struct bootnode nodes[8]; |
78 | int nodeid, i, nb; | ||
79 | unsigned char nodeids[8]; | 84 | unsigned char nodeids[8]; |
80 | int found = 0; | 85 | int i, j, nb, found = 0; |
81 | u32 reg; | 86 | u32 nodeid, reg; |
82 | unsigned numnodes; | ||
83 | unsigned cores; | ||
84 | unsigned bits; | ||
85 | int j; | ||
86 | unsigned apicid_base; | ||
87 | 87 | ||
88 | if (!early_pci_allowed()) | 88 | if (!early_pci_allowed()) |
89 | return -1; | 89 | return -1; |
@@ -105,7 +105,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) | |||
105 | prevbase = 0; | 105 | prevbase = 0; |
106 | for (i = 0; i < 8; i++) { | 106 | for (i = 0; i < 8; i++) { |
107 | unsigned long base, limit; | 107 | unsigned long base, limit; |
108 | u32 nodeid; | ||
109 | 108 | ||
110 | base = read_pci_config(0, nb, 1, 0x40 + i*8); | 109 | base = read_pci_config(0, nb, 1, 0x40 + i*8); |
111 | limit = read_pci_config(0, nb, 1, 0x44 + i*8); | 110 | limit = read_pci_config(0, nb, 1, 0x44 + i*8); |
@@ -144,8 +143,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) | |||
144 | limit |= (1<<24)-1; | 143 | limit |= (1<<24)-1; |
145 | limit++; | 144 | limit++; |
146 | 145 | ||
147 | if (limit > end_pfn << PAGE_SHIFT) | 146 | if (limit > max_pfn << PAGE_SHIFT) |
148 | limit = end_pfn << PAGE_SHIFT; | 147 | limit = max_pfn << PAGE_SHIFT; |
149 | if (limit <= base) | 148 | if (limit <= base) |
150 | continue; | 149 | continue; |
151 | 150 | ||
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c new file mode 100644 index 000000000000..93d82038af4b --- /dev/null +++ b/arch/x86/mm/kmmio.c | |||
@@ -0,0 +1,510 @@ | |||
1 | /* Support for MMIO probes. | ||
2 | * Benefits much code from kprobes | ||
3 | * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>. | ||
4 | * 2007 Alexander Eichner | ||
5 | * 2008 Pekka Paalanen <pq@iki.fi> | ||
6 | */ | ||
7 | |||
8 | #include <linux/list.h> | ||
9 | #include <linux/rculist.h> | ||
10 | #include <linux/spinlock.h> | ||
11 | #include <linux/hash.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/uaccess.h> | ||
16 | #include <linux/ptrace.h> | ||
17 | #include <linux/preempt.h> | ||
18 | #include <linux/percpu.h> | ||
19 | #include <linux/kdebug.h> | ||
20 | #include <linux/mutex.h> | ||
21 | #include <linux/io.h> | ||
22 | #include <asm/cacheflush.h> | ||
23 | #include <asm/tlbflush.h> | ||
24 | #include <linux/errno.h> | ||
25 | #include <asm/debugreg.h> | ||
26 | #include <linux/mmiotrace.h> | ||
27 | |||
28 | #define KMMIO_PAGE_HASH_BITS 4 | ||
29 | #define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS) | ||
30 | |||
31 | struct kmmio_fault_page { | ||
32 | struct list_head list; | ||
33 | struct kmmio_fault_page *release_next; | ||
34 | unsigned long page; /* location of the fault page */ | ||
35 | |||
36 | /* | ||
37 | * Number of times this page has been registered as a part | ||
38 | * of a probe. If zero, page is disarmed and this may be freed. | ||
39 | * Used only by writers (RCU). | ||
40 | */ | ||
41 | int count; | ||
42 | }; | ||
43 | |||
44 | struct kmmio_delayed_release { | ||
45 | struct rcu_head rcu; | ||
46 | struct kmmio_fault_page *release_list; | ||
47 | }; | ||
48 | |||
49 | struct kmmio_context { | ||
50 | struct kmmio_fault_page *fpage; | ||
51 | struct kmmio_probe *probe; | ||
52 | unsigned long saved_flags; | ||
53 | unsigned long addr; | ||
54 | int active; | ||
55 | }; | ||
56 | |||
57 | static DEFINE_SPINLOCK(kmmio_lock); | ||
58 | |||
59 | /* Protected by kmmio_lock */ | ||
60 | unsigned int kmmio_count; | ||
61 | |||
62 | /* Read-protected by RCU, write-protected by kmmio_lock. */ | ||
63 | static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; | ||
64 | static LIST_HEAD(kmmio_probes); | ||
65 | |||
66 | static struct list_head *kmmio_page_list(unsigned long page) | ||
67 | { | ||
68 | return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; | ||
69 | } | ||
70 | |||
71 | /* Accessed per-cpu */ | ||
72 | static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); | ||
73 | |||
74 | /* | ||
75 | * this is basically a dynamic stabbing problem: | ||
76 | * Could use the existing prio tree code or | ||
77 | * Possible better implementations: | ||
78 | * The Interval Skip List: A Data Structure for Finding All Intervals That | ||
79 | * Overlap a Point (might be simple) | ||
80 | * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup | ||
81 | */ | ||
82 | /* Get the kmmio at this addr (if any). You must be holding RCU read lock. */ | ||
83 | static struct kmmio_probe *get_kmmio_probe(unsigned long addr) | ||
84 | { | ||
85 | struct kmmio_probe *p; | ||
86 | list_for_each_entry_rcu(p, &kmmio_probes, list) { | ||
87 | if (addr >= p->addr && addr <= (p->addr + p->len)) | ||
88 | return p; | ||
89 | } | ||
90 | return NULL; | ||
91 | } | ||
92 | |||
93 | /* You must be holding RCU read lock. */ | ||
94 | static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) | ||
95 | { | ||
96 | struct list_head *head; | ||
97 | struct kmmio_fault_page *p; | ||
98 | |||
99 | page &= PAGE_MASK; | ||
100 | head = kmmio_page_list(page); | ||
101 | list_for_each_entry_rcu(p, head, list) { | ||
102 | if (p->page == page) | ||
103 | return p; | ||
104 | } | ||
105 | return NULL; | ||
106 | } | ||
107 | |||
108 | static void set_page_present(unsigned long addr, bool present, | ||
109 | unsigned int *pglevel) | ||
110 | { | ||
111 | pteval_t pteval; | ||
112 | pmdval_t pmdval; | ||
113 | unsigned int level; | ||
114 | pmd_t *pmd; | ||
115 | pte_t *pte = lookup_address(addr, &level); | ||
116 | |||
117 | if (!pte) { | ||
118 | pr_err("kmmio: no pte for page 0x%08lx\n", addr); | ||
119 | return; | ||
120 | } | ||
121 | |||
122 | if (pglevel) | ||
123 | *pglevel = level; | ||
124 | |||
125 | switch (level) { | ||
126 | case PG_LEVEL_2M: | ||
127 | pmd = (pmd_t *)pte; | ||
128 | pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; | ||
129 | if (present) | ||
130 | pmdval |= _PAGE_PRESENT; | ||
131 | set_pmd(pmd, __pmd(pmdval)); | ||
132 | break; | ||
133 | |||
134 | case PG_LEVEL_4K: | ||
135 | pteval = pte_val(*pte) & ~_PAGE_PRESENT; | ||
136 | if (present) | ||
137 | pteval |= _PAGE_PRESENT; | ||
138 | set_pte_atomic(pte, __pte(pteval)); | ||
139 | break; | ||
140 | |||
141 | default: | ||
142 | pr_err("kmmio: unexpected page level 0x%x.\n", level); | ||
143 | return; | ||
144 | } | ||
145 | |||
146 | __flush_tlb_one(addr); | ||
147 | } | ||
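Arming a kmmio page reduces to clearing the present bit in the 4k pte (or 2M pmd) value so the next access faults into kmmio_handler(); disarming restores it. A minimal sketch of just that bit manipulation on a raw entry value (simplified: the kernel goes through set_pte_atomic()/set_pmd() and flushes the TLB, and the example entry value is made up):

    #include <stdio.h>
    #include <stdint.h>

    #define _PAGE_PRESENT 0x1ULL

    static uint64_t set_present(uint64_t entry, int present)
    {
            entry &= ~_PAGE_PRESENT;
            if (present)
                    entry |= _PAGE_PRESENT;
            return entry;
    }

    int main(void)
    {
            /* frame | PRESENT,RW,ACCESSED,DIRTY -- example values only */
            uint64_t pte = 0xfd000000ULL | 0x63;

            printf("armed (not present): %#llx\n",
                   (unsigned long long)set_present(pte, 0));
            printf("disarmed (present):  %#llx\n",
                   (unsigned long long)set_present(pte, 1));
            return 0;
    }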
148 | |||
149 | /** Mark the given page as not present. Access to it will trigger a fault. */ | ||
150 | static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) | ||
151 | { | ||
152 | set_page_present(page & PAGE_MASK, false, pglevel); | ||
153 | } | ||
154 | |||
155 | /** Mark the given page as present. */ | ||
156 | static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) | ||
157 | { | ||
158 | set_page_present(page & PAGE_MASK, true, pglevel); | ||
159 | } | ||
160 | |||
161 | /* | ||
162 | * This is being called from do_page_fault(). | ||
163 | * | ||
164 | * We may be in an interrupt or a critical section. Also prefetching may | ||
165 | * trigger a page fault. We may be in the middle of a process switch. | ||
166 | * We cannot take any locks, because we could be executing especially | ||
167 | * within a kmmio critical section. | ||
168 | * | ||
169 | * Local interrupts are disabled, so preemption cannot happen. | ||
170 | * Do not enable interrupts, do not sleep, and watch out for other CPUs. | ||
171 | */ | ||
172 | /* | ||
173 | * Interrupts are disabled on entry as trap3 is an interrupt gate | ||
174 | * and they remain disabled throughout this function. | ||
175 | */ | ||
176 | int kmmio_handler(struct pt_regs *regs, unsigned long addr) | ||
177 | { | ||
178 | struct kmmio_context *ctx; | ||
179 | struct kmmio_fault_page *faultpage; | ||
180 | int ret = 0; /* default to fault not handled */ | ||
181 | |||
182 | /* | ||
183 | * Preemption is now disabled to prevent process switch during | ||
184 | * single stepping. We can only handle one active kmmio trace | ||
185 | * per cpu, so ensure that we finish it before something else | ||
186 | * gets to run. We also hold the RCU read lock over single | ||
187 | * stepping to avoid looking up the probe and kmmio_fault_page | ||
188 | * again. | ||
189 | */ | ||
190 | preempt_disable(); | ||
191 | rcu_read_lock(); | ||
192 | |||
193 | faultpage = get_kmmio_fault_page(addr); | ||
194 | if (!faultpage) { | ||
195 | /* | ||
196 | * Either this page fault is not caused by kmmio, or | ||
197 | * another CPU just pulled the kmmio probe from under | ||
198 | * our feet. The latter case should not be possible. | ||
199 | */ | ||
200 | goto no_kmmio; | ||
201 | } | ||
202 | |||
203 | ctx = &get_cpu_var(kmmio_ctx); | ||
204 | if (ctx->active) { | ||
205 | disarm_kmmio_fault_page(faultpage->page, NULL); | ||
206 | if (addr == ctx->addr) { | ||
207 | /* | ||
208 | * On SMP we sometimes get recursive probe hits on the | ||
209 | * same address. Context is already saved, fall out. | ||
210 | */ | ||
211 | pr_debug("kmmio: duplicate probe hit on CPU %d, for " | ||
212 | "address 0x%08lx.\n", | ||
213 | smp_processor_id(), addr); | ||
214 | ret = 1; | ||
215 | goto no_kmmio_ctx; | ||
216 | } | ||
217 | /* | ||
218 | * Prevent overwriting already in-flight context. | ||
219 | * This should not happen, let's hope disarming at least | ||
220 | * prevents a panic. | ||
221 | */ | ||
222 | pr_emerg("kmmio: recursive probe hit on CPU %d, " | ||
223 | "for address 0x%08lx. Ignoring.\n", | ||
224 | smp_processor_id(), addr); | ||
225 | pr_emerg("kmmio: previous hit was at 0x%08lx.\n", | ||
226 | ctx->addr); | ||
227 | goto no_kmmio_ctx; | ||
228 | } | ||
229 | ctx->active++; | ||
230 | |||
231 | ctx->fpage = faultpage; | ||
232 | ctx->probe = get_kmmio_probe(addr); | ||
233 | ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); | ||
234 | ctx->addr = addr; | ||
235 | |||
236 | if (ctx->probe && ctx->probe->pre_handler) | ||
237 | ctx->probe->pre_handler(ctx->probe, regs, addr); | ||
238 | |||
239 | /* | ||
240 | * Enable single-stepping and disable interrupts for the faulting | ||
241 | * context. Local interrupts must not get enabled during stepping. | ||
242 | */ | ||
243 | regs->flags |= X86_EFLAGS_TF; | ||
244 | regs->flags &= ~X86_EFLAGS_IF; | ||
245 | |||
246 | /* Now we set present bit in PTE and single step. */ | ||
247 | disarm_kmmio_fault_page(ctx->fpage->page, NULL); | ||
248 | |||
249 | /* | ||
250 | * If another cpu accesses the same page while we are stepping, | ||
251 | * the access will not be caught. It will simply succeed and the | ||
252 | * only downside is we lose the event. If this becomes a problem, | ||
253 | * the user should drop to single cpu before tracing. | ||
254 | */ | ||
255 | |||
256 | put_cpu_var(kmmio_ctx); | ||
257 | return 1; /* fault handled */ | ||
258 | |||
259 | no_kmmio_ctx: | ||
260 | put_cpu_var(kmmio_ctx); | ||
261 | no_kmmio: | ||
262 | rcu_read_unlock(); | ||
263 | preempt_enable_no_resched(); | ||
264 | return ret; | ||
265 | } | ||
266 | |||
267 | /* | ||
268 | * Interrupts are disabled on entry as trap1 is an interrupt gate | ||
269 | * and they remain disabled throughout this function. | ||
270 | * This must always get called as the pair to kmmio_handler(). | ||
271 | */ | ||
272 | static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) | ||
273 | { | ||
274 | int ret = 0; | ||
275 | struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); | ||
276 | |||
277 | if (!ctx->active) { | ||
278 | pr_debug("kmmio: spurious debug trap on CPU %d.\n", | ||
279 | smp_processor_id()); | ||
280 | goto out; | ||
281 | } | ||
282 | |||
283 | if (ctx->probe && ctx->probe->post_handler) | ||
284 | ctx->probe->post_handler(ctx->probe, condition, regs); | ||
285 | |||
286 | arm_kmmio_fault_page(ctx->fpage->page, NULL); | ||
287 | |||
288 | regs->flags &= ~X86_EFLAGS_TF; | ||
289 | regs->flags |= ctx->saved_flags; | ||
290 | |||
291 | /* These were acquired in kmmio_handler(). */ | ||
292 | ctx->active--; | ||
293 | BUG_ON(ctx->active); | ||
294 | rcu_read_unlock(); | ||
295 | preempt_enable_no_resched(); | ||
296 | |||
297 | /* | ||
298 | * if somebody else is singlestepping across a probe point, flags | ||
299 | * will have TF set, in which case, continue the remaining processing | ||
300 | * of do_debug, as if this is not a probe hit. | ||
301 | */ | ||
302 | if (!(regs->flags & X86_EFLAGS_TF)) | ||
303 | ret = 1; | ||
304 | out: | ||
305 | put_cpu_var(kmmio_ctx); | ||
306 | return ret; | ||
307 | } | ||
308 | |||
309 | /* You must be holding kmmio_lock. */ | ||
310 | static int add_kmmio_fault_page(unsigned long page) | ||
311 | { | ||
312 | struct kmmio_fault_page *f; | ||
313 | |||
314 | page &= PAGE_MASK; | ||
315 | f = get_kmmio_fault_page(page); | ||
316 | if (f) { | ||
317 | if (!f->count) | ||
318 | arm_kmmio_fault_page(f->page, NULL); | ||
319 | f->count++; | ||
320 | return 0; | ||
321 | } | ||
322 | |||
323 | f = kmalloc(sizeof(*f), GFP_ATOMIC); | ||
324 | if (!f) | ||
325 | return -1; | ||
326 | |||
327 | f->count = 1; | ||
328 | f->page = page; | ||
329 | list_add_rcu(&f->list, kmmio_page_list(f->page)); | ||
330 | |||
331 | arm_kmmio_fault_page(f->page, NULL); | ||
332 | |||
333 | return 0; | ||
334 | } | ||
335 | |||
336 | /* You must be holding kmmio_lock. */ | ||
337 | static void release_kmmio_fault_page(unsigned long page, | ||
338 | struct kmmio_fault_page **release_list) | ||
339 | { | ||
340 | struct kmmio_fault_page *f; | ||
341 | |||
342 | page &= PAGE_MASK; | ||
343 | f = get_kmmio_fault_page(page); | ||
344 | if (!f) | ||
345 | return; | ||
346 | |||
347 | f->count--; | ||
348 | BUG_ON(f->count < 0); | ||
349 | if (!f->count) { | ||
350 | disarm_kmmio_fault_page(f->page, NULL); | ||
351 | f->release_next = *release_list; | ||
352 | *release_list = f; | ||
353 | } | ||
354 | } | ||
355 | |||
356 | /* | ||
357 | * With page-unaligned ioremaps, one or two armed pages may contain | ||
358 | * addresses from outside the intended mapping. Events for these addresses | ||
359 | * are currently silently dropped. Such events can only result from programming | ||
360 | * mistakes that access addresses before the beginning or past the end of a | ||
361 | * mapping. | ||
362 | */ | ||
363 | int register_kmmio_probe(struct kmmio_probe *p) | ||
364 | { | ||
365 | unsigned long flags; | ||
366 | int ret = 0; | ||
367 | unsigned long size = 0; | ||
368 | const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); | ||
369 | |||
370 | spin_lock_irqsave(&kmmio_lock, flags); | ||
371 | if (get_kmmio_probe(p->addr)) { | ||
372 | ret = -EEXIST; | ||
373 | goto out; | ||
374 | } | ||
375 | kmmio_count++; | ||
376 | list_add_rcu(&p->list, &kmmio_probes); | ||
377 | while (size < size_lim) { | ||
378 | if (add_kmmio_fault_page(p->addr + size)) | ||
379 | pr_err("kmmio: Unable to set page fault.\n"); | ||
380 | size += PAGE_SIZE; | ||
381 | } | ||
382 | out: | ||
383 | spin_unlock_irqrestore(&kmmio_lock, flags); | ||
384 | /* | ||
385 | * XXX: What should I do here? | ||
386 | * There used to be a call to global_flush_tlb() here, but it does not | ||
387 | * exist anymore. It seems it is not needed after all. | ||
388 | */ | ||
389 | return ret; | ||
390 | } | ||
391 | EXPORT_SYMBOL(register_kmmio_probe); | ||
392 | |||
393 | static void rcu_free_kmmio_fault_pages(struct rcu_head *head) | ||
394 | { | ||
395 | struct kmmio_delayed_release *dr = container_of( | ||
396 | head, | ||
397 | struct kmmio_delayed_release, | ||
398 | rcu); | ||
399 | struct kmmio_fault_page *p = dr->release_list; | ||
400 | while (p) { | ||
401 | struct kmmio_fault_page *next = p->release_next; | ||
402 | BUG_ON(p->count); | ||
403 | kfree(p); | ||
404 | p = next; | ||
405 | } | ||
406 | kfree(dr); | ||
407 | } | ||
408 | |||
409 | static void remove_kmmio_fault_pages(struct rcu_head *head) | ||
410 | { | ||
411 | struct kmmio_delayed_release *dr = container_of( | ||
412 | head, | ||
413 | struct kmmio_delayed_release, | ||
414 | rcu); | ||
415 | struct kmmio_fault_page *p = dr->release_list; | ||
416 | struct kmmio_fault_page **prevp = &dr->release_list; | ||
417 | unsigned long flags; | ||
418 | spin_lock_irqsave(&kmmio_lock, flags); | ||
419 | while (p) { | ||
420 | if (!p->count) | ||
421 | list_del_rcu(&p->list); | ||
422 | else | ||
423 | *prevp = p->release_next; | ||
424 | prevp = &p->release_next; | ||
425 | p = p->release_next; | ||
426 | } | ||
427 | spin_unlock_irqrestore(&kmmio_lock, flags); | ||
428 | /* This is the real RCU destroy call. */ | ||
429 | call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); | ||
430 | } | ||
431 | |||
432 | /* | ||
433 | * Remove a kmmio probe. You have to synchronize_rcu() before you can be | ||
434 | * sure that the callbacks will not be called anymore. Only after that | ||
435 | * you may actually release your struct kmmio_probe. | ||
436 | * | ||
437 | * Unregistering a kmmio fault page has three steps: | ||
438 | * 1. release_kmmio_fault_page() | ||
439 | * Disarm the page, wait a grace period to let all faults finish. | ||
440 | * 2. remove_kmmio_fault_pages() | ||
441 | * Remove the pages from kmmio_page_table. | ||
442 | * 3. rcu_free_kmmio_fault_pages() | ||
443 | * Actually free the kmmio_fault_page structs with RCU. | ||
444 | */ | ||
445 | void unregister_kmmio_probe(struct kmmio_probe *p) | ||
446 | { | ||
447 | unsigned long flags; | ||
448 | unsigned long size = 0; | ||
449 | const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); | ||
450 | struct kmmio_fault_page *release_list = NULL; | ||
451 | struct kmmio_delayed_release *drelease; | ||
452 | |||
453 | spin_lock_irqsave(&kmmio_lock, flags); | ||
454 | while (size < size_lim) { | ||
455 | release_kmmio_fault_page(p->addr + size, &release_list); | ||
456 | size += PAGE_SIZE; | ||
457 | } | ||
458 | list_del_rcu(&p->list); | ||
459 | kmmio_count--; | ||
460 | spin_unlock_irqrestore(&kmmio_lock, flags); | ||
461 | |||
462 | drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); | ||
463 | if (!drelease) { | ||
464 | pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); | ||
465 | return; | ||
466 | } | ||
467 | drelease->release_list = release_list; | ||
468 | |||
469 | /* | ||
470 | * This is not really RCU here. We have just disarmed a set of | ||
471 | * pages so that they cannot trigger page faults anymore. However, | ||
472 | * we cannot remove the pages from kmmio_page_table, | ||
473 | * because a probe hit might be in flight on another CPU. The | ||
474 | * pages are collected into a list, and they will be removed from | ||
475 | * kmmio_page_table when it is certain that no probe hit related to | ||
476 | * these pages can be in flight. RCU grace period sounds like a | ||
477 | * good choice. | ||
478 | * | ||
479 | * If we removed the pages too early, the kmmio page fault handler might | ||
480 | * not find the respective kmmio_fault_page and determine it is not | ||
481 | * a kmmio fault, when it actually is. This would lead to madness. | ||
482 | */ | ||
483 | call_rcu(&drelease->rcu, remove_kmmio_fault_pages); | ||
484 | } | ||
485 | EXPORT_SYMBOL(unregister_kmmio_probe); | ||
486 | |||
487 | static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, | ||
488 | void *args) | ||
489 | { | ||
490 | struct die_args *arg = args; | ||
491 | |||
492 | if (val == DIE_DEBUG && (arg->err & DR_STEP)) | ||
493 | if (post_kmmio_handler(arg->err, arg->regs) == 1) | ||
494 | return NOTIFY_STOP; | ||
495 | |||
496 | return NOTIFY_DONE; | ||
497 | } | ||
498 | |||
499 | static struct notifier_block nb_die = { | ||
500 | .notifier_call = kmmio_die_notifier | ||
501 | }; | ||
502 | |||
503 | static int __init init_kmmio(void) | ||
504 | { | ||
505 | int i; | ||
506 | for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) | ||
507 | INIT_LIST_HEAD(&kmmio_page_table[i]); | ||
508 | return register_die_notifier(&nb_die); | ||
509 | } | ||
510 | fs_initcall(init_kmmio); /* should be before device_initcall() */ | ||
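
A minimal usage sketch of the kmmio interface above; mmio-mod.c below is the real
in-tree user. The handler names, the bar_phys placeholder and the single-page length
are illustrative assumptions, not part of this patch:

	#include <linux/mmiotrace.h>	/* struct kmmio_probe, register_kmmio_probe() */
	#include <linux/rcupdate.h>
	#include <linux/io.h>

	static void my_pre(struct kmmio_probe *p, struct pt_regs *regs,
						unsigned long addr)
	{
		/* called from the page fault, before the access is single-stepped */
	}

	static void my_post(struct kmmio_probe *p, unsigned long condition,
						struct pt_regs *regs)
	{
		/* called from the debug trap, after the access has completed */
	}

	static struct kmmio_probe my_probe = {
		.len = PAGE_SIZE,
		.pre_handler = my_pre,
		.post_handler = my_post,
	};

	/* arm: cover one ioremapped page */
	my_probe.addr = (unsigned long)ioremap_nocache(bar_phys, PAGE_SIZE);
	register_kmmio_probe(&my_probe);

	/* disarm: wait a grace period before the probe struct may be freed */
	unregister_kmmio_probe(&my_probe);
	synchronize_rcu();
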
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c new file mode 100644 index 000000000000..672e17f8262a --- /dev/null +++ b/arch/x86/mm/memtest.c | |||
@@ -0,0 +1,123 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/errno.h> | ||
3 | #include <linux/string.h> | ||
4 | #include <linux/types.h> | ||
5 | #include <linux/mm.h> | ||
6 | #include <linux/smp.h> | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/pfn.h> | ||
9 | |||
10 | #include <asm/e820.h> | ||
11 | |||
12 | static void __init memtest(unsigned long start_phys, unsigned long size, | ||
13 | unsigned pattern) | ||
14 | { | ||
15 | unsigned long i; | ||
16 | unsigned long *start; | ||
17 | unsigned long start_bad; | ||
18 | unsigned long last_bad; | ||
19 | unsigned long val; | ||
20 | unsigned long start_phys_aligned; | ||
21 | unsigned long count; | ||
22 | unsigned long incr; | ||
23 | |||
24 | switch (pattern) { | ||
25 | case 0: | ||
26 | val = 0UL; | ||
27 | break; | ||
28 | case 1: | ||
29 | val = -1UL; | ||
30 | break; | ||
31 | case 2: | ||
32 | #ifdef CONFIG_X86_64 | ||
33 | val = 0x5555555555555555UL; | ||
34 | #else | ||
35 | val = 0x55555555UL; | ||
36 | #endif | ||
37 | break; | ||
38 | case 3: | ||
39 | #ifdef CONFIG_X86_64 | ||
40 | val = 0xaaaaaaaaaaaaaaaaUL; | ||
41 | #else | ||
42 | val = 0xaaaaaaaaUL; | ||
43 | #endif | ||
44 | break; | ||
45 | default: | ||
46 | return; | ||
47 | } | ||
48 | |||
49 | incr = sizeof(unsigned long); | ||
50 | start_phys_aligned = ALIGN(start_phys, incr); | ||
51 | count = (size - (start_phys_aligned - start_phys))/incr; | ||
52 | start = __va(start_phys_aligned); | ||
53 | start_bad = 0; | ||
54 | last_bad = 0; | ||
55 | |||
56 | for (i = 0; i < count; i++) | ||
57 | start[i] = val; | ||
58 | for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { | ||
59 | if (*start != val) { | ||
60 | if (start_phys_aligned == last_bad + incr) { | ||
61 | last_bad += incr; | ||
62 | } else { | ||
63 | if (start_bad) { | ||
64 | printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved", | ||
65 | val, start_bad, last_bad + incr); | ||
66 | reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); | ||
67 | } | ||
68 | start_bad = last_bad = start_phys_aligned; | ||
69 | } | ||
70 | } | ||
71 | } | ||
72 | if (start_bad) { | ||
73 | printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", | ||
74 | val, start_bad, last_bad + incr); | ||
75 | reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); | ||
76 | } | ||
77 | |||
78 | } | ||
79 | |||
80 | /* default is disabled */ | ||
81 | static int memtest_pattern __initdata; | ||
82 | |||
83 | static int __init parse_memtest(char *arg) | ||
84 | { | ||
85 | if (arg) | ||
86 | memtest_pattern = simple_strtoul(arg, NULL, 0); | ||
87 | return 0; | ||
88 | } | ||
89 | |||
90 | early_param("memtest", parse_memtest); | ||
91 | |||
92 | void __init early_memtest(unsigned long start, unsigned long end) | ||
93 | { | ||
94 | u64 t_start, t_size; | ||
95 | unsigned pattern; | ||
96 | |||
97 | if (!memtest_pattern) | ||
98 | return; | ||
99 | |||
100 | printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); | ||
101 | for (pattern = 0; pattern < memtest_pattern; pattern++) { | ||
102 | t_start = start; | ||
103 | t_size = 0; | ||
104 | while (t_start < end) { | ||
105 | t_start = find_e820_area_size(t_start, &t_size, 1); | ||
106 | |||
107 | /* done ? */ | ||
108 | if (t_start >= end) | ||
109 | break; | ||
110 | if (t_start + t_size > end) | ||
111 | t_size = end - t_start; | ||
112 | |||
113 | printk(KERN_CONT "\n %010llx - %010llx pattern %d", | ||
114 | (unsigned long long)t_start, | ||
115 | (unsigned long long)t_start + t_size, pattern); | ||
116 | |||
117 | memtest(t_start, t_size, pattern); | ||
118 | |||
119 | t_start += t_size; | ||
120 | } | ||
121 | } | ||
122 | printk(KERN_CONT "\n"); | ||
123 | } | ||
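
A usage note derived from the switch in memtest() above: booting with "memtest=N"
repeats the scan N times over all free e820 ranges, with iteration k writing and
verifying pattern k. Only k = 0..3 select real patterns (all zeros, all ones,
0x55.. and 0xaa..); higher iterations hit the default case and test nothing, so

	memtest=4

on the kernel command line is enough to exercise every pattern. Ranges that fail
verification are handed to reserve_early(..., "BAD RAM") and excluded from use.
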
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c new file mode 100644 index 000000000000..635b50e85581 --- /dev/null +++ b/arch/x86/mm/mmio-mod.c | |||
@@ -0,0 +1,517 @@ | |||
1 | /* | ||
2 | * This program is free software; you can redistribute it and/or modify | ||
3 | * it under the terms of the GNU General Public License as published by | ||
4 | * the Free Software Foundation; either version 2 of the License, or | ||
5 | * (at your option) any later version. | ||
6 | * | ||
7 | * This program is distributed in the hope that it will be useful, | ||
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
10 | * GNU General Public License for more details. | ||
11 | * | ||
12 | * You should have received a copy of the GNU General Public License | ||
13 | * along with this program; if not, write to the Free Software | ||
14 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
15 | * | ||
16 | * Copyright (C) IBM Corporation, 2005 | ||
17 | * Jeff Muizelaar, 2006, 2007 | ||
18 | * Pekka Paalanen, 2008 <pq@iki.fi> | ||
19 | * | ||
20 | * Derived from the read-mod example from relay-examples by Tom Zanussi. | ||
21 | */ | ||
22 | #define DEBUG 1 | ||
23 | |||
24 | #include <linux/module.h> | ||
25 | #include <linux/debugfs.h> | ||
26 | #include <linux/uaccess.h> | ||
27 | #include <linux/io.h> | ||
28 | #include <linux/version.h> | ||
29 | #include <linux/kallsyms.h> | ||
30 | #include <asm/pgtable.h> | ||
31 | #include <linux/mmiotrace.h> | ||
32 | #include <asm/e820.h> /* for ISA_START_ADDRESS */ | ||
33 | #include <asm/atomic.h> | ||
34 | #include <linux/percpu.h> | ||
35 | #include <linux/cpu.h> | ||
36 | |||
37 | #include "pf_in.h" | ||
38 | |||
39 | #define NAME "mmiotrace: " | ||
40 | |||
41 | struct trap_reason { | ||
42 | unsigned long addr; | ||
43 | unsigned long ip; | ||
44 | enum reason_type type; | ||
45 | int active_traces; | ||
46 | }; | ||
47 | |||
48 | struct remap_trace { | ||
49 | struct list_head list; | ||
50 | struct kmmio_probe probe; | ||
51 | resource_size_t phys; | ||
52 | unsigned long id; | ||
53 | }; | ||
54 | |||
55 | /* Accessed per-cpu. */ | ||
56 | static DEFINE_PER_CPU(struct trap_reason, pf_reason); | ||
57 | static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); | ||
58 | |||
59 | #if 0 /* XXX: no way to gather this info anymore */ | ||
60 | /* Access to this is not per-cpu. */ | ||
61 | static DEFINE_PER_CPU(atomic_t, dropped); | ||
62 | #endif | ||
63 | |||
64 | static struct dentry *marker_file; | ||
65 | |||
66 | static DEFINE_MUTEX(mmiotrace_mutex); | ||
67 | static DEFINE_SPINLOCK(trace_lock); | ||
68 | static atomic_t mmiotrace_enabled; | ||
69 | static LIST_HEAD(trace_list); /* struct remap_trace */ | ||
70 | |||
71 | /* | ||
72 | * Locking in this file: | ||
73 | * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections. | ||
74 | * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex | ||
75 | * and trace_lock. | ||
76 | * - Routines depending on is_enabled() must take trace_lock. | ||
77 | * - trace_list users must hold trace_lock. | ||
78 | * - is_enabled() guarantees that mmio_trace_record is allowed. | ||
79 | * - pre/post callbacks assume the effect of is_enabled() being true. | ||
80 | */ | ||
81 | |||
82 | /* module parameters */ | ||
83 | static unsigned long filter_offset; | ||
84 | static int nommiotrace; | ||
85 | static int trace_pc; | ||
86 | |||
87 | module_param(filter_offset, ulong, 0); | ||
88 | module_param(nommiotrace, bool, 0); | ||
89 | module_param(trace_pc, bool, 0); | ||
90 | |||
91 | MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); | ||
92 | MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); | ||
93 | MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); | ||
94 | |||
95 | static bool is_enabled(void) | ||
96 | { | ||
97 | return atomic_read(&mmiotrace_enabled); | ||
98 | } | ||
99 | |||
100 | #if 0 /* XXX: needs rewrite */ | ||
101 | /* | ||
102 | * Write callback for the debugfs entry: | ||
103 | * Read a marker and write it to the mmio trace log | ||
104 | */ | ||
105 | static ssize_t write_marker(struct file *file, const char __user *buffer, | ||
106 | size_t count, loff_t *ppos) | ||
107 | { | ||
108 | char *event = NULL; | ||
109 | struct mm_io_header *headp; | ||
110 | ssize_t len = (count > 65535) ? 65535 : count; | ||
111 | |||
112 | event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); | ||
113 | if (!event) | ||
114 | return -ENOMEM; | ||
115 | |||
116 | headp = (struct mm_io_header *)event; | ||
117 | headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); | ||
118 | headp->data_len = len; | ||
119 | |||
120 | if (copy_from_user(event + sizeof(*headp), buffer, len)) { | ||
121 | kfree(event); | ||
122 | return -EFAULT; | ||
123 | } | ||
124 | |||
125 | spin_lock_irq(&trace_lock); | ||
126 | #if 0 /* XXX: convert this to use tracing */ | ||
127 | if (is_enabled()) | ||
128 | relay_write(chan, event, sizeof(*headp) + len); | ||
129 | else | ||
130 | #endif | ||
131 | len = -EINVAL; | ||
132 | spin_unlock_irq(&trace_lock); | ||
133 | kfree(event); | ||
134 | return len; | ||
135 | } | ||
136 | #endif | ||
137 | |||
138 | static void print_pte(unsigned long address) | ||
139 | { | ||
140 | unsigned int level; | ||
141 | pte_t *pte = lookup_address(address, &level); | ||
142 | |||
143 | if (!pte) { | ||
144 | pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", | ||
145 | __func__, address); | ||
146 | return; | ||
147 | } | ||
148 | |||
149 | if (level == PG_LEVEL_2M) { | ||
150 | pr_emerg(NAME "4MB pages are not currently supported: " | ||
151 | "0x%08lx\n", address); | ||
152 | BUG(); | ||
153 | } | ||
154 | pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address, | ||
155 | (unsigned long long)pte_val(*pte), | ||
156 | (unsigned long long)pte_val(*pte) & _PAGE_PRESENT); | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * For some reason the pre/post pairs have been called in an | ||
161 | * unmatched order. Report and die. | ||
162 | */ | ||
163 | static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) | ||
164 | { | ||
165 | const struct trap_reason *my_reason = &get_cpu_var(pf_reason); | ||
166 | pr_emerg(NAME "unexpected fault for address: 0x%08lx, " | ||
167 | "last fault for address: 0x%08lx\n", | ||
168 | addr, my_reason->addr); | ||
169 | print_pte(addr); | ||
170 | print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); | ||
171 | print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); | ||
172 | #ifdef __i386__ | ||
173 | pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", | ||
174 | regs->ax, regs->bx, regs->cx, regs->dx); | ||
175 | pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", | ||
176 | regs->si, regs->di, regs->bp, regs->sp); | ||
177 | #else | ||
178 | pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", | ||
179 | regs->ax, regs->cx, regs->dx); | ||
180 | pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", | ||
181 | regs->si, regs->di, regs->bp, regs->sp); | ||
182 | #endif | ||
183 | put_cpu_var(pf_reason); | ||
184 | BUG(); | ||
185 | } | ||
186 | |||
187 | static void pre(struct kmmio_probe *p, struct pt_regs *regs, | ||
188 | unsigned long addr) | ||
189 | { | ||
190 | struct trap_reason *my_reason = &get_cpu_var(pf_reason); | ||
191 | struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); | ||
192 | const unsigned long instptr = instruction_pointer(regs); | ||
193 | const enum reason_type type = get_ins_type(instptr); | ||
194 | struct remap_trace *trace = p->private; | ||
195 | |||
196 | /* it doesn't make sense to have more than one active trace per cpu */ | ||
197 | if (my_reason->active_traces) | ||
198 | die_kmmio_nesting_error(regs, addr); | ||
199 | else | ||
200 | my_reason->active_traces++; | ||
201 | |||
202 | my_reason->type = type; | ||
203 | my_reason->addr = addr; | ||
204 | my_reason->ip = instptr; | ||
205 | |||
206 | my_trace->phys = addr - trace->probe.addr + trace->phys; | ||
207 | my_trace->map_id = trace->id; | ||
208 | |||
209 | /* | ||
210 | * Only record the program counter when requested. | ||
211 | * It may taint clean-room reverse engineering. | ||
212 | */ | ||
213 | if (trace_pc) | ||
214 | my_trace->pc = instptr; | ||
215 | else | ||
216 | my_trace->pc = 0; | ||
217 | |||
218 | /* | ||
219 | * XXX: the timestamp recorded will be *after* the tracing has been | ||
220 | * done, not at the time we hit the instruction. SMP implications | ||
221 | * on event ordering? | ||
222 | */ | ||
223 | |||
224 | switch (type) { | ||
225 | case REG_READ: | ||
226 | my_trace->opcode = MMIO_READ; | ||
227 | my_trace->width = get_ins_mem_width(instptr); | ||
228 | break; | ||
229 | case REG_WRITE: | ||
230 | my_trace->opcode = MMIO_WRITE; | ||
231 | my_trace->width = get_ins_mem_width(instptr); | ||
232 | my_trace->value = get_ins_reg_val(instptr, regs); | ||
233 | break; | ||
234 | case IMM_WRITE: | ||
235 | my_trace->opcode = MMIO_WRITE; | ||
236 | my_trace->width = get_ins_mem_width(instptr); | ||
237 | my_trace->value = get_ins_imm_val(instptr); | ||
238 | break; | ||
239 | default: | ||
240 | { | ||
241 | unsigned char *ip = (unsigned char *)instptr; | ||
242 | my_trace->opcode = MMIO_UNKNOWN_OP; | ||
243 | my_trace->width = 0; | ||
244 | my_trace->value = (*ip) << 16 | *(ip + 1) << 8 | | ||
245 | *(ip + 2); | ||
246 | } | ||
247 | } | ||
248 | put_cpu_var(cpu_trace); | ||
249 | put_cpu_var(pf_reason); | ||
250 | } | ||
251 | |||
252 | static void post(struct kmmio_probe *p, unsigned long condition, | ||
253 | struct pt_regs *regs) | ||
254 | { | ||
255 | struct trap_reason *my_reason = &get_cpu_var(pf_reason); | ||
256 | struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); | ||
257 | |||
258 | /* this should always bring the active_traces count back to 0 */ | ||
259 | my_reason->active_traces--; | ||
260 | if (my_reason->active_traces) { | ||
261 | pr_emerg(NAME "unexpected post handler"); | ||
262 | BUG(); | ||
263 | } | ||
264 | |||
265 | switch (my_reason->type) { | ||
266 | case REG_READ: | ||
267 | my_trace->value = get_ins_reg_val(my_reason->ip, regs); | ||
268 | break; | ||
269 | default: | ||
270 | break; | ||
271 | } | ||
272 | |||
273 | mmio_trace_rw(my_trace); | ||
274 | put_cpu_var(cpu_trace); | ||
275 | put_cpu_var(pf_reason); | ||
276 | } | ||
277 | |||
278 | static void ioremap_trace_core(resource_size_t offset, unsigned long size, | ||
279 | void __iomem *addr) | ||
280 | { | ||
281 | static atomic_t next_id; | ||
282 | struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); | ||
283 | /* These are page-unaligned. */ | ||
284 | struct mmiotrace_map map = { | ||
285 | .phys = offset, | ||
286 | .virt = (unsigned long)addr, | ||
287 | .len = size, | ||
288 | .opcode = MMIO_PROBE | ||
289 | }; | ||
290 | |||
291 | if (!trace) { | ||
292 | pr_err(NAME "kmalloc failed in ioremap\n"); | ||
293 | return; | ||
294 | } | ||
295 | |||
296 | *trace = (struct remap_trace) { | ||
297 | .probe = { | ||
298 | .addr = (unsigned long)addr, | ||
299 | .len = size, | ||
300 | .pre_handler = pre, | ||
301 | .post_handler = post, | ||
302 | .private = trace | ||
303 | }, | ||
304 | .phys = offset, | ||
305 | .id = atomic_inc_return(&next_id) | ||
306 | }; | ||
307 | map.map_id = trace->id; | ||
308 | |||
309 | spin_lock_irq(&trace_lock); | ||
310 | if (!is_enabled()) | ||
311 | goto not_enabled; | ||
312 | |||
313 | mmio_trace_mapping(&map); | ||
314 | list_add_tail(&trace->list, &trace_list); | ||
315 | if (!nommiotrace) | ||
316 | register_kmmio_probe(&trace->probe); | ||
317 | |||
318 | not_enabled: | ||
319 | spin_unlock_irq(&trace_lock); | ||
320 | } | ||
321 | |||
322 | void mmiotrace_ioremap(resource_size_t offset, unsigned long size, | ||
323 | void __iomem *addr) | ||
324 | { | ||
325 | if (!is_enabled()) /* recheck and proper locking in *_core() */ | ||
326 | return; | ||
327 | |||
328 | pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n", | ||
329 | (unsigned long long)offset, size, addr); | ||
330 | if ((filter_offset) && (offset != filter_offset)) | ||
331 | return; | ||
332 | ioremap_trace_core(offset, size, addr); | ||
333 | } | ||
334 | |||
335 | static void iounmap_trace_core(volatile void __iomem *addr) | ||
336 | { | ||
337 | struct mmiotrace_map map = { | ||
338 | .phys = 0, | ||
339 | .virt = (unsigned long)addr, | ||
340 | .len = 0, | ||
341 | .opcode = MMIO_UNPROBE | ||
342 | }; | ||
343 | struct remap_trace *trace; | ||
344 | struct remap_trace *tmp; | ||
345 | struct remap_trace *found_trace = NULL; | ||
346 | |||
347 | pr_debug(NAME "Unmapping %p.\n", addr); | ||
348 | |||
349 | spin_lock_irq(&trace_lock); | ||
350 | if (!is_enabled()) | ||
351 | goto not_enabled; | ||
352 | |||
353 | list_for_each_entry_safe(trace, tmp, &trace_list, list) { | ||
354 | if ((unsigned long)addr == trace->probe.addr) { | ||
355 | if (!nommiotrace) | ||
356 | unregister_kmmio_probe(&trace->probe); | ||
357 | list_del(&trace->list); | ||
358 | found_trace = trace; | ||
359 | break; | ||
360 | } | ||
361 | } | ||
362 | map.map_id = (found_trace) ? found_trace->id : -1; | ||
363 | mmio_trace_mapping(&map); | ||
364 | |||
365 | not_enabled: | ||
366 | spin_unlock_irq(&trace_lock); | ||
367 | if (found_trace) { | ||
368 | synchronize_rcu(); /* unregister_kmmio_probe() requirement */ | ||
369 | kfree(found_trace); | ||
370 | } | ||
371 | } | ||
372 | |||
373 | void mmiotrace_iounmap(volatile void __iomem *addr) | ||
374 | { | ||
375 | might_sleep(); | ||
376 | if (is_enabled()) /* recheck and proper locking in *_core() */ | ||
377 | iounmap_trace_core(addr); | ||
378 | } | ||
379 | |||
380 | static void clear_trace_list(void) | ||
381 | { | ||
382 | struct remap_trace *trace; | ||
383 | struct remap_trace *tmp; | ||
384 | |||
385 | /* | ||
386 | * No locking required, because the caller ensures we are in a | ||
387 | * critical section via mutex, and is_enabled() is false, | ||
388 | * i.e. nothing can traverse or modify this list. | ||
389 | * Caller also ensures is_enabled() cannot change. | ||
390 | */ | ||
391 | list_for_each_entry(trace, &trace_list, list) { | ||
392 | pr_notice(NAME "purging non-iounmapped " | ||
393 | "trace @0x%08lx, size 0x%lx.\n", | ||
394 | trace->probe.addr, trace->probe.len); | ||
395 | if (!nommiotrace) | ||
396 | unregister_kmmio_probe(&trace->probe); | ||
397 | } | ||
398 | synchronize_rcu(); /* unregister_kmmio_probe() requirement */ | ||
399 | |||
400 | list_for_each_entry_safe(trace, tmp, &trace_list, list) { | ||
401 | list_del(&trace->list); | ||
402 | kfree(trace); | ||
403 | } | ||
404 | } | ||
405 | |||
406 | #ifdef CONFIG_HOTPLUG_CPU | ||
407 | static cpumask_t downed_cpus; | ||
408 | |||
409 | static void enter_uniprocessor(void) | ||
410 | { | ||
411 | int cpu; | ||
412 | int err; | ||
413 | |||
414 | get_online_cpus(); | ||
415 | downed_cpus = cpu_online_map; | ||
416 | cpu_clear(first_cpu(cpu_online_map), downed_cpus); | ||
417 | if (num_online_cpus() > 1) | ||
418 | pr_notice(NAME "Disabling non-boot CPUs...\n"); | ||
419 | put_online_cpus(); | ||
420 | |||
421 | for_each_cpu_mask(cpu, downed_cpus) { | ||
422 | err = cpu_down(cpu); | ||
423 | if (!err) | ||
424 | pr_info(NAME "CPU%d is down.\n", cpu); | ||
425 | else | ||
426 | pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); | ||
427 | } | ||
428 | if (num_online_cpus() > 1) | ||
429 | pr_warning(NAME "multiple CPUs still online, " | ||
430 | "may miss events.\n"); | ||
431 | } | ||
432 | |||
433 | /* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, | ||
434 | but this whole function is ifdef'd under CONFIG_HOTPLUG_CPU */ | ||
435 | static void __ref leave_uniprocessor(void) | ||
436 | { | ||
437 | int cpu; | ||
438 | int err; | ||
439 | |||
440 | if (cpus_weight(downed_cpus) == 0) | ||
441 | return; | ||
442 | pr_notice(NAME "Re-enabling CPUs...\n"); | ||
443 | for_each_cpu_mask(cpu, downed_cpus) { | ||
444 | err = cpu_up(cpu); | ||
445 | if (!err) | ||
446 | pr_info(NAME "enabled CPU%d.\n", cpu); | ||
447 | else | ||
448 | pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err); | ||
449 | } | ||
450 | } | ||
451 | |||
452 | #else /* !CONFIG_HOTPLUG_CPU */ | ||
453 | static void enter_uniprocessor(void) | ||
454 | { | ||
455 | if (num_online_cpus() > 1) | ||
456 | pr_warning(NAME "multiple CPUs are online, may miss events. " | ||
457 | "Suggest booting with maxcpus=1 kernel argument.\n"); | ||
458 | } | ||
459 | |||
460 | static void leave_uniprocessor(void) | ||
461 | { | ||
462 | } | ||
463 | #endif | ||
464 | |||
465 | #if 0 /* XXX: out of order */ | ||
466 | static struct file_operations fops_marker = { | ||
467 | .owner = THIS_MODULE, | ||
468 | .write = write_marker | ||
469 | }; | ||
470 | #endif | ||
471 | |||
472 | void enable_mmiotrace(void) | ||
473 | { | ||
474 | mutex_lock(&mmiotrace_mutex); | ||
475 | if (is_enabled()) | ||
476 | goto out; | ||
477 | |||
478 | #if 0 /* XXX: tracing does not support text entries */ | ||
479 | marker_file = debugfs_create_file("marker", 0660, dir, NULL, | ||
480 | &fops_marker); | ||
481 | if (!marker_file) | ||
482 | pr_err(NAME "marker file creation failed.\n"); | ||
483 | #endif | ||
484 | |||
485 | if (nommiotrace) | ||
486 | pr_info(NAME "MMIO tracing disabled.\n"); | ||
487 | enter_uniprocessor(); | ||
488 | spin_lock_irq(&trace_lock); | ||
489 | atomic_inc(&mmiotrace_enabled); | ||
490 | spin_unlock_irq(&trace_lock); | ||
491 | pr_info(NAME "enabled.\n"); | ||
492 | out: | ||
493 | mutex_unlock(&mmiotrace_mutex); | ||
494 | } | ||
495 | |||
496 | void disable_mmiotrace(void) | ||
497 | { | ||
498 | mutex_lock(&mmiotrace_mutex); | ||
499 | if (!is_enabled()) | ||
500 | goto out; | ||
501 | |||
502 | spin_lock_irq(&trace_lock); | ||
503 | atomic_dec(&mmiotrace_enabled); | ||
504 | BUG_ON(is_enabled()); | ||
505 | spin_unlock_irq(&trace_lock); | ||
506 | |||
507 | clear_trace_list(); /* guarantees: no more kmmio callbacks */ | ||
508 | leave_uniprocessor(); | ||
509 | if (marker_file) { | ||
510 | debugfs_remove(marker_file); | ||
511 | marker_file = NULL; | ||
512 | } | ||
513 | |||
514 | pr_info(NAME "disabled.\n"); | ||
515 | out: | ||
516 | mutex_unlock(&mmiotrace_mutex); | ||
517 | } | ||
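
A rough sketch of the intended flow, assuming the mmiotrace_ioremap()/mmiotrace_iounmap()
hooks are wired into the ioremap path (the ioremap.c changes in this series) and that
enable_mmiotrace()/disable_mmiotrace() are driven by the tracer control code; bar_phys,
bar_len and the register offsets are placeholders:

	void __iomem *regs;

	enable_mmiotrace();		/* drop to one CPU, start accepting events */

	regs = ioremap_nocache(bar_phys, bar_len);
					/* -> mmiotrace_ioremap() registers a kmmio probe */
	readl(regs + 0x10);		/* faults; pre()/post() above log an MMIO_READ */
	writel(0x1, regs + 0x14);	/* logged as an MMIO_WRITE with the written value */
	iounmap(regs);			/* -> mmiotrace_iounmap() unregisters the probe */

	disable_mmiotrace();		/* purge leftover probes, re-enable CPUs */
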
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index c5066d519e5d..cebcbf152d46 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c | |||
@@ -20,37 +20,18 @@ | |||
20 | #include <asm/acpi.h> | 20 | #include <asm/acpi.h> |
21 | #include <asm/k8.h> | 21 | #include <asm/k8.h> |
22 | 22 | ||
23 | #ifndef Dprintk | ||
24 | #define Dprintk(x...) | ||
25 | #endif | ||
26 | |||
27 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; | 23 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; |
28 | EXPORT_SYMBOL(node_data); | 24 | EXPORT_SYMBOL(node_data); |
29 | 25 | ||
30 | bootmem_data_t plat_node_bdata[MAX_NUMNODES]; | ||
31 | |||
32 | struct memnode memnode; | 26 | struct memnode memnode; |
33 | 27 | ||
34 | #ifdef CONFIG_SMP | ||
35 | int x86_cpu_to_node_map_init[NR_CPUS] = { | ||
36 | [0 ... NR_CPUS-1] = NUMA_NO_NODE | ||
37 | }; | ||
38 | void *x86_cpu_to_node_map_early_ptr; | ||
39 | EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); | ||
40 | #endif | ||
41 | DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; | ||
42 | EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map); | ||
43 | |||
44 | s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | 28 | s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { |
45 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | 29 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE |
46 | }; | 30 | }; |
47 | 31 | ||
48 | cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly; | ||
49 | EXPORT_SYMBOL(node_to_cpumask_map); | ||
50 | |||
51 | int numa_off __initdata; | 32 | int numa_off __initdata; |
52 | unsigned long __initdata nodemap_addr; | 33 | static unsigned long __initdata nodemap_addr; |
53 | unsigned long __initdata nodemap_size; | 34 | static unsigned long __initdata nodemap_size; |
54 | 35 | ||
55 | /* | 36 | /* |
56 | * Given a shift value, try to populate memnodemap[] | 37 | * Given a shift value, try to populate memnodemap[] |
@@ -98,8 +79,8 @@ static int __init allocate_cachealigned_memnodemap(void) | |||
98 | return 0; | 79 | return 0; |
99 | 80 | ||
100 | addr = 0x8000; | 81 | addr = 0x8000; |
101 | nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); | 82 | nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); |
102 | nodemap_addr = find_e820_area(addr, end_pfn<<PAGE_SHIFT, | 83 | nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT, |
103 | nodemap_size, L1_CACHE_BYTES); | 84 | nodemap_size, L1_CACHE_BYTES); |
104 | if (nodemap_addr == -1UL) { | 85 | if (nodemap_addr == -1UL) { |
105 | printk(KERN_ERR | 86 | printk(KERN_ERR |
@@ -192,19 +173,19 @@ static void * __init early_node_mem(int nodeid, unsigned long start, | |||
192 | void __init setup_node_bootmem(int nodeid, unsigned long start, | 173 | void __init setup_node_bootmem(int nodeid, unsigned long start, |
193 | unsigned long end) | 174 | unsigned long end) |
194 | { | 175 | { |
195 | unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; | 176 | unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; |
196 | unsigned long bootmap_start, nodedata_phys; | 177 | unsigned long bootmap_start, nodedata_phys; |
197 | void *bootmap; | 178 | void *bootmap; |
198 | const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); | 179 | const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); |
199 | int nid; | 180 | int nid; |
200 | 181 | ||
201 | start = round_up(start, ZONE_ALIGN); | 182 | start = roundup(start, ZONE_ALIGN); |
202 | 183 | ||
203 | printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, | 184 | printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, |
204 | start, end); | 185 | start, end); |
205 | 186 | ||
206 | start_pfn = start >> PAGE_SHIFT; | 187 | start_pfn = start >> PAGE_SHIFT; |
207 | end_pfn = end >> PAGE_SHIFT; | 188 | last_pfn = end >> PAGE_SHIFT; |
208 | 189 | ||
209 | node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, | 190 | node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, |
210 | SMP_CACHE_BYTES); | 191 | SMP_CACHE_BYTES); |
@@ -215,9 +196,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, | |||
215 | nodedata_phys + pgdat_size - 1); | 196 | nodedata_phys + pgdat_size - 1); |
216 | 197 | ||
217 | memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); | 198 | memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); |
218 | NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; | 199 | NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; |
219 | NODE_DATA(nodeid)->node_start_pfn = start_pfn; | 200 | NODE_DATA(nodeid)->node_start_pfn = start_pfn; |
220 | NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; | 201 | NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; |
221 | 202 | ||
222 | /* | 203 | /* |
223 | * Find a place for the bootmem map | 204 | * Find a place for the bootmem map |
@@ -226,14 +207,14 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, | |||
226 | * early_node_mem will get that with find_e820_area instead | 207 | * early_node_mem will get that with find_e820_area instead |
227 | * of alloc_bootmem, that could clash with reserved range | 208 | * of alloc_bootmem, that could clash with reserved range |
228 | */ | 209 | */ |
229 | bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); | 210 | bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); |
230 | nid = phys_to_nid(nodedata_phys); | 211 | nid = phys_to_nid(nodedata_phys); |
231 | if (nid == nodeid) | 212 | if (nid == nodeid) |
232 | bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); | 213 | bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); |
233 | else | 214 | else |
234 | bootmap_start = round_up(start, PAGE_SIZE); | 215 | bootmap_start = roundup(start, PAGE_SIZE); |
235 | /* | 216 | /* |
236 | * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like | 217 | * SMP_CACHE_BYTES could be enough, but init_bootmem_node like |
237 | * to use that to align to PAGE_SIZE | 218 | * to use that to align to PAGE_SIZE |
238 | */ | 219 | */ |
239 | bootmap = early_node_mem(nodeid, bootmap_start, end, | 220 | bootmap = early_node_mem(nodeid, bootmap_start, end, |
@@ -248,7 +229,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, | |||
248 | 229 | ||
249 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), | 230 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), |
250 | bootmap_start >> PAGE_SHIFT, | 231 | bootmap_start >> PAGE_SHIFT, |
251 | start_pfn, end_pfn); | 232 | start_pfn, last_pfn); |
252 | 233 | ||
253 | printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", | 234 | printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", |
254 | bootmap_start, bootmap_start + bootmap_size - 1, | 235 | bootmap_start, bootmap_start + bootmap_size - 1, |
@@ -309,7 +290,7 @@ void __init numa_init_array(void) | |||
309 | 290 | ||
310 | #ifdef CONFIG_NUMA_EMU | 291 | #ifdef CONFIG_NUMA_EMU |
311 | /* Numa emulation */ | 292 | /* Numa emulation */ |
312 | char *cmdline __initdata; | 293 | static char *cmdline __initdata; |
313 | 294 | ||
314 | /* | 295 | /* |
315 | * Sets up nid to range from addr to addr + size. If the end | 296 |
@@ -413,15 +394,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, | |||
413 | } | 394 | } |
414 | 395 | ||
415 | /* | 396 | /* |
416 | * Sets up the system RAM area from start_pfn to end_pfn according to the | 397 | * Sets up the system RAM area from start_pfn to last_pfn according to the |
417 | * numa=fake command-line option. | 398 | * numa=fake command-line option. |
418 | */ | 399 | */ |
419 | static struct bootnode nodes[MAX_NUMNODES] __initdata; | 400 | static struct bootnode nodes[MAX_NUMNODES] __initdata; |
420 | 401 | ||
421 | static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) | 402 | static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn) |
422 | { | 403 | { |
423 | u64 size, addr = start_pfn << PAGE_SHIFT; | 404 | u64 size, addr = start_pfn << PAGE_SHIFT; |
424 | u64 max_addr = end_pfn << PAGE_SHIFT; | 405 | u64 max_addr = last_pfn << PAGE_SHIFT; |
425 | int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; | 406 | int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; |
426 | 407 | ||
427 | memset(&nodes, 0, sizeof(nodes)); | 408 | memset(&nodes, 0, sizeof(nodes)); |
@@ -527,7 +508,7 @@ out: | |||
527 | } | 508 | } |
528 | #endif /* CONFIG_NUMA_EMU */ | 509 | #endif /* CONFIG_NUMA_EMU */ |
529 | 510 | ||
530 | void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | 511 | void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) |
531 | { | 512 | { |
532 | int i; | 513 | int i; |
533 | 514 | ||
@@ -535,7 +516,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | |||
535 | nodes_clear(node_online_map); | 516 | nodes_clear(node_online_map); |
536 | 517 | ||
537 | #ifdef CONFIG_NUMA_EMU | 518 | #ifdef CONFIG_NUMA_EMU |
538 | if (cmdline && !numa_emulation(start_pfn, end_pfn)) | 519 | if (cmdline && !numa_emulation(start_pfn, last_pfn)) |
539 | return; | 520 | return; |
540 | nodes_clear(node_possible_map); | 521 | nodes_clear(node_possible_map); |
541 | nodes_clear(node_online_map); | 522 | nodes_clear(node_online_map); |
@@ -543,7 +524,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | |||
543 | 524 | ||
544 | #ifdef CONFIG_ACPI_NUMA | 525 | #ifdef CONFIG_ACPI_NUMA |
545 | if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, | 526 | if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, |
546 | end_pfn << PAGE_SHIFT)) | 527 | last_pfn << PAGE_SHIFT)) |
547 | return; | 528 | return; |
548 | nodes_clear(node_possible_map); | 529 | nodes_clear(node_possible_map); |
549 | nodes_clear(node_online_map); | 530 | nodes_clear(node_online_map); |
@@ -551,7 +532,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | |||
551 | 532 | ||
552 | #ifdef CONFIG_K8_NUMA | 533 | #ifdef CONFIG_K8_NUMA |
553 | if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, | 534 | if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, |
554 | end_pfn<<PAGE_SHIFT)) | 535 | last_pfn<<PAGE_SHIFT)) |
555 | return; | 536 | return; |
556 | nodes_clear(node_possible_map); | 537 | nodes_clear(node_possible_map); |
557 | nodes_clear(node_online_map); | 538 | nodes_clear(node_online_map); |
@@ -561,7 +542,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | |||
561 | 542 | ||
562 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", | 543 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", |
563 | start_pfn << PAGE_SHIFT, | 544 | start_pfn << PAGE_SHIFT, |
564 | end_pfn << PAGE_SHIFT); | 545 | last_pfn << PAGE_SHIFT); |
565 | /* setup dummy node covering all memory */ | 546 | /* setup dummy node covering all memory */ |
566 | memnode_shift = 63; | 547 | memnode_shift = 63; |
567 | memnodemap = memnode.embedded_map; | 548 | memnodemap = memnode.embedded_map; |
@@ -570,29 +551,8 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) | |||
570 | node_set(0, node_possible_map); | 551 | node_set(0, node_possible_map); |
571 | for (i = 0; i < NR_CPUS; i++) | 552 | for (i = 0; i < NR_CPUS; i++) |
572 | numa_set_node(i, 0); | 553 | numa_set_node(i, 0); |
573 | /* cpumask_of_cpu() may not be available during early startup */ | 554 | e820_register_active_regions(0, start_pfn, last_pfn); |
574 | memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); | 555 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); |
575 | cpu_set(0, node_to_cpumask_map[0]); | ||
576 | e820_register_active_regions(0, start_pfn, end_pfn); | ||
577 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); | ||
578 | } | ||
579 | |||
580 | __cpuinit void numa_add_cpu(int cpu) | ||
581 | { | ||
582 | set_bit(cpu, | ||
583 | (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]); | ||
584 | } | ||
585 | |||
586 | void __cpuinit numa_set_node(int cpu, int node) | ||
587 | { | ||
588 | int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; | ||
589 | |||
590 | if(cpu_to_node_map) | ||
591 | cpu_to_node_map[cpu] = node; | ||
592 | else if(per_cpu_offset(cpu)) | ||
593 | per_cpu(x86_cpu_to_node_map, cpu) = node; | ||
594 | else | ||
595 | Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); | ||
596 | } | 556 | } |
597 | 557 | ||
598 | unsigned long __init numa_free_all_bootmem(void) | 558 | unsigned long __init numa_free_all_bootmem(void) |
@@ -613,7 +573,7 @@ void __init paging_init(void) | |||
613 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); | 573 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); |
614 | max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; | 574 | max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; |
615 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; | 575 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; |
616 | max_zone_pfns[ZONE_NORMAL] = end_pfn; | 576 | max_zone_pfns[ZONE_NORMAL] = max_pfn; |
617 | 577 | ||
618 | sparse_memory_present_with_active_regions(MAX_NUMNODES); | 578 | sparse_memory_present_with_active_regions(MAX_NUMNODES); |
619 | sparse_init(); | 579 | sparse_init(); |
@@ -641,6 +601,7 @@ static __init int numa_setup(char *opt) | |||
641 | } | 601 | } |
642 | early_param("numa", numa_setup); | 602 | early_param("numa", numa_setup); |
643 | 603 | ||
604 | #ifdef CONFIG_NUMA | ||
644 | /* | 605 | /* |
645 | * Setup early cpu_to_node. | 606 | * Setup early cpu_to_node. |
646 | * | 607 | * |
@@ -652,14 +613,19 @@ early_param("numa", numa_setup); | |||
652 | * is already initialized in a round robin manner at numa_init_array, | 613 | * is already initialized in a round robin manner at numa_init_array, |
653 | * prior to this call, and this initialization is good enough | 614 | * prior to this call, and this initialization is good enough |
654 | * for the fake NUMA cases. | 615 | * for the fake NUMA cases. |
616 | * | ||
617 | * Called before the per_cpu areas are setup. | ||
655 | */ | 618 | */ |
656 | void __init init_cpu_to_node(void) | 619 | void __init init_cpu_to_node(void) |
657 | { | 620 | { |
658 | int i; | 621 | int cpu; |
622 | u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); | ||
659 | 623 | ||
660 | for (i = 0; i < NR_CPUS; i++) { | 624 | BUG_ON(cpu_to_apicid == NULL); |
625 | |||
626 | for_each_possible_cpu(cpu) { | ||
661 | int node; | 627 | int node; |
662 | u16 apicid = x86_cpu_to_apicid_init[i]; | 628 | u16 apicid = cpu_to_apicid[cpu]; |
663 | 629 | ||
664 | if (apicid == BAD_APICID) | 630 | if (apicid == BAD_APICID) |
665 | continue; | 631 | continue; |
@@ -668,8 +634,9 @@ void __init init_cpu_to_node(void) | |||
668 | continue; | 634 | continue; |
669 | if (!node_online(node)) | 635 | if (!node_online(node)) |
670 | continue; | 636 | continue; |
671 | numa_set_node(i, node); | 637 | numa_set_node(cpu, node); |
672 | } | 638 | } |
673 | } | 639 | } |
640 | #endif | ||
674 | 641 | ||
675 | 642 | ||
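
For context on memnode_shift/memnodemap used above: on x86-64 the node owning a
physical address is looked up (roughly) as memnodemap[paddr >> memnode_shift], which
is why the single fake node set up at the end of initmem_init() can get away with a
one-entry map. A worked instance of that fallback, restating the assignments above:

	memnode_shift = 63;	/* paddr >> 63 is 0 for any real physical address... */
	memnodemap[0] = 0;	/* ...so every address resolves to node 0 */
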
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 75f1b109aae8..e1d106909218 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c | |||
@@ -1,8 +1,8 @@ | |||
1 | /* | 1 | /* |
2 | * self test for change_page_attr. | 2 | * self test for change_page_attr. |
3 | * | 3 | * |
4 | * Clears the global bit on random pages in the direct mapping, then reverts | 4 | * Clears a test pte bit on random pages in the direct mapping, |
5 | * and compares page tables forwards and afterwards. | 5 | * then reverts and compares page tables forwards and afterwards. |
6 | */ | 6 | */ |
7 | #include <linux/bootmem.h> | 7 | #include <linux/bootmem.h> |
8 | #include <linux/kthread.h> | 8 | #include <linux/kthread.h> |
@@ -32,6 +32,13 @@ enum { | |||
32 | GPS = (1<<30) | 32 | GPS = (1<<30) |
33 | }; | 33 | }; |
34 | 34 | ||
35 | #define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST) | ||
36 | |||
37 | static int pte_testbit(pte_t pte) | ||
38 | { | ||
39 | return pte_flags(pte) & _PAGE_UNUSED1; | ||
40 | } | ||
41 | |||
35 | struct split_state { | 42 | struct split_state { |
36 | long lpg, gpg, spg, exec; | 43 | long lpg, gpg, spg, exec; |
37 | long min_exec, max_exec; | 44 | long min_exec, max_exec; |
@@ -111,6 +118,7 @@ static int pageattr_test(void) | |||
111 | unsigned int level; | 118 | unsigned int level; |
112 | int i, k; | 119 | int i, k; |
113 | int err; | 120 | int err; |
121 | unsigned long test_addr; | ||
114 | 122 | ||
115 | if (print) | 123 | if (print) |
116 | printk(KERN_INFO "CPA self-test:\n"); | 124 | printk(KERN_INFO "CPA self-test:\n"); |
@@ -165,15 +173,15 @@ static int pageattr_test(void) | |||
165 | continue; | 173 | continue; |
166 | } | 174 | } |
167 | 175 | ||
168 | err = change_page_attr_clear(addr[i], len[i], | 176 | test_addr = addr[i]; |
169 | __pgprot(_PAGE_GLOBAL)); | 177 | err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0); |
170 | if (err < 0) { | 178 | if (err < 0) { |
171 | printk(KERN_ERR "CPA %d failed %d\n", i, err); | 179 | printk(KERN_ERR "CPA %d failed %d\n", i, err); |
172 | failed++; | 180 | failed++; |
173 | } | 181 | } |
174 | 182 | ||
175 | pte = lookup_address(addr[i], &level); | 183 | pte = lookup_address(addr[i], &level); |
176 | if (!pte || pte_global(*pte) || pte_huge(*pte)) { | 184 | if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) { |
177 | printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], | 185 | printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], |
178 | pte ? (u64)pte_val(*pte) : 0ULL); | 186 | pte ? (u64)pte_val(*pte) : 0ULL); |
179 | failed++; | 187 | failed++; |
@@ -198,14 +206,14 @@ static int pageattr_test(void) | |||
198 | failed++; | 206 | failed++; |
199 | continue; | 207 | continue; |
200 | } | 208 | } |
201 | err = change_page_attr_set(addr[i], len[i], | 209 | test_addr = addr[i]; |
202 | __pgprot(_PAGE_GLOBAL)); | 210 | err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0); |
203 | if (err < 0) { | 211 | if (err < 0) { |
204 | printk(KERN_ERR "CPA reverting failed: %d\n", err); | 212 | printk(KERN_ERR "CPA reverting failed: %d\n", err); |
205 | failed++; | 213 | failed++; |
206 | } | 214 | } |
207 | pte = lookup_address(addr[i], &level); | 215 | pte = lookup_address(addr[i], &level); |
208 | if (!pte || !pte_global(*pte)) { | 216 | if (!pte || pte_testbit(*pte)) { |
209 | printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", | 217 | printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", |
210 | addr[i], pte ? (u64)pte_val(*pte) : 0ULL); | 218 | addr[i], pte ? (u64)pte_val(*pte) : 0ULL); |
211 | failed++; | 219 | failed++; |
@@ -216,8 +224,7 @@ static int pageattr_test(void) | |||
216 | failed += print_split(&sc); | 224 | failed += print_split(&sc); |
217 | 225 | ||
218 | if (failed) { | 226 | if (failed) { |
219 | printk(KERN_ERR "NOT PASSED. Please report.\n"); | 227 | WARN(1, KERN_ERR "NOT PASSED. Please report.\n"); |
220 | WARN_ON(1); | ||
221 | return -EINVAL; | 228 | return -EINVAL; |
222 | } else { | 229 | } else { |
223 | if (print) | 230 | if (print) |
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 60bcb5b6a37e..a9ec89c3fbca 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
@@ -25,15 +25,68 @@ | |||
25 | * The current flushing context - we pass it instead of 5 arguments: | 25 | * The current flushing context - we pass it instead of 5 arguments: |
26 | */ | 26 | */ |
27 | struct cpa_data { | 27 | struct cpa_data { |
28 | unsigned long vaddr; | 28 | unsigned long *vaddr; |
29 | pgprot_t mask_set; | 29 | pgprot_t mask_set; |
30 | pgprot_t mask_clr; | 30 | pgprot_t mask_clr; |
31 | int numpages; | 31 | int numpages; |
32 | int flushtlb; | 32 | int flags; |
33 | unsigned long pfn; | 33 | unsigned long pfn; |
34 | unsigned force_split : 1; | 34 | unsigned force_split : 1; |
35 | int curpage; | ||
35 | }; | 36 | }; |
36 | 37 | ||
38 | /* | ||
39 | * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) | ||
40 | * using cpa_lock, so that we don't allow any other CPU, with stale large TLB | ||
41 | * entries, to change the page attributes in parallel with some other CPU | ||
42 | * splitting a large page entry along with changing the attribute. | ||
43 | */ | ||
44 | static DEFINE_SPINLOCK(cpa_lock); | ||
45 | |||
46 | #define CPA_FLUSHTLB 1 | ||
47 | #define CPA_ARRAY 2 | ||
48 | |||
49 | #ifdef CONFIG_PROC_FS | ||
50 | static unsigned long direct_pages_count[PG_LEVEL_NUM]; | ||
51 | |||
52 | void update_page_count(int level, unsigned long pages) | ||
53 | { | ||
54 | unsigned long flags; | ||
55 | |||
56 | /* Protect against CPA */ | ||
57 | spin_lock_irqsave(&pgd_lock, flags); | ||
58 | direct_pages_count[level] += pages; | ||
59 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
60 | } | ||
61 | |||
62 | static void split_page_count(int level) | ||
63 | { | ||
64 | direct_pages_count[level]--; | ||
65 | direct_pages_count[level - 1] += PTRS_PER_PTE; | ||
66 | } | ||
67 | |||
68 | int arch_report_meminfo(char *page) | ||
69 | { | ||
70 | int n = sprintf(page, "DirectMap4k: %8lu kB\n", | ||
71 | direct_pages_count[PG_LEVEL_4K] << 2); | ||
72 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) | ||
73 | n += sprintf(page + n, "DirectMap2M: %8lu kB\n", | ||
74 | direct_pages_count[PG_LEVEL_2M] << 11); | ||
75 | #else | ||
76 | n += sprintf(page + n, "DirectMap4M: %8lu kB\n", | ||
77 | direct_pages_count[PG_LEVEL_2M] << 12); | ||
78 | #endif | ||
79 | #ifdef CONFIG_X86_64 | ||
80 | if (direct_gbpages) | ||
81 | n += sprintf(page + n, "DirectMap1G: %8lu kB\n", | ||
82 | direct_pages_count[PG_LEVEL_1G] << 20); | ||
83 | #endif | ||
84 | return n; | ||
85 | } | ||
86 | #else | ||
87 | static inline void split_page_count(int level) { } | ||
88 | #endif | ||
89 | |||
37 | #ifdef CONFIG_X86_64 | 90 | #ifdef CONFIG_X86_64 |
38 | 91 | ||
39 | static inline unsigned long highmap_start_pfn(void) | 92 | static inline unsigned long highmap_start_pfn(void) |
@@ -43,7 +96,7 @@ static inline unsigned long highmap_start_pfn(void) | |||
43 | 96 | ||
44 | static inline unsigned long highmap_end_pfn(void) | 97 | static inline unsigned long highmap_end_pfn(void) |
45 | { | 98 | { |
46 | return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; | 99 | return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; |
47 | } | 100 | } |
48 | 101 | ||
49 | #endif | 102 | #endif |
@@ -106,7 +159,7 @@ static void cpa_flush_all(unsigned long cache) | |||
106 | { | 159 | { |
107 | BUG_ON(irqs_disabled()); | 160 | BUG_ON(irqs_disabled()); |
108 | 161 | ||
109 | on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); | 162 | on_each_cpu(__cpa_flush_all, (void *) cache, 1); |
110 | } | 163 | } |
111 | 164 | ||
112 | static void __cpa_flush_range(void *arg) | 165 | static void __cpa_flush_range(void *arg) |
@@ -127,7 +180,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) | |||
127 | BUG_ON(irqs_disabled()); | 180 | BUG_ON(irqs_disabled()); |
128 | WARN_ON(PAGE_ALIGN(start) != start); | 181 | WARN_ON(PAGE_ALIGN(start) != start); |
129 | 182 | ||
130 | on_each_cpu(__cpa_flush_range, NULL, 1, 1); | 183 | on_each_cpu(__cpa_flush_range, NULL, 1); |
131 | 184 | ||
132 | if (!cache) | 185 | if (!cache) |
133 | return; | 186 | return; |
@@ -149,6 +202,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) | |||
149 | } | 202 | } |
150 | } | 203 | } |
151 | 204 | ||
205 | static void cpa_flush_array(unsigned long *start, int numpages, int cache) | ||
206 | { | ||
207 | unsigned int i, level; | ||
208 | unsigned long *addr; | ||
209 | |||
210 | BUG_ON(irqs_disabled()); | ||
211 | |||
212 | on_each_cpu(__cpa_flush_range, NULL, 1); | ||
213 | |||
214 | if (!cache) | ||
215 | return; | ||
216 | |||
217 | /* 4M threshold */ | ||
218 | if (numpages >= 1024) { | ||
219 | if (boot_cpu_data.x86_model >= 4) | ||
220 | wbinvd(); | ||
221 | return; | ||
222 | } | ||
223 | /* | ||
224 | * We only need to flush on one CPU, | ||
225 | * clflush is a MESI-coherent instruction that | ||
226 | * will cause all other CPUs to flush the same | ||
227 | * cachelines: | ||
228 | */ | ||
229 | for (i = 0, addr = start; i < numpages; i++, addr++) { | ||
230 | pte_t *pte = lookup_address(*addr, &level); | ||
231 | |||
232 | /* | ||
233 | * Only flush present addresses: | ||
234 | */ | ||
235 | if (pte && (pte_val(*pte) & _PAGE_PRESENT)) | ||
236 | clflush_cache_range((void *) *addr, PAGE_SIZE); | ||
237 | } | ||
238 | } | ||
239 | |||
152 | /* | 240 | /* |
153 | * Certain areas of memory on x86 require very specific protection flags, | 241 | * Certain areas of memory on x86 require very specific protection flags, |
154 | * for example the BIOS area or kernel text. Callers don't always get this | 242 | * for example the BIOS area or kernel text. Callers don't always get this |
@@ -227,6 +315,7 @@ pte_t *lookup_address(unsigned long address, unsigned int *level) | |||
227 | 315 | ||
228 | return pte_offset_kernel(pmd, address); | 316 | return pte_offset_kernel(pmd, address); |
229 | } | 317 | } |
318 | EXPORT_SYMBOL_GPL(lookup_address); | ||
230 | 319 | ||
231 | /* | 320 | /* |
232 | * Set the new pmd in all the pgds we know about: | 321 | * Set the new pmd in all the pgds we know about: |
@@ -356,7 +445,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, | |||
356 | */ | 445 | */ |
357 | new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); | 446 | new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); |
358 | __set_pmd_pte(kpte, address, new_pte); | 447 | __set_pmd_pte(kpte, address, new_pte); |
359 | cpa->flushtlb = 1; | 448 | cpa->flags |= CPA_FLUSHTLB; |
360 | do_split = 0; | 449 | do_split = 0; |
361 | } | 450 | } |
362 | 451 | ||
@@ -366,84 +455,6 @@ out_unlock: | |||
366 | return do_split; | 455 | return do_split; |
367 | } | 456 | } |
368 | 457 | ||
369 | static LIST_HEAD(page_pool); | ||
370 | static unsigned long pool_size, pool_pages, pool_low; | ||
371 | static unsigned long pool_used, pool_failed; | ||
372 | |||
373 | static void cpa_fill_pool(struct page **ret) | ||
374 | { | ||
375 | gfp_t gfp = GFP_KERNEL; | ||
376 | unsigned long flags; | ||
377 | struct page *p; | ||
378 | |||
379 | /* | ||
380 | * Avoid recursion (on debug-pagealloc) and also signal | ||
381 | * our priority to get to these pagetables: | ||
382 | */ | ||
383 | if (current->flags & PF_MEMALLOC) | ||
384 | return; | ||
385 | current->flags |= PF_MEMALLOC; | ||
386 | |||
387 | /* | ||
388 | * Allocate atomically from atomic contexts: | ||
389 | */ | ||
390 | if (in_atomic() || irqs_disabled() || debug_pagealloc) | ||
391 | gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; | ||
392 | |||
393 | while (pool_pages < pool_size || (ret && !*ret)) { | ||
394 | p = alloc_pages(gfp, 0); | ||
395 | if (!p) { | ||
396 | pool_failed++; | ||
397 | break; | ||
398 | } | ||
399 | /* | ||
400 | * If the call site needs a page right now, provide it: | ||
401 | */ | ||
402 | if (ret && !*ret) { | ||
403 | *ret = p; | ||
404 | continue; | ||
405 | } | ||
406 | spin_lock_irqsave(&pgd_lock, flags); | ||
407 | list_add(&p->lru, &page_pool); | ||
408 | pool_pages++; | ||
409 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
410 | } | ||
411 | |||
412 | current->flags &= ~PF_MEMALLOC; | ||
413 | } | ||
414 | |||
415 | #define SHIFT_MB (20 - PAGE_SHIFT) | ||
416 | #define ROUND_MB_GB ((1 << 10) - 1) | ||
417 | #define SHIFT_MB_GB 10 | ||
418 | #define POOL_PAGES_PER_GB 16 | ||
419 | |||
420 | void __init cpa_init(void) | ||
421 | { | ||
422 | struct sysinfo si; | ||
423 | unsigned long gb; | ||
424 | |||
425 | si_meminfo(&si); | ||
426 | /* | ||
427 | * Calculate the number of pool pages: | ||
428 | * | ||
429 | * Convert totalram (nr of pages) to MiB and round to the next | ||
430 | * GiB. Shift MiB to Gib and multiply the result by | ||
431 | * POOL_PAGES_PER_GB: | ||
432 | */ | ||
433 | if (debug_pagealloc) { | ||
434 | gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; | ||
435 | pool_size = POOL_PAGES_PER_GB * gb; | ||
436 | } else { | ||
437 | pool_size = 1; | ||
438 | } | ||
439 | pool_low = pool_size; | ||
440 | |||
441 | cpa_fill_pool(NULL); | ||
442 | printk(KERN_DEBUG | ||
443 | "CPA: page pool initialized %lu of %lu pages preallocated\n", | ||
444 | pool_pages, pool_size); | ||
445 | } | ||
446 | |||
447 | static int split_large_page(pte_t *kpte, unsigned long address) | 458 | static int split_large_page(pte_t *kpte, unsigned long address) |
448 | { | 459 | { |
449 | unsigned long flags, pfn, pfninc = 1; | 460 | unsigned long flags, pfn, pfninc = 1; |
@@ -452,28 +463,15 @@ static int split_large_page(pte_t *kpte, unsigned long address) | |||
452 | pgprot_t ref_prot; | 463 | pgprot_t ref_prot; |
453 | struct page *base; | 464 | struct page *base; |
454 | 465 | ||
455 | /* | 466 | if (!debug_pagealloc) |
456 | * Get a page from the pool. The pool list is protected by the | 467 | spin_unlock(&cpa_lock); |
457 | * pgd_lock, which we have to take anyway for the split | 468 | base = alloc_pages(GFP_KERNEL, 0); |
458 | * operation: | 469 | if (!debug_pagealloc) |
459 | */ | 470 | spin_lock(&cpa_lock); |
460 | spin_lock_irqsave(&pgd_lock, flags); | 471 | if (!base) |
461 | if (list_empty(&page_pool)) { | 472 | return -ENOMEM; |
462 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
463 | base = NULL; | ||
464 | cpa_fill_pool(&base); | ||
465 | if (!base) | ||
466 | return -ENOMEM; | ||
467 | spin_lock_irqsave(&pgd_lock, flags); | ||
468 | } else { | ||
469 | base = list_first_entry(&page_pool, struct page, lru); | ||
470 | list_del(&base->lru); | ||
471 | pool_pages--; | ||
472 | |||
473 | if (pool_pages < pool_low) | ||
474 | pool_low = pool_pages; | ||
475 | } | ||
476 | 473 | ||
474 | spin_lock_irqsave(&pgd_lock, flags); | ||
477 | /* | 475 | /* |
478 | * Check for races, another CPU might have split this page | 476 | * Check for races, another CPU might have split this page |
479 | * up for us already: | 477 | * up for us already: |
@@ -500,6 +498,16 @@ static int split_large_page(pte_t *kpte, unsigned long address) | |||
500 | for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) | 498 | for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) |
501 | set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); | 499 | set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); |
502 | 500 | ||
501 | if (address >= (unsigned long)__va(0) && | ||
502 | address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT)) | ||
503 | split_page_count(level); | ||
504 | |||
505 | #ifdef CONFIG_X86_64 | ||
506 | if (address >= (unsigned long)__va(1UL<<32) && | ||
507 | address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT)) | ||
508 | split_page_count(level); | ||
509 | #endif | ||
510 | |||
503 | /* | 511 | /* |
504 | * Install the new, split up pagetable. Important details here: | 512 | * Install the new, split up pagetable. Important details here: |
505 | * | 513 | * |
@@ -520,11 +528,8 @@ out_unlock: | |||
520 | * If we dropped out via the lookup_address check under | 528 | * If we dropped out via the lookup_address check under |
521 | * pgd_lock then stick the page back into the pool: | 529 | * pgd_lock then stick the page back into the pool: |
522 | */ | 530 | */ |
523 | if (base) { | 531 | if (base) |
524 | list_add(&base->lru, &page_pool); | 532 | __free_page(base); |
525 | pool_pages++; | ||
526 | } else | ||
527 | pool_used++; | ||
528 | spin_unlock_irqrestore(&pgd_lock, flags); | 533 | spin_unlock_irqrestore(&pgd_lock, flags); |
529 | 534 | ||
530 | return 0; | 535 | return 0; |
@@ -532,11 +537,16 @@ out_unlock: | |||
532 | 537 | ||
533 | static int __change_page_attr(struct cpa_data *cpa, int primary) | 538 | static int __change_page_attr(struct cpa_data *cpa, int primary) |
534 | { | 539 | { |
535 | unsigned long address = cpa->vaddr; | 540 | unsigned long address; |
536 | int do_split, err; | 541 | int do_split, err; |
537 | unsigned int level; | 542 | unsigned int level; |
538 | pte_t *kpte, old_pte; | 543 | pte_t *kpte, old_pte; |
539 | 544 | ||
545 | if (cpa->flags & CPA_ARRAY) | ||
546 | address = cpa->vaddr[cpa->curpage]; | ||
547 | else | ||
548 | address = *cpa->vaddr; | ||
549 | |||
540 | repeat: | 550 | repeat: |
541 | kpte = lookup_address(address, &level); | 551 | kpte = lookup_address(address, &level); |
542 | if (!kpte) | 552 | if (!kpte) |
@@ -546,10 +556,9 @@ repeat: | |||
546 | if (!pte_val(old_pte)) { | 556 | if (!pte_val(old_pte)) { |
547 | if (!primary) | 557 | if (!primary) |
548 | return 0; | 558 | return 0; |
549 | printk(KERN_WARNING "CPA: called for zero pte. " | 559 | WARN(1, KERN_WARNING "CPA: called for zero pte. " |
550 | "vaddr = %lx cpa->vaddr = %lx\n", address, | 560 | "vaddr = %lx cpa->vaddr = %lx\n", address, |
551 | cpa->vaddr); | 561 | *cpa->vaddr); |
552 | WARN_ON(1); | ||
553 | return -EINVAL; | 562 | return -EINVAL; |
554 | } | 563 | } |
555 | 564 | ||
@@ -575,7 +584,7 @@ repeat: | |||
575 | */ | 584 | */ |
576 | if (pte_val(old_pte) != pte_val(new_pte)) { | 585 | if (pte_val(old_pte) != pte_val(new_pte)) { |
577 | set_pte_atomic(kpte, new_pte); | 586 | set_pte_atomic(kpte, new_pte); |
578 | cpa->flushtlb = 1; | 587 | cpa->flags |= CPA_FLUSHTLB; |
579 | } | 588 | } |
580 | cpa->numpages = 1; | 589 | cpa->numpages = 1; |
581 | return 0; | 590 | return 0; |
@@ -599,7 +608,25 @@ repeat: | |||
599 | */ | 608 | */ |
600 | err = split_large_page(kpte, address); | 609 | err = split_large_page(kpte, address); |
601 | if (!err) { | 610 | if (!err) { |
602 | cpa->flushtlb = 1; | 611 | /* |
612 | * Do a global TLB flush after splitting the large page | ||
613 | * and before we do the actual page attribute change in the PTE. | ||
614 | * | ||
615 | * Without this, we violate the TLB application note, which says | ||
616 | * "The TLBs may contain both ordinary and large-page | ||
617 | * translations for a 4-KByte range of linear addresses. This | ||
618 | * may occur if software modifies the paging structures so that | ||
619 | * the page size used for the address range changes. If the two | ||
620 | * translations differ with respect to page frame or attributes | ||
621 | * (e.g., permissions), processor behavior is undefined and may | ||
622 | * be implementation-specific." | ||
623 | * | ||
624 | * We do this global TLB flush inside the cpa_lock, so that we | ||
625 | * don't allow any other CPU with stale TLB entries to change, in | ||
626 | * parallel, the page attribute of a page that also falls into the | ||
627 | * just-split large page entry. | ||
628 | */ | ||
629 | flush_tlb_all(); | ||
603 | goto repeat; | 630 | goto repeat; |
604 | } | 631 | } |
605 | 632 | ||
@@ -612,19 +639,37 @@ static int cpa_process_alias(struct cpa_data *cpa) | |||
612 | { | 639 | { |
613 | struct cpa_data alias_cpa; | 640 | struct cpa_data alias_cpa; |
614 | int ret = 0; | 641 | int ret = 0; |
642 | unsigned long temp_cpa_vaddr, vaddr; | ||
615 | 643 | ||
616 | if (cpa->pfn > max_pfn_mapped) | 644 | if (cpa->pfn >= max_pfn_mapped) |
617 | return 0; | 645 | return 0; |
618 | 646 | ||
647 | #ifdef CONFIG_X86_64 | ||
648 | if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT))) | ||
649 | return 0; | ||
650 | #endif | ||
619 | /* | 651 | /* |
620 | * No need to redo, when the primary call touched the direct | 652 | * No need to redo, when the primary call touched the direct |
621 | * mapping already: | 653 | * mapping already: |
622 | */ | 654 | */ |
623 | if (!within(cpa->vaddr, PAGE_OFFSET, | 655 | if (cpa->flags & CPA_ARRAY) |
624 | PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { | 656 | vaddr = cpa->vaddr[cpa->curpage]; |
657 | else | ||
658 | vaddr = *cpa->vaddr; | ||
659 | |||
660 | if (!(within(vaddr, PAGE_OFFSET, | ||
661 | PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) | ||
662 | #ifdef CONFIG_X86_64 | ||
663 | || within(vaddr, PAGE_OFFSET + (1UL<<32), | ||
664 | PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) | ||
665 | #endif | ||
666 | )) { | ||
625 | 667 | ||
626 | alias_cpa = *cpa; | 668 | alias_cpa = *cpa; |
627 | alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); | 669 | temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); |
670 | alias_cpa.vaddr = &temp_cpa_vaddr; | ||
671 | alias_cpa.flags &= ~CPA_ARRAY; | ||
672 | |||
628 | 673 | ||
629 | ret = __change_page_attr_set_clr(&alias_cpa, 0); | 674 | ret = __change_page_attr_set_clr(&alias_cpa, 0); |
630 | } | 675 | } |
@@ -636,7 +681,7 @@ static int cpa_process_alias(struct cpa_data *cpa) | |||
636 | * No need to redo, when the primary call touched the high | 681 | * No need to redo, when the primary call touched the high |
637 | * mapping already: | 682 | * mapping already: |
638 | */ | 683 | */ |
639 | if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) | 684 | if (within(vaddr, (unsigned long) _text, (unsigned long) _end)) |
640 | return 0; | 685 | return 0; |
641 | 686 | ||
642 | /* | 687 | /* |
@@ -647,8 +692,9 @@ static int cpa_process_alias(struct cpa_data *cpa) | |||
647 | return 0; | 692 | return 0; |
648 | 693 | ||
649 | alias_cpa = *cpa; | 694 | alias_cpa = *cpa; |
650 | alias_cpa.vaddr = | 695 | temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; |
651 | (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; | 696 | alias_cpa.vaddr = &temp_cpa_vaddr; |
697 | alias_cpa.flags &= ~CPA_ARRAY; | ||
652 | 698 | ||
653 | /* | 699 | /* |
654 | * The high mapping range is imprecise, so ignore the return value. | 700 | * The high mapping range is imprecise, so ignore the return value. |
@@ -668,8 +714,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) | |||
668 | * preservation check. | 714 | * preservation check. |
669 | */ | 715 | */ |
670 | cpa->numpages = numpages; | 716 | cpa->numpages = numpages; |
717 | /* for array changes, we can't use large pages */ | ||
718 | if (cpa->flags & CPA_ARRAY) | ||
719 | cpa->numpages = 1; | ||
671 | 720 | ||
721 | if (!debug_pagealloc) | ||
722 | spin_lock(&cpa_lock); | ||
672 | ret = __change_page_attr(cpa, checkalias); | 723 | ret = __change_page_attr(cpa, checkalias); |
724 | if (!debug_pagealloc) | ||
725 | spin_unlock(&cpa_lock); | ||
673 | if (ret) | 726 | if (ret) |
674 | return ret; | 727 | return ret; |
675 | 728 | ||
@@ -686,7 +739,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) | |||
686 | */ | 739 | */ |
687 | BUG_ON(cpa->numpages > numpages); | 740 | BUG_ON(cpa->numpages > numpages); |
688 | numpages -= cpa->numpages; | 741 | numpages -= cpa->numpages; |
689 | cpa->vaddr += cpa->numpages * PAGE_SIZE; | 742 | if (cpa->flags & CPA_ARRAY) |
743 | cpa->curpage++; | ||
744 | else | ||
745 | *cpa->vaddr += cpa->numpages * PAGE_SIZE; | ||
746 | |||
690 | } | 747 | } |
691 | return 0; | 748 | return 0; |
692 | } | 749 | } |
@@ -697,9 +754,9 @@ static inline int cache_attr(pgprot_t attr) | |||
697 | (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); | 754 | (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); |
698 | } | 755 | } |
699 | 756 | ||
700 | static int change_page_attr_set_clr(unsigned long addr, int numpages, | 757 | static int change_page_attr_set_clr(unsigned long *addr, int numpages, |
701 | pgprot_t mask_set, pgprot_t mask_clr, | 758 | pgprot_t mask_set, pgprot_t mask_clr, |
702 | int force_split) | 759 | int force_split, int array) |
703 | { | 760 | { |
704 | struct cpa_data cpa; | 761 | struct cpa_data cpa; |
705 | int ret, cache, checkalias; | 762 | int ret, cache, checkalias; |
@@ -714,21 +771,38 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, | |||
714 | return 0; | 771 | return 0; |
715 | 772 | ||
716 | /* Ensure we are PAGE_SIZE aligned */ | 773 | /* Ensure we are PAGE_SIZE aligned */ |
717 | if (addr & ~PAGE_MASK) { | 774 | if (!array) { |
718 | addr &= PAGE_MASK; | 775 | if (*addr & ~PAGE_MASK) { |
719 | /* | 776 | *addr &= PAGE_MASK; |
720 | * People should not be passing in unaligned addresses: | 777 | /* |
721 | */ | 778 | * People should not be passing in unaligned addresses: |
722 | WARN_ON_ONCE(1); | 779 | */ |
780 | WARN_ON_ONCE(1); | ||
781 | } | ||
782 | } else { | ||
783 | int i; | ||
784 | for (i = 0; i < numpages; i++) { | ||
785 | if (addr[i] & ~PAGE_MASK) { | ||
786 | addr[i] &= PAGE_MASK; | ||
787 | WARN_ON_ONCE(1); | ||
788 | } | ||
789 | } | ||
723 | } | 790 | } |
724 | 791 | ||
792 | /* Must avoid aliasing mappings in the highmem code */ | ||
793 | kmap_flush_unused(); | ||
794 | |||
725 | cpa.vaddr = addr; | 795 | cpa.vaddr = addr; |
726 | cpa.numpages = numpages; | 796 | cpa.numpages = numpages; |
727 | cpa.mask_set = mask_set; | 797 | cpa.mask_set = mask_set; |
728 | cpa.mask_clr = mask_clr; | 798 | cpa.mask_clr = mask_clr; |
729 | cpa.flushtlb = 0; | 799 | cpa.flags = 0; |
800 | cpa.curpage = 0; | ||
730 | cpa.force_split = force_split; | 801 | cpa.force_split = force_split; |
731 | 802 | ||
803 | if (array) | ||
804 | cpa.flags |= CPA_ARRAY; | ||
805 | |||
732 | /* No alias checking for _NX bit modifications */ | 806 | /* No alias checking for _NX bit modifications */ |
733 | checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; | 807 | checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; |
734 | 808 | ||
@@ -737,7 +811,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, | |||
737 | /* | 811 | /* |
738 | * Check whether we really changed something: | 812 | * Check whether we really changed something: |
739 | */ | 813 | */ |
740 | if (!cpa.flushtlb) | 814 | if (!(cpa.flags & CPA_FLUSHTLB)) |
741 | goto out; | 815 | goto out; |
742 | 816 | ||
743 | /* | 817 | /* |
@@ -752,27 +826,30 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, | |||
752 | * error case we fall back to cpa_flush_all (which uses | 826 | * error case we fall back to cpa_flush_all (which uses |
753 | * wbinvd): | 827 | * wbinvd): |
754 | */ | 828 | */ |
755 | if (!ret && cpu_has_clflush) | 829 | if (!ret && cpu_has_clflush) { |
756 | cpa_flush_range(addr, numpages, cache); | 830 | if (cpa.flags & CPA_ARRAY) |
757 | else | 831 | cpa_flush_array(addr, numpages, cache); |
832 | else | ||
833 | cpa_flush_range(*addr, numpages, cache); | ||
834 | } else | ||
758 | cpa_flush_all(cache); | 835 | cpa_flush_all(cache); |
759 | 836 | ||
760 | out: | 837 | out: |
761 | cpa_fill_pool(NULL); | ||
762 | |||
763 | return ret; | 838 | return ret; |
764 | } | 839 | } |
765 | 840 | ||
766 | static inline int change_page_attr_set(unsigned long addr, int numpages, | 841 | static inline int change_page_attr_set(unsigned long *addr, int numpages, |
767 | pgprot_t mask) | 842 | pgprot_t mask, int array) |
768 | { | 843 | { |
769 | return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); | 844 | return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, |
845 | array); | ||
770 | } | 846 | } |
771 | 847 | ||
772 | static inline int change_page_attr_clear(unsigned long addr, int numpages, | 848 | static inline int change_page_attr_clear(unsigned long *addr, int numpages, |
773 | pgprot_t mask) | 849 | pgprot_t mask, int array) |
774 | { | 850 | { |
775 | return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); | 851 | return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, |
852 | array); | ||
776 | } | 853 | } |
777 | 854 | ||
778 | int _set_memory_uc(unsigned long addr, int numpages) | 855 | int _set_memory_uc(unsigned long addr, int numpages) |
@@ -780,8 +857,8 @@ int _set_memory_uc(unsigned long addr, int numpages) | |||
780 | /* | 857 | /* |
781 | * for now UC MINUS. see comments in ioremap_nocache() | 858 | * for now UC MINUS. see comments in ioremap_nocache() |
782 | */ | 859 | */ |
783 | return change_page_attr_set(addr, numpages, | 860 | return change_page_attr_set(&addr, numpages, |
784 | __pgprot(_PAGE_CACHE_UC_MINUS)); | 861 | __pgprot(_PAGE_CACHE_UC_MINUS), 0); |
785 | } | 862 | } |
786 | 863 | ||
787 | int set_memory_uc(unsigned long addr, int numpages) | 864 | int set_memory_uc(unsigned long addr, int numpages) |
@@ -789,7 +866,7 @@ int set_memory_uc(unsigned long addr, int numpages) | |||
789 | /* | 866 | /* |
790 | * for now UC MINUS. see comments in ioremap_nocache() | 867 | * for now UC MINUS. see comments in ioremap_nocache() |
791 | */ | 868 | */ |
792 | if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, | 869 | if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, |
793 | _PAGE_CACHE_UC_MINUS, NULL)) | 870 | _PAGE_CACHE_UC_MINUS, NULL)) |
794 | return -EINVAL; | 871 | return -EINVAL; |
795 | 872 | ||
@@ -797,18 +874,56 @@ int set_memory_uc(unsigned long addr, int numpages) | |||
797 | } | 874 | } |
798 | EXPORT_SYMBOL(set_memory_uc); | 875 | EXPORT_SYMBOL(set_memory_uc); |
799 | 876 | ||
877 | int set_memory_array_uc(unsigned long *addr, int addrinarray) | ||
878 | { | ||
879 | unsigned long start; | ||
880 | unsigned long end; | ||
881 | int i; | ||
882 | /* | ||
883 | * for now UC MINUS. see comments in ioremap_nocache() | ||
884 | */ | ||
885 | for (i = 0; i < addrinarray; i++) { | ||
886 | start = __pa(addr[i]); | ||
887 | for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { | ||
888 | if (end != __pa(addr[i + 1])) | ||
889 | break; | ||
890 | i++; | ||
891 | } | ||
892 | if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) | ||
893 | goto out; | ||
894 | } | ||
895 | |||
896 | return change_page_attr_set(addr, addrinarray, | ||
897 | __pgprot(_PAGE_CACHE_UC_MINUS), 1); | ||
898 | out: | ||
899 | for (i = 0; i < addrinarray; i++) { | ||
900 | unsigned long tmp = __pa(addr[i]); | ||
901 | |||
902 | if (tmp == start) | ||
903 | break; | ||
904 | for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { | ||
905 | if (end != __pa(addr[i + 1])) | ||
906 | break; | ||
907 | i++; | ||
908 | } | ||
909 | free_memtype(tmp, end); | ||
910 | } | ||
911 | return -EINVAL; | ||
912 | } | ||
913 | EXPORT_SYMBOL(set_memory_array_uc); | ||
914 | |||
800 | int _set_memory_wc(unsigned long addr, int numpages) | 915 | int _set_memory_wc(unsigned long addr, int numpages) |
801 | { | 916 | { |
802 | return change_page_attr_set(addr, numpages, | 917 | return change_page_attr_set(&addr, numpages, |
803 | __pgprot(_PAGE_CACHE_WC)); | 918 | __pgprot(_PAGE_CACHE_WC), 0); |
804 | } | 919 | } |
805 | 920 | ||
806 | int set_memory_wc(unsigned long addr, int numpages) | 921 | int set_memory_wc(unsigned long addr, int numpages) |
807 | { | 922 | { |
808 | if (!pat_wc_enabled) | 923 | if (!pat_enabled) |
809 | return set_memory_uc(addr, numpages); | 924 | return set_memory_uc(addr, numpages); |
810 | 925 | ||
811 | if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, | 926 | if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, |
812 | _PAGE_CACHE_WC, NULL)) | 927 | _PAGE_CACHE_WC, NULL)) |
813 | return -EINVAL; | 928 | return -EINVAL; |
814 | 929 | ||
@@ -818,49 +933,71 @@ EXPORT_SYMBOL(set_memory_wc); | |||
818 | 933 | ||
819 | int _set_memory_wb(unsigned long addr, int numpages) | 934 | int _set_memory_wb(unsigned long addr, int numpages) |
820 | { | 935 | { |
821 | return change_page_attr_clear(addr, numpages, | 936 | return change_page_attr_clear(&addr, numpages, |
822 | __pgprot(_PAGE_CACHE_MASK)); | 937 | __pgprot(_PAGE_CACHE_MASK), 0); |
823 | } | 938 | } |
824 | 939 | ||
825 | int set_memory_wb(unsigned long addr, int numpages) | 940 | int set_memory_wb(unsigned long addr, int numpages) |
826 | { | 941 | { |
827 | free_memtype(addr, addr + numpages * PAGE_SIZE); | 942 | free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); |
828 | 943 | ||
829 | return _set_memory_wb(addr, numpages); | 944 | return _set_memory_wb(addr, numpages); |
830 | } | 945 | } |
831 | EXPORT_SYMBOL(set_memory_wb); | 946 | EXPORT_SYMBOL(set_memory_wb); |
832 | 947 | ||
948 | int set_memory_array_wb(unsigned long *addr, int addrinarray) | ||
949 | { | ||
950 | int i; | ||
951 | |||
952 | for (i = 0; i < addrinarray; i++) { | ||
953 | unsigned long start = __pa(addr[i]); | ||
954 | unsigned long end; | ||
955 | |||
956 | for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { | ||
957 | if (end != __pa(addr[i + 1])) | ||
958 | break; | ||
959 | i++; | ||
960 | } | ||
961 | free_memtype(start, end); | ||
962 | } | ||
963 | return change_page_attr_clear(addr, addrinarray, | ||
964 | __pgprot(_PAGE_CACHE_MASK), 1); | ||
965 | } | ||
966 | EXPORT_SYMBOL(set_memory_array_wb); | ||
967 | |||
833 | int set_memory_x(unsigned long addr, int numpages) | 968 | int set_memory_x(unsigned long addr, int numpages) |
834 | { | 969 | { |
835 | return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); | 970 | return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); |
836 | } | 971 | } |
837 | EXPORT_SYMBOL(set_memory_x); | 972 | EXPORT_SYMBOL(set_memory_x); |
838 | 973 | ||
839 | int set_memory_nx(unsigned long addr, int numpages) | 974 | int set_memory_nx(unsigned long addr, int numpages) |
840 | { | 975 | { |
841 | return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); | 976 | return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); |
842 | } | 977 | } |
843 | EXPORT_SYMBOL(set_memory_nx); | 978 | EXPORT_SYMBOL(set_memory_nx); |
844 | 979 | ||
845 | int set_memory_ro(unsigned long addr, int numpages) | 980 | int set_memory_ro(unsigned long addr, int numpages) |
846 | { | 981 | { |
847 | return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); | 982 | return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); |
848 | } | 983 | } |
984 | EXPORT_SYMBOL_GPL(set_memory_ro); | ||
849 | 985 | ||
850 | int set_memory_rw(unsigned long addr, int numpages) | 986 | int set_memory_rw(unsigned long addr, int numpages) |
851 | { | 987 | { |
852 | return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); | 988 | return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); |
853 | } | 989 | } |
990 | EXPORT_SYMBOL_GPL(set_memory_rw); | ||
854 | 991 | ||
855 | int set_memory_np(unsigned long addr, int numpages) | 992 | int set_memory_np(unsigned long addr, int numpages) |
856 | { | 993 | { |
857 | return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); | 994 | return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); |
858 | } | 995 | } |
859 | 996 | ||
860 | int set_memory_4k(unsigned long addr, int numpages) | 997 | int set_memory_4k(unsigned long addr, int numpages) |
861 | { | 998 | { |
862 | return change_page_attr_set_clr(addr, numpages, __pgprot(0), | 999 | return change_page_attr_set_clr(&addr, numpages, __pgprot(0), |
863 | __pgprot(0), 1); | 1000 | __pgprot(0), 1, 0); |
864 | } | 1001 | } |
865 | 1002 | ||
866 | int set_pages_uc(struct page *page, int numpages) | 1003 | int set_pages_uc(struct page *page, int numpages) |
@@ -913,22 +1050,38 @@ int set_pages_rw(struct page *page, int numpages) | |||
913 | 1050 | ||
914 | static int __set_pages_p(struct page *page, int numpages) | 1051 | static int __set_pages_p(struct page *page, int numpages) |
915 | { | 1052 | { |
916 | struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), | 1053 | unsigned long tempaddr = (unsigned long) page_address(page); |
1054 | struct cpa_data cpa = { .vaddr = &tempaddr, | ||
917 | .numpages = numpages, | 1055 | .numpages = numpages, |
918 | .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), | 1056 | .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), |
919 | .mask_clr = __pgprot(0)}; | 1057 | .mask_clr = __pgprot(0), |
1058 | .flags = 0}; | ||
920 | 1059 | ||
921 | return __change_page_attr_set_clr(&cpa, 1); | 1060 | /* |
1061 | * No alias checking needed for setting the present flag. Otherwise, | ||
1062 | * we may need to break large pages for 64-bit kernel text | ||
1063 | * mappings (this adds to complexity if we want to do this from | ||
1064 | * atomic context especially). Let's keep it simple! | ||
1065 | */ | ||
1066 | return __change_page_attr_set_clr(&cpa, 0); | ||
922 | } | 1067 | } |
923 | 1068 | ||
924 | static int __set_pages_np(struct page *page, int numpages) | 1069 | static int __set_pages_np(struct page *page, int numpages) |
925 | { | 1070 | { |
926 | struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), | 1071 | unsigned long tempaddr = (unsigned long) page_address(page); |
1072 | struct cpa_data cpa = { .vaddr = &tempaddr, | ||
927 | .numpages = numpages, | 1073 | .numpages = numpages, |
928 | .mask_set = __pgprot(0), | 1074 | .mask_set = __pgprot(0), |
929 | .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; | 1075 | .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), |
1076 | .flags = 0}; | ||
930 | 1077 | ||
931 | return __change_page_attr_set_clr(&cpa, 1); | 1078 | /* |
1079 | * No alias checking needed for setting the not-present flag. Otherwise, | ||
1080 | * we may need to break large pages for 64-bit kernel text | ||
1081 | * mappings (this adds to complexity if we want to do this from | ||
1082 | * atomic context especially). Let's keep it simple! | ||
1083 | */ | ||
1084 | return __change_page_attr_set_clr(&cpa, 0); | ||
932 | } | 1085 | } |
933 | 1086 | ||
934 | void kernel_map_pages(struct page *page, int numpages, int enable) | 1087 | void kernel_map_pages(struct page *page, int numpages, int enable) |
@@ -948,11 +1101,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) | |||
948 | 1101 | ||
949 | /* | 1102 | /* |
950 | * The return value is ignored as the calls cannot fail. | 1103 | * The return value is ignored as the calls cannot fail. |
951 | * Large pages are kept enabled at boot time, and are | 1104 | * Large pages for identity mappings are not used at boot time |
952 | * split up quickly with DEBUG_PAGEALLOC. If a splitup | 1105 | * and hence there is no memory allocation during a large page split. |
953 | * fails here (due to temporary memory shortage) no damage | ||
954 | * is done because we just keep the largepage intact up | ||
955 | * to the next attempt when it will likely be split up: | ||
956 | */ | 1106 | */ |
957 | if (enable) | 1107 | if (enable) |
958 | __set_pages_p(page, numpages); | 1108 | __set_pages_p(page, numpages); |
@@ -964,53 +1114,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) | |||
964 | * but that can deadlock->flush only current cpu: | 1114 | * but that can deadlock->flush only current cpu: |
965 | */ | 1115 | */ |
966 | __flush_tlb_all(); | 1116 | __flush_tlb_all(); |
967 | |||
968 | /* | ||
969 | * Try to refill the page pool here. We can do this only after | ||
970 | * the tlb flush. | ||
971 | */ | ||
972 | cpa_fill_pool(NULL); | ||
973 | } | ||
974 | |||
975 | #ifdef CONFIG_DEBUG_FS | ||
976 | static int dpa_show(struct seq_file *m, void *v) | ||
977 | { | ||
978 | seq_puts(m, "DEBUG_PAGEALLOC\n"); | ||
979 | seq_printf(m, "pool_size : %lu\n", pool_size); | ||
980 | seq_printf(m, "pool_pages : %lu\n", pool_pages); | ||
981 | seq_printf(m, "pool_low : %lu\n", pool_low); | ||
982 | seq_printf(m, "pool_used : %lu\n", pool_used); | ||
983 | seq_printf(m, "pool_failed : %lu\n", pool_failed); | ||
984 | |||
985 | return 0; | ||
986 | } | ||
987 | |||
988 | static int dpa_open(struct inode *inode, struct file *filp) | ||
989 | { | ||
990 | return single_open(filp, dpa_show, NULL); | ||
991 | } | 1117 | } |
992 | 1118 | ||
993 | static const struct file_operations dpa_fops = { | ||
994 | .open = dpa_open, | ||
995 | .read = seq_read, | ||
996 | .llseek = seq_lseek, | ||
997 | .release = single_release, | ||
998 | }; | ||
999 | |||
1000 | static int __init debug_pagealloc_proc_init(void) | ||
1001 | { | ||
1002 | struct dentry *de; | ||
1003 | |||
1004 | de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL, | ||
1005 | &dpa_fops); | ||
1006 | if (!de) | ||
1007 | return -ENOMEM; | ||
1008 | |||
1009 | return 0; | ||
1010 | } | ||
1011 | __initcall(debug_pagealloc_proc_init); | ||
1012 | #endif | ||
1013 | |||
1014 | #ifdef CONFIG_HIBERNATION | 1119 | #ifdef CONFIG_HIBERNATION |
1015 | 1120 | ||
1016 | bool kernel_page_present(struct page *page) | 1121 | bool kernel_page_present(struct page *page) |
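The pageattr.c hunks above add an array-based interface, set_memory_array_uc()/set_memory_array_wb(), driven by the new CPA_ARRAY flag. The following sketch only illustrates how a caller might use it; the helper names, the static page array and NR_EXAMPLE_PAGES are hypothetical, and only the two set_memory_array_*() calls come from this patch.

#include <linux/gfp.h>
#include <linux/mm.h>
#include <asm/cacheflush.h>

#define NR_EXAMPLE_PAGES 4

static struct page *example_pages[NR_EXAMPLE_PAGES];
static unsigned long example_vaddrs[NR_EXAMPLE_PAGES];

/* Map a handful of scattered pages UC_MINUS with a single CPA call. */
static int example_map_uncached(void)
{
	int i, ret = -ENOMEM;

	for (i = 0; i < NR_EXAMPLE_PAGES; i++) {
		example_pages[i] = alloc_page(GFP_KERNEL);
		if (!example_pages[i])
			goto out_free;
		example_vaddrs[i] = (unsigned long)page_address(example_pages[i]);
	}

	/*
	 * One call covers the whole array: reserve_memtype() is done per
	 * physically contiguous run, and the CPA core walks the array
	 * page by page with CPA_ARRAY set.
	 */
	ret = set_memory_array_uc(example_vaddrs, NR_EXAMPLE_PAGES);
	if (ret)
		goto out_free;
	return 0;

out_free:
	while (--i >= 0)
		__free_page(example_pages[i]);
	return ret;
}

static void example_unmap(void)
{
	int i;

	/* Restore write-back before returning the pages to the allocator. */
	set_memory_array_wb(example_vaddrs, NR_EXAMPLE_PAGES);
	for (i = 0; i < NR_EXAMPLE_PAGES; i++)
		__free_page(example_pages[i]);
}

The benefit of the array variant is visible in change_page_attr_set_clr() above: on clflush-capable CPUs the flush is done per page via cpa_flush_array() rather than one wbinvd-based cpa_flush_all() per call.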
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 06b7a1c90fb8..738fd0f24958 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c | |||
@@ -7,30 +7,32 @@ | |||
7 | * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. | 7 | * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/mm.h> | 10 | #include <linux/seq_file.h> |
11 | #include <linux/bootmem.h> | ||
12 | #include <linux/debugfs.h> | ||
11 | #include <linux/kernel.h> | 13 | #include <linux/kernel.h> |
12 | #include <linux/gfp.h> | 14 | #include <linux/gfp.h> |
15 | #include <linux/mm.h> | ||
13 | #include <linux/fs.h> | 16 | #include <linux/fs.h> |
14 | #include <linux/bootmem.h> | ||
15 | 17 | ||
16 | #include <asm/msr.h> | 18 | #include <asm/cacheflush.h> |
17 | #include <asm/tlbflush.h> | ||
18 | #include <asm/processor.h> | 19 | #include <asm/processor.h> |
19 | #include <asm/page.h> | 20 | #include <asm/tlbflush.h> |
20 | #include <asm/pgtable.h> | 21 | #include <asm/pgtable.h> |
21 | #include <asm/pat.h> | ||
22 | #include <asm/e820.h> | ||
23 | #include <asm/cacheflush.h> | ||
24 | #include <asm/fcntl.h> | 22 | #include <asm/fcntl.h> |
23 | #include <asm/e820.h> | ||
25 | #include <asm/mtrr.h> | 24 | #include <asm/mtrr.h> |
25 | #include <asm/page.h> | ||
26 | #include <asm/msr.h> | ||
27 | #include <asm/pat.h> | ||
26 | #include <asm/io.h> | 28 | #include <asm/io.h> |
27 | 29 | ||
28 | #ifdef CONFIG_X86_PAT | 30 | #ifdef CONFIG_X86_PAT |
29 | int __read_mostly pat_wc_enabled = 1; | 31 | int __read_mostly pat_enabled = 1; |
30 | 32 | ||
31 | void __cpuinit pat_disable(char *reason) | 33 | void __cpuinit pat_disable(char *reason) |
32 | { | 34 | { |
33 | pat_wc_enabled = 0; | 35 | pat_enabled = 0; |
34 | printk(KERN_INFO "%s\n", reason); | 36 | printk(KERN_INFO "%s\n", reason); |
35 | } | 37 | } |
36 | 38 | ||
@@ -42,6 +44,20 @@ static int __init nopat(char *str) | |||
42 | early_param("nopat", nopat); | 44 | early_param("nopat", nopat); |
43 | #endif | 45 | #endif |
44 | 46 | ||
47 | |||
48 | static int debug_enable; | ||
49 | |||
50 | static int __init pat_debug_setup(char *str) | ||
51 | { | ||
52 | debug_enable = 1; | ||
53 | return 0; | ||
54 | } | ||
55 | __setup("debugpat", pat_debug_setup); | ||
56 | |||
57 | #define dprintk(fmt, arg...) \ | ||
58 | do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0) | ||
59 | |||
60 | |||
45 | static u64 __read_mostly boot_pat_state; | 61 | static u64 __read_mostly boot_pat_state; |
46 | 62 | ||
47 | enum { | 63 | enum { |
@@ -53,24 +69,25 @@ enum { | |||
53 | PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ | 69 | PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */ |
54 | }; | 70 | }; |
55 | 71 | ||
56 | #define PAT(x,y) ((u64)PAT_ ## y << ((x)*8)) | 72 | #define PAT(x, y) ((u64)PAT_ ## y << ((x)*8)) |
57 | 73 | ||
58 | void pat_init(void) | 74 | void pat_init(void) |
59 | { | 75 | { |
60 | u64 pat; | 76 | u64 pat; |
61 | 77 | ||
62 | if (!pat_wc_enabled) | 78 | if (!pat_enabled) |
63 | return; | 79 | return; |
64 | 80 | ||
65 | /* Paranoia check. */ | 81 | /* Paranoia check. */ |
66 | if (!cpu_has_pat) { | 82 | if (!cpu_has_pat && boot_pat_state) { |
67 | printk(KERN_ERR "PAT enabled, but CPU feature cleared\n"); | ||
68 | /* | 83 | /* |
69 | * Panic if this happens on the secondary CPU, and we | 84 | * If this happens we are on a secondary CPU, but |
70 | * switched to PAT on the boot CPU. We have no way to | 85 | * switched to PAT on the boot CPU. We have no way to |
71 | * undo PAT. | 86 | * undo PAT. |
72 | */ | 87 | */ |
73 | BUG_ON(boot_pat_state); | 88 | printk(KERN_ERR "PAT enabled, " |
89 | "but not supported by secondary CPU\n"); | ||
90 | BUG(); | ||
74 | } | 91 | } |
75 | 92 | ||
76 | /* Set PWT to Write-Combining. All other bits stay the same */ | 93 | /* Set PWT to Write-Combining. All other bits stay the same */ |
@@ -86,8 +103,8 @@ void pat_init(void) | |||
86 | * 011 UC _PAGE_CACHE_UC | 103 | * 011 UC _PAGE_CACHE_UC |
87 | * PAT bit unused | 104 | * PAT bit unused |
88 | */ | 105 | */ |
89 | pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) | | 106 | pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | |
90 | PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC); | 107 | PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC); |
91 | 108 | ||
92 | /* Boot CPU check */ | 109 | /* Boot CPU check */ |
93 | if (!boot_pat_state) | 110 | if (!boot_pat_state) |
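As a worked example of the PAT() encoding in the hunk above -- assuming the encodings from the enum earlier in this file (PAT_UC = 0, PAT_WC = 1, PAT_WB = 6; only PAT_UC_MINUS = 7 is visible in this section) -- each entry occupies one byte of the IA32_PAT MSR, so the value written by pat_init() works out to:

	pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
	      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
	/*
	 *     = 0x06 | (0x01 << 8) | (0x07 << 16) | (0x00 << 24) | ...
	 *     = 0x0007010600070106ULL
	 *
	 * Entries 1 and 5 (the ones selected when PWT is set) now mean
	 * write-combining; all other entries keep their usual meaning.
	 */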
@@ -103,11 +120,11 @@ void pat_init(void) | |||
103 | static char *cattr_name(unsigned long flags) | 120 | static char *cattr_name(unsigned long flags) |
104 | { | 121 | { |
105 | switch (flags & _PAGE_CACHE_MASK) { | 122 | switch (flags & _PAGE_CACHE_MASK) { |
106 | case _PAGE_CACHE_UC: return "uncached"; | 123 | case _PAGE_CACHE_UC: return "uncached"; |
107 | case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; | 124 | case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; |
108 | case _PAGE_CACHE_WB: return "write-back"; | 125 | case _PAGE_CACHE_WB: return "write-back"; |
109 | case _PAGE_CACHE_WC: return "write-combining"; | 126 | case _PAGE_CACHE_WC: return "write-combining"; |
110 | default: return "broken"; | 127 | default: return "broken"; |
111 | } | 128 | } |
112 | } | 129 | } |
113 | 130 | ||
@@ -129,14 +146,14 @@ static char *cattr_name(unsigned long flags) | |||
129 | */ | 146 | */ |
130 | 147 | ||
131 | struct memtype { | 148 | struct memtype { |
132 | u64 start; | 149 | u64 start; |
133 | u64 end; | 150 | u64 end; |
134 | unsigned long type; | 151 | unsigned long type; |
135 | struct list_head nd; | 152 | struct list_head nd; |
136 | }; | 153 | }; |
137 | 154 | ||
138 | static LIST_HEAD(memtype_list); | 155 | static LIST_HEAD(memtype_list); |
139 | static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ | 156 | static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ |
140 | 157 | ||
141 | /* | 158 | /* |
142 | * Does intersection of PAT memory type and MTRR memory type and returns | 159 | * Does intersection of PAT memory type and MTRR memory type and returns |
@@ -145,47 +162,113 @@ static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ | |||
145 | * The intersection is based on "Effective Memory Type" tables in IA-32 | 162 | * The intersection is based on "Effective Memory Type" tables in IA-32 |
146 | * SDM vol 3a | 163 | * SDM vol 3a |
147 | */ | 164 | */ |
148 | static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, | 165 | static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) |
149 | unsigned long *ret_prot) | ||
150 | { | 166 | { |
151 | unsigned long pat_type; | ||
152 | u8 mtrr_type; | ||
153 | |||
154 | pat_type = prot & _PAGE_CACHE_MASK; | ||
155 | prot &= (~_PAGE_CACHE_MASK); | ||
156 | |||
157 | /* | ||
158 | * We return the PAT request directly for types where PAT takes | ||
159 | * precedence with respect to MTRR and for UC_MINUS. | ||
160 | * Consistency checks with other PAT requests is done later | ||
161 | * while going through memtype list. | ||
162 | */ | ||
163 | if (pat_type == _PAGE_CACHE_WC) { | ||
164 | *ret_prot = prot | _PAGE_CACHE_WC; | ||
165 | return 0; | ||
166 | } else if (pat_type == _PAGE_CACHE_UC_MINUS) { | ||
167 | *ret_prot = prot | _PAGE_CACHE_UC_MINUS; | ||
168 | return 0; | ||
169 | } else if (pat_type == _PAGE_CACHE_UC) { | ||
170 | *ret_prot = prot | _PAGE_CACHE_UC; | ||
171 | return 0; | ||
172 | } | ||
173 | |||
174 | /* | 167 | /* |
175 | * Look for MTRR hint to get the effective type in case where PAT | 168 | * Look for MTRR hint to get the effective type in case where PAT |
176 | * request is for WB. | 169 | * request is for WB. |
177 | */ | 170 | */ |
178 | mtrr_type = mtrr_type_lookup(start, end); | 171 | if (req_type == _PAGE_CACHE_WB) { |
172 | u8 mtrr_type; | ||
173 | |||
174 | mtrr_type = mtrr_type_lookup(start, end); | ||
175 | if (mtrr_type == MTRR_TYPE_UNCACHABLE) | ||
176 | return _PAGE_CACHE_UC; | ||
177 | if (mtrr_type == MTRR_TYPE_WRCOMB) | ||
178 | return _PAGE_CACHE_WC; | ||
179 | } | ||
179 | 180 | ||
180 | if (mtrr_type == MTRR_TYPE_UNCACHABLE) { | 181 | return req_type; |
181 | *ret_prot = prot | _PAGE_CACHE_UC; | 182 | } |
182 | } else if (mtrr_type == MTRR_TYPE_WRCOMB) { | 183 | |
183 | *ret_prot = prot | _PAGE_CACHE_WC; | 184 | static int |
184 | } else { | 185 | chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) |
185 | *ret_prot = prot | _PAGE_CACHE_WB; | 186 | { |
187 | if (new->type != entry->type) { | ||
188 | if (type) { | ||
189 | new->type = entry->type; | ||
190 | *type = entry->type; | ||
191 | } else | ||
192 | goto conflict; | ||
186 | } | 193 | } |
187 | 194 | ||
195 | /* check overlaps with more than one entry in the list */ | ||
196 | list_for_each_entry_continue(entry, &memtype_list, nd) { | ||
197 | if (new->end <= entry->start) | ||
198 | break; | ||
199 | else if (new->type != entry->type) | ||
200 | goto conflict; | ||
201 | } | ||
188 | return 0; | 202 | return 0; |
203 | |||
204 | conflict: | ||
205 | printk(KERN_INFO "%s:%d conflicting memory types " | ||
206 | "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start, | ||
207 | new->end, cattr_name(new->type), cattr_name(entry->type)); | ||
208 | return -EBUSY; | ||
209 | } | ||
210 | |||
211 | static struct memtype *cached_entry; | ||
212 | static u64 cached_start; | ||
213 | |||
214 | /* | ||
215 | * For RAM pages, mark the pages as non-WB memory type using | ||
216 | * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or | ||
217 | * set_memory_wc() on a RAM page at a time before marking it as WB again. | ||
218 | * This is OK, because only one driver will own the page and | ||
219 | * do the set_memory_*() calls. | ||
220 | * | ||
221 | * For now, we use PageNonWB to track that the RAM page is being mapped | ||
222 | * as non-WB. In the future, we will have to use one more flag | ||
223 | * (or some other mechanism in struct page) to distinguish between | ||
224 | * UC and WC mappings. | ||
225 | */ | ||
226 | static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, | ||
227 | unsigned long *new_type) | ||
228 | { | ||
229 | struct page *page; | ||
230 | u64 pfn, end_pfn; | ||
231 | |||
232 | for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { | ||
233 | page = pfn_to_page(pfn); | ||
234 | if (page_mapped(page) || PageNonWB(page)) | ||
235 | goto out; | ||
236 | |||
237 | SetPageNonWB(page); | ||
238 | } | ||
239 | return 0; | ||
240 | |||
241 | out: | ||
242 | end_pfn = pfn; | ||
243 | for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { | ||
244 | page = pfn_to_page(pfn); | ||
245 | ClearPageNonWB(page); | ||
246 | } | ||
247 | |||
248 | return -EINVAL; | ||
249 | } | ||
250 | |||
251 | static int free_ram_pages_type(u64 start, u64 end) | ||
252 | { | ||
253 | struct page *page; | ||
254 | u64 pfn, end_pfn; | ||
255 | |||
256 | for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { | ||
257 | page = pfn_to_page(pfn); | ||
258 | if (page_mapped(page) || !PageNonWB(page)) | ||
259 | goto out; | ||
260 | |||
261 | ClearPageNonWB(page); | ||
262 | } | ||
263 | return 0; | ||
264 | |||
265 | out: | ||
266 | end_pfn = pfn; | ||
267 | for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { | ||
268 | page = pfn_to_page(pfn); | ||
269 | SetPageNonWB(page); | ||
270 | } | ||
271 | return -EINVAL; | ||
189 | } | 272 | } |
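For quick reference, the simplified pat_x_mtrr_type() above reduces to the following cases; this is only an illustrative restatement of the branches in this hunk, not new behaviour:

	/*
	 *   req_type               MTRR lookup     effective type returned
	 *   --------------------   -------------   -----------------------
	 *   _PAGE_CACHE_WB         UNCACHABLE      _PAGE_CACHE_UC
	 *   _PAGE_CACHE_WB         WRCOMB          _PAGE_CACHE_WC
	 *   _PAGE_CACHE_WB         anything else   _PAGE_CACHE_WB
	 *   _PAGE_CACHE_WC         not consulted   _PAGE_CACHE_WC
	 *   _PAGE_CACHE_UC_MINUS   not consulted   _PAGE_CACHE_UC_MINUS
	 *   _PAGE_CACHE_UC         not consulted   _PAGE_CACHE_UC
	 */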
190 | 273 | ||
191 | /* | 274 | /* |
@@ -198,37 +281,37 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, | |||
198 | * req_type will have a special case value '-1', when the requester wants to inherit | 281 | * req_type will have a special case value '-1', when the requester wants to inherit |
199 | * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. | 282 | * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. |
200 | * | 283 | * |
201 | * If ret_type is NULL, function will return an error if it cannot reserve the | 284 | * If new_type is NULL, function will return an error if it cannot reserve the |
202 | * region with req_type. If ret_type is non-null, function will return | 285 | * region with req_type. If new_type is non-NULL, function will return |
203 | * available type in ret_type in case of no error. In case of any error | 286 | * available type in new_type in case of no error. In case of any error |
204 | * it will return a negative return value. | 287 | * it will return a negative return value. |
205 | */ | 288 | */ |
206 | int reserve_memtype(u64 start, u64 end, unsigned long req_type, | 289 | int reserve_memtype(u64 start, u64 end, unsigned long req_type, |
207 | unsigned long *ret_type) | 290 | unsigned long *new_type) |
208 | { | 291 | { |
209 | struct memtype *new_entry = NULL; | 292 | struct memtype *new, *entry; |
210 | struct memtype *parse; | ||
211 | unsigned long actual_type; | 293 | unsigned long actual_type; |
294 | struct list_head *where; | ||
295 | int is_range_ram; | ||
212 | int err = 0; | 296 | int err = 0; |
213 | 297 | ||
214 | /* Only track when pat_wc_enabled */ | 298 | BUG_ON(start >= end); /* end is exclusive */ |
215 | if (!pat_wc_enabled) { | 299 | |
300 | if (!pat_enabled) { | ||
216 | /* This is identical to page table setting without PAT */ | 301 | /* This is identical to page table setting without PAT */ |
217 | if (ret_type) { | 302 | if (new_type) { |
218 | if (req_type == -1) { | 303 | if (req_type == -1) |
219 | *ret_type = _PAGE_CACHE_WB; | 304 | *new_type = _PAGE_CACHE_WB; |
220 | } else { | 305 | else |
221 | *ret_type = req_type; | 306 | *new_type = req_type & _PAGE_CACHE_MASK; |
222 | } | ||
223 | } | 307 | } |
224 | return 0; | 308 | return 0; |
225 | } | 309 | } |
226 | 310 | ||
227 | /* Low ISA region is always mapped WB in page table. No need to track */ | 311 | /* Low ISA region is always mapped WB in page table. No need to track */ |
228 | if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) { | 312 | if (is_ISA_range(start, end - 1)) { |
229 | if (ret_type) | 313 | if (new_type) |
230 | *ret_type = _PAGE_CACHE_WB; | 314 | *new_type = _PAGE_CACHE_WB; |
231 | |||
232 | return 0; | 315 | return 0; |
233 | } | 316 | } |
234 | 317 | ||
@@ -241,206 +324,133 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, | |||
241 | */ | 324 | */ |
242 | u8 mtrr_type = mtrr_type_lookup(start, end); | 325 | u8 mtrr_type = mtrr_type_lookup(start, end); |
243 | 326 | ||
244 | if (mtrr_type == MTRR_TYPE_WRBACK) { | 327 | if (mtrr_type == MTRR_TYPE_WRBACK) |
245 | req_type = _PAGE_CACHE_WB; | ||
246 | actual_type = _PAGE_CACHE_WB; | 328 | actual_type = _PAGE_CACHE_WB; |
247 | } else { | 329 | else |
248 | req_type = _PAGE_CACHE_UC_MINUS; | ||
249 | actual_type = _PAGE_CACHE_UC_MINUS; | 330 | actual_type = _PAGE_CACHE_UC_MINUS; |
250 | } | ||
251 | } else { | 331 | } else { |
252 | req_type &= _PAGE_CACHE_MASK; | 332 | actual_type = pat_x_mtrr_type(start, end, |
253 | err = pat_x_mtrr_type(start, end, req_type, &actual_type); | 333 | req_type & _PAGE_CACHE_MASK); |
254 | } | 334 | } |
255 | 335 | ||
256 | if (err) { | 336 | is_range_ram = pagerange_is_ram(start, end); |
257 | if (ret_type) | 337 | if (is_range_ram == 1) |
258 | *ret_type = actual_type; | 338 | return reserve_ram_pages_type(start, end, req_type, new_type); |
259 | 339 | else if (is_range_ram < 0) | |
260 | return -EINVAL; | 340 | return -EINVAL; |
261 | } | ||
262 | 341 | ||
263 | new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); | 342 | new = kmalloc(sizeof(struct memtype), GFP_KERNEL); |
264 | if (!new_entry) | 343 | if (!new) |
265 | return -ENOMEM; | 344 | return -ENOMEM; |
266 | 345 | ||
267 | new_entry->start = start; | 346 | new->start = start; |
268 | new_entry->end = end; | 347 | new->end = end; |
269 | new_entry->type = actual_type; | 348 | new->type = actual_type; |
270 | 349 | ||
271 | if (ret_type) | 350 | if (new_type) |
272 | *ret_type = actual_type; | 351 | *new_type = actual_type; |
273 | 352 | ||
274 | spin_lock(&memtype_lock); | 353 | spin_lock(&memtype_lock); |
275 | 354 | ||
276 | /* Search for existing mapping that overlaps the current range */ | 355 | if (cached_entry && start >= cached_start) |
277 | list_for_each_entry(parse, &memtype_list, nd) { | 356 | entry = cached_entry; |
278 | struct memtype *saved_ptr; | 357 | else |
358 | entry = list_entry(&memtype_list, struct memtype, nd); | ||
279 | 359 | ||
280 | if (parse->start >= end) { | 360 | /* Search for existing mapping that overlaps the current range */ |
281 | pr_debug("New Entry\n"); | 361 | where = NULL; |
282 | list_add(&new_entry->nd, parse->nd.prev); | 362 | list_for_each_entry_continue(entry, &memtype_list, nd) { |
283 | new_entry = NULL; | 363 | if (end <= entry->start) { |
364 | where = entry->nd.prev; | ||
365 | cached_entry = list_entry(where, struct memtype, nd); | ||
284 | break; | 366 | break; |
285 | } | 367 | } else if (start <= entry->start) { /* end > entry->start */ |
286 | 368 | err = chk_conflict(new, entry, new_type); | |
287 | if (start <= parse->start && end >= parse->start) { | 369 | if (!err) { |
288 | if (actual_type != parse->type && ret_type) { | 370 | dprintk("Overlap at 0x%Lx-0x%Lx\n", |
289 | actual_type = parse->type; | 371 | entry->start, entry->end); |
290 | *ret_type = actual_type; | 372 | where = entry->nd.prev; |
291 | new_entry->type = actual_type; | 373 | cached_entry = list_entry(where, |
292 | } | 374 | struct memtype, nd); |
293 | |||
294 | if (actual_type != parse->type) { | ||
295 | printk( | ||
296 | KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", | ||
297 | current->comm, current->pid, | ||
298 | start, end, | ||
299 | cattr_name(actual_type), | ||
300 | cattr_name(parse->type)); | ||
301 | err = -EBUSY; | ||
302 | break; | ||
303 | } | 375 | } |
304 | |||
305 | saved_ptr = parse; | ||
306 | /* | ||
307 | * Check to see whether the request overlaps more | ||
308 | * than one entry in the list | ||
309 | */ | ||
310 | list_for_each_entry_continue(parse, &memtype_list, nd) { | ||
311 | if (end <= parse->start) { | ||
312 | break; | ||
313 | } | ||
314 | |||
315 | if (actual_type != parse->type) { | ||
316 | printk( | ||
317 | KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", | ||
318 | current->comm, current->pid, | ||
319 | start, end, | ||
320 | cattr_name(actual_type), | ||
321 | cattr_name(parse->type)); | ||
322 | err = -EBUSY; | ||
323 | break; | ||
324 | } | ||
325 | } | ||
326 | |||
327 | if (err) { | ||
328 | break; | ||
329 | } | ||
330 | |||
331 | pr_debug("Overlap at 0x%Lx-0x%Lx\n", | ||
332 | saved_ptr->start, saved_ptr->end); | ||
333 | /* No conflict. Go ahead and add this new entry */ | ||
334 | list_add(&new_entry->nd, saved_ptr->nd.prev); | ||
335 | new_entry = NULL; | ||
336 | break; | 376 | break; |
337 | } | 377 | } else if (start < entry->end) { /* start > entry->start */ |
338 | 378 | err = chk_conflict(new, entry, new_type); | |
339 | if (start < parse->end) { | 379 | if (!err) { |
340 | if (actual_type != parse->type && ret_type) { | 380 | dprintk("Overlap at 0x%Lx-0x%Lx\n", |
341 | actual_type = parse->type; | 381 | entry->start, entry->end); |
342 | *ret_type = actual_type; | 382 | cached_entry = list_entry(entry->nd.prev, |
343 | new_entry->type = actual_type; | 383 | struct memtype, nd); |
344 | } | 384 | |
345 | 385 | /* | |
346 | if (actual_type != parse->type) { | 386 | * Move to right position in the linked |
347 | printk( | 387 | * list to add this new entry |
348 | KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", | 388 | */ |
349 | current->comm, current->pid, | 389 | list_for_each_entry_continue(entry, |
350 | start, end, | 390 | &memtype_list, nd) { |
351 | cattr_name(actual_type), | 391 | if (start <= entry->start) { |
352 | cattr_name(parse->type)); | 392 | where = entry->nd.prev; |
353 | err = -EBUSY; | 393 | break; |
354 | break; | 394 | } |
355 | } | ||
356 | |||
357 | saved_ptr = parse; | ||
358 | /* | ||
359 | * Check to see whether the request overlaps more | ||
360 | * than one entry in the list | ||
361 | */ | ||
362 | list_for_each_entry_continue(parse, &memtype_list, nd) { | ||
363 | if (end <= parse->start) { | ||
364 | break; | ||
365 | } | ||
366 | |||
367 | if (actual_type != parse->type) { | ||
368 | printk( | ||
369 | KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n", | ||
370 | current->comm, current->pid, | ||
371 | start, end, | ||
372 | cattr_name(actual_type), | ||
373 | cattr_name(parse->type)); | ||
374 | err = -EBUSY; | ||
375 | break; | ||
376 | } | 395 | } |
377 | } | 396 | } |
378 | |||
379 | if (err) { | ||
380 | break; | ||
381 | } | ||
382 | |||
383 | pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n", | ||
384 | saved_ptr->start, saved_ptr->end); | ||
385 | /* No conflict. Go ahead and add this new entry */ | ||
386 | list_add(&new_entry->nd, &saved_ptr->nd); | ||
387 | new_entry = NULL; | ||
388 | break; | 397 | break; |
389 | } | 398 | } |
390 | } | 399 | } |
391 | 400 | ||
392 | if (err) { | 401 | if (err) { |
393 | printk(KERN_INFO | 402 | printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " |
394 | "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", | 403 | "track %s, req %s\n", |
395 | start, end, cattr_name(new_entry->type), | 404 | start, end, cattr_name(new->type), cattr_name(req_type)); |
396 | cattr_name(req_type)); | 405 | kfree(new); |
397 | kfree(new_entry); | ||
398 | spin_unlock(&memtype_lock); | 406 | spin_unlock(&memtype_lock); |
407 | |||
399 | return err; | 408 | return err; |
400 | } | 409 | } |
401 | 410 | ||
402 | if (new_entry) { | 411 | cached_start = start; |
403 | /* No conflict. Not yet added to the list. Add to the tail */ | ||
404 | list_add_tail(&new_entry->nd, &memtype_list); | ||
405 | pr_debug("New Entry\n"); | ||
406 | } | ||
407 | 412 | ||
408 | if (ret_type) { | 413 | if (where) |
409 | pr_debug( | 414 | list_add(&new->nd, where); |
410 | "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", | 415 | else |
411 | start, end, cattr_name(actual_type), | 416 | list_add_tail(&new->nd, &memtype_list); |
412 | cattr_name(req_type), cattr_name(*ret_type)); | ||
413 | } else { | ||
414 | pr_debug( | ||
415 | "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", | ||
416 | start, end, cattr_name(actual_type), | ||
417 | cattr_name(req_type)); | ||
418 | } | ||
419 | 417 | ||
420 | spin_unlock(&memtype_lock); | 418 | spin_unlock(&memtype_lock); |
419 | |||
420 | dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", | ||
421 | start, end, cattr_name(new->type), cattr_name(req_type), | ||
422 | new_type ? cattr_name(*new_type) : "-"); | ||
423 | |||
421 | return err; | 424 | return err; |
422 | } | 425 | } |
423 | 426 | ||
424 | int free_memtype(u64 start, u64 end) | 427 | int free_memtype(u64 start, u64 end) |
425 | { | 428 | { |
426 | struct memtype *ml; | 429 | struct memtype *entry; |
427 | int err = -EINVAL; | 430 | int err = -EINVAL; |
431 | int is_range_ram; | ||
428 | 432 | ||
429 | /* Only track when pat_wc_enabled */ | 433 | if (!pat_enabled) |
430 | if (!pat_wc_enabled) { | ||
431 | return 0; | 434 | return 0; |
432 | } | ||
433 | 435 | ||
434 | /* Low ISA region is always mapped WB. No need to track */ | 436 | /* Low ISA region is always mapped WB. No need to track */ |
435 | if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) { | 437 | if (is_ISA_range(start, end - 1)) |
436 | return 0; | 438 | return 0; |
437 | } | 439 | |
440 | is_range_ram = pagerange_is_ram(start, end); | ||
441 | if (is_range_ram == 1) | ||
442 | return free_ram_pages_type(start, end); | ||
443 | else if (is_range_ram < 0) | ||
444 | return -EINVAL; | ||
438 | 445 | ||
439 | spin_lock(&memtype_lock); | 446 | spin_lock(&memtype_lock); |
440 | list_for_each_entry(ml, &memtype_list, nd) { | 447 | list_for_each_entry(entry, &memtype_list, nd) { |
441 | if (ml->start == start && ml->end == end) { | 448 | if (entry->start == start && entry->end == end) { |
442 | list_del(&ml->nd); | 449 | if (cached_entry == entry || cached_start == start) |
443 | kfree(ml); | 450 | cached_entry = NULL; |
451 | |||
452 | list_del(&entry->nd); | ||
453 | kfree(entry); | ||
444 | err = 0; | 454 | err = 0; |
445 | break; | 455 | break; |
446 | } | 456 | } |
@@ -452,27 +462,20 @@ int free_memtype(u64 start, u64 end) | |||
452 | current->comm, current->pid, start, end); | 462 | current->comm, current->pid, start, end); |
453 | } | 463 | } |
454 | 464 | ||
455 | pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); | 465 | dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); |
466 | |||
456 | return err; | 467 | return err; |
457 | } | 468 | } |
458 | 469 | ||
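A minimal, hypothetical caller of the reworked reserve_memtype()/free_memtype() pair; the helper name and its arguments are made up, but the reserve-check-free pattern follows the semantics of the code above, where *new_type reports the type actually granted (declarations assumed from asm/pat.h and asm/pgtable.h):

static int example_claim_wc(u64 base, unsigned long size)
{
	unsigned long got;
	int ret;

	/* Ask for write-combining; 'got' may be downgraded on overlap. */
	ret = reserve_memtype(base, base + size, _PAGE_CACHE_WC, &got);
	if (ret)
		return ret;	/* conflicting overlap, typically -EBUSY */

	if (got != _PAGE_CACHE_WC) {
		/* An existing mapping forced a different type: back out. */
		free_memtype(base, base + size);
		return -EBUSY;
	}

	/* ... ioremap_wc()-style mapping of [base, base + size) here ... */
	return 0;
}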
459 | 470 | ||
460 | /* | ||
461 | * /dev/mem mmap interface. The memtype used for mapping varies: | ||
462 | * - Use UC for mappings with O_SYNC flag | ||
463 | * - Without O_SYNC flag, if there is any conflict in reserve_memtype, | ||
464 | * inherit the memtype from existing mapping. | ||
465 | * - Else use UC_MINUS memtype (for backward compatibility with existing | ||
466 | * X drivers. | ||
467 | */ | ||
468 | pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, | 471 | pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, |
469 | unsigned long size, pgprot_t vma_prot) | 472 | unsigned long size, pgprot_t vma_prot) |
470 | { | 473 | { |
471 | return vma_prot; | 474 | return vma_prot; |
472 | } | 475 | } |
473 | 476 | ||
474 | #ifdef CONFIG_NONPROMISC_DEVMEM | 477 | #ifdef CONFIG_STRICT_DEVMEM |
475 | /* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ | 478 | /* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/ |
476 | static inline int range_is_allowed(unsigned long pfn, unsigned long size) | 479 | static inline int range_is_allowed(unsigned long pfn, unsigned long size) |
477 | { | 480 | { |
478 | return 1; | 481 | return 1; |
@@ -496,20 +499,20 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) | |||
496 | } | 499 | } |
497 | return 1; | 500 | return 1; |
498 | } | 501 | } |
499 | #endif /* CONFIG_NONPROMISC_DEVMEM */ | 502 | #endif /* CONFIG_STRICT_DEVMEM */ |
500 | 503 | ||
501 | int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, | 504 | int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, |
502 | unsigned long size, pgprot_t *vma_prot) | 505 | unsigned long size, pgprot_t *vma_prot) |
503 | { | 506 | { |
504 | u64 offset = ((u64) pfn) << PAGE_SHIFT; | 507 | u64 offset = ((u64) pfn) << PAGE_SHIFT; |
505 | unsigned long flags = _PAGE_CACHE_UC_MINUS; | 508 | unsigned long flags = -1; |
506 | int retval; | 509 | int retval; |
507 | 510 | ||
508 | if (!range_is_allowed(pfn, size)) | 511 | if (!range_is_allowed(pfn, size)) |
509 | return 0; | 512 | return 0; |
510 | 513 | ||
511 | if (file->f_flags & O_SYNC) { | 514 | if (file->f_flags & O_SYNC) { |
512 | flags = _PAGE_CACHE_UC; | 515 | flags = _PAGE_CACHE_UC_MINUS; |
513 | } | 516 | } |
514 | 517 | ||
515 | #ifdef CONFIG_X86_32 | 518 | #ifdef CONFIG_X86_32 |
@@ -521,24 +524,25 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, | |||
521 | * caching for the high addresses through the KEN pin, but | 524 | * caching for the high addresses through the KEN pin, but |
522 | * we maintain the tradition of paranoia in this code. | 525 | * we maintain the tradition of paranoia in this code. |
523 | */ | 526 | */ |
524 | if (!pat_wc_enabled && | 527 | if (!pat_enabled && |
525 | ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || | 528 | !(boot_cpu_has(X86_FEATURE_MTRR) || |
526 | test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || | 529 | boot_cpu_has(X86_FEATURE_K6_MTRR) || |
527 | test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || | 530 | boot_cpu_has(X86_FEATURE_CYRIX_ARR) || |
528 | test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && | 531 | boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) && |
529 | (pfn << PAGE_SHIFT) >= __pa(high_memory)) { | 532 | (pfn << PAGE_SHIFT) >= __pa(high_memory)) { |
530 | flags = _PAGE_CACHE_UC; | 533 | flags = _PAGE_CACHE_UC; |
531 | } | 534 | } |
532 | #endif | 535 | #endif |
533 | 536 | ||
534 | /* | 537 | /* |
535 | * With O_SYNC, we can only take UC mapping. Fail if we cannot. | 538 | * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot. |
539 | * | ||
536 | * Without O_SYNC, we want to get | 540 | * Without O_SYNC, we want to get |
537 | * - WB for WB-able memory and no other conflicting mappings | 541 | * - WB for WB-able memory and no other conflicting mappings |
538 | * - UC_MINUS for non-WB-able memory with no other conflicting mappings | 542 | * - UC_MINUS for non-WB-able memory with no other conflicting mappings |
539 | * - Inherit from conflicting mappings otherwise | 543 | * - Inherit from conflicting mappings otherwise |
540 | */ | 544 | */ |
541 | if (flags != _PAGE_CACHE_UC_MINUS) { | 545 | if (flags != -1) { |
542 | retval = reserve_memtype(offset, offset + size, flags, NULL); | 546 | retval = reserve_memtype(offset, offset + size, flags, NULL); |
543 | } else { | 547 | } else { |
544 | retval = reserve_memtype(offset, offset + size, -1, &flags); | 548 | retval = reserve_memtype(offset, offset + size, -1, &flags); |
@@ -547,8 +551,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, | |||
547 | if (retval < 0) | 551 | if (retval < 0) |
548 | return 0; | 552 | return 0; |
549 | 553 | ||
550 | if (pfn <= max_pfn_mapped && | 554 | if (((pfn < max_low_pfn_mapped) || |
551 | ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { | 555 | (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) && |
556 | ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { | ||
552 | free_memtype(offset, offset + size); | 557 | free_memtype(offset, offset + size); |
553 | printk(KERN_INFO | 558 | printk(KERN_INFO |
554 | "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", | 559 | "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", |
@@ -565,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, | |||
565 | 570 | ||
566 | void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) | 571 | void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) |
567 | { | 572 | { |
573 | unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); | ||
568 | u64 addr = (u64)pfn << PAGE_SHIFT; | 574 | u64 addr = (u64)pfn << PAGE_SHIFT; |
569 | unsigned long flags; | 575 | unsigned long flags; |
570 | unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); | ||
571 | 576 | ||
572 | reserve_memtype(addr, addr + size, want_flags, &flags); | 577 | reserve_memtype(addr, addr + size, want_flags, &flags); |
573 | if (flags != want_flags) { | 578 | if (flags != want_flags) { |
@@ -587,3 +592,90 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) | |||
587 | free_memtype(addr, addr + size); | 592 | free_memtype(addr, addr + size); |
588 | } | 593 | } |
589 | 594 | ||
595 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) | ||
596 | |||
597 | /* get Nth element of the linked list */ | ||
598 | static struct memtype *memtype_get_idx(loff_t pos) | ||
599 | { | ||
600 | struct memtype *list_node, *print_entry; | ||
601 | int i = 1; | ||
602 | |||
603 | print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); | ||
604 | if (!print_entry) | ||
605 | return NULL; | ||
606 | |||
607 | spin_lock(&memtype_lock); | ||
608 | list_for_each_entry(list_node, &memtype_list, nd) { | ||
609 | if (pos == i) { | ||
610 | *print_entry = *list_node; | ||
611 | spin_unlock(&memtype_lock); | ||
612 | return print_entry; | ||
613 | } | ||
614 | ++i; | ||
615 | } | ||
616 | spin_unlock(&memtype_lock); | ||
617 | kfree(print_entry); | ||
618 | |||
619 | return NULL; | ||
620 | } | ||
621 | |||
622 | static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) | ||
623 | { | ||
624 | if (*pos == 0) { | ||
625 | ++*pos; | ||
626 | seq_printf(seq, "PAT memtype list:\n"); | ||
627 | } | ||
628 | |||
629 | return memtype_get_idx(*pos); | ||
630 | } | ||
631 | |||
632 | static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
633 | { | ||
634 | ++*pos; | ||
635 | return memtype_get_idx(*pos); | ||
636 | } | ||
637 | |||
638 | static void memtype_seq_stop(struct seq_file *seq, void *v) | ||
639 | { | ||
640 | } | ||
641 | |||
642 | static int memtype_seq_show(struct seq_file *seq, void *v) | ||
643 | { | ||
644 | struct memtype *print_entry = (struct memtype *)v; | ||
645 | |||
646 | seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), | ||
647 | print_entry->start, print_entry->end); | ||
648 | kfree(print_entry); | ||
649 | |||
650 | return 0; | ||
651 | } | ||
652 | |||
653 | static struct seq_operations memtype_seq_ops = { | ||
654 | .start = memtype_seq_start, | ||
655 | .next = memtype_seq_next, | ||
656 | .stop = memtype_seq_stop, | ||
657 | .show = memtype_seq_show, | ||
658 | }; | ||
659 | |||
660 | static int memtype_seq_open(struct inode *inode, struct file *file) | ||
661 | { | ||
662 | return seq_open(file, &memtype_seq_ops); | ||
663 | } | ||
664 | |||
665 | static const struct file_operations memtype_fops = { | ||
666 | .open = memtype_seq_open, | ||
667 | .read = seq_read, | ||
668 | .llseek = seq_lseek, | ||
669 | .release = seq_release, | ||
670 | }; | ||
671 | |||
672 | static int __init pat_memtype_list_init(void) | ||
673 | { | ||
674 | debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, | ||
675 | NULL, &memtype_fops); | ||
676 | return 0; | ||
677 | } | ||
678 | |||
679 | late_initcall(pat_memtype_list_init); | ||
680 | |||
681 | #endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */ | ||
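The new debugfs dumper deliberately copies each memtype entry into a kmalloc()ed buffer while memtype_lock is held and prints it only after the lock is dropped, so seq_printf() never runs under the spinlock. With CONFIG_X86_PAT and CONFIG_DEBUG_FS enabled, the list can then be inspected from userspace; assuming debugfs is mounted in the usual place, something like `cat /sys/kernel/debug/x86/pat_memtype_list` should print one "<type> @ 0xstart-0xend" line per tracked range, in list order (the mount point is a convention, not part of this patch).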
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c new file mode 100644 index 000000000000..efa1911e20ca --- /dev/null +++ b/arch/x86/mm/pf_in.c | |||
@@ -0,0 +1,489 @@ | |||
1 | /* | ||
2 | * Fault Injection Test harness (FI) | ||
3 | * Copyright (C) Intel Corp. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License | ||
7 | * as published by the Free Software Foundation; either version 2 | ||
8 | * of the License, or (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | ||
18 | * USA. | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | /* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp | ||
23 | * Copyright by Intel Corp., 2002 | ||
24 | * Louis Zhuang (louis.zhuang@intel.com) | ||
25 | * | ||
26 | * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007 | ||
27 | */ | ||
28 | |||
29 | #include <linux/module.h> | ||
30 | #include <linux/ptrace.h> /* struct pt_regs */ | ||
31 | #include "pf_in.h" | ||
32 | |||
33 | #ifdef __i386__ | ||
34 | /* IA32 Manual 3, 2-1 */ | ||
35 | static unsigned char prefix_codes[] = { | ||
36 | 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, | ||
37 | 0x65, 0x2E, 0x3E, 0x66, 0x67 | ||
38 | }; | ||
39 | /* IA32 Manual 3, 3-432*/ | ||
40 | static unsigned int reg_rop[] = { | ||
41 | 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F | ||
42 | }; | ||
43 | static unsigned int reg_wop[] = { 0x88, 0x89 }; | ||
44 | static unsigned int imm_wop[] = { 0xC6, 0xC7 }; | ||
45 | /* IA32 Manual 3, 3-432*/ | ||
46 | static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; | ||
47 | static unsigned int rw32[] = { | ||
48 | 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F | ||
49 | }; | ||
50 | static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; | ||
51 | static unsigned int mw16[] = { 0xB70F, 0xBF0F }; | ||
52 | static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; | ||
53 | static unsigned int mw64[] = {}; | ||
54 | #else /* not __i386__ */ | ||
55 | static unsigned char prefix_codes[] = { | ||
56 | 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36, | ||
57 | 0xF0, 0xF3, 0xF2, | ||
58 | /* REX Prefixes */ | ||
59 | 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, | ||
60 | 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f | ||
61 | }; | ||
62 | /* AMD64 Manual 3, Appendix A*/ | ||
63 | static unsigned int reg_rop[] = { | ||
64 | 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F | ||
65 | }; | ||
66 | static unsigned int reg_wop[] = { 0x88, 0x89 }; | ||
67 | static unsigned int imm_wop[] = { 0xC6, 0xC7 }; | ||
68 | static unsigned int rw8[] = { 0xC6, 0x88, 0x8A }; | ||
69 | static unsigned int rw32[] = { | ||
70 | 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F | ||
71 | }; | ||
72 | /* 8 bit only */ | ||
73 | static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; | ||
74 | /* 16 bit only */ | ||
75 | static unsigned int mw16[] = { 0xB70F, 0xBF0F }; | ||
76 | /* 16 or 32 bit */ | ||
77 | static unsigned int mw32[] = { 0xC7 }; | ||
78 | /* 16, 32 or 64 bit */ | ||
79 | static unsigned int mw64[] = { 0x89, 0x8B }; | ||
80 | #endif /* not __i386__ */ | ||
81 | |||
82 | static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, | ||
83 | int *rexr) | ||
84 | { | ||
85 | int i; | ||
86 | unsigned char *p = addr; | ||
87 | *shorted = 0; | ||
88 | *enlarged = 0; | ||
89 | *rexr = 0; | ||
90 | |||
91 | restart: | ||
92 | for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { | ||
93 | if (*p == prefix_codes[i]) { | ||
94 | if (*p == 0x66) | ||
95 | *shorted = 1; | ||
96 | #ifdef __amd64__ | ||
97 | if ((*p & 0xf8) == 0x48) | ||
98 | *enlarged = 1; | ||
99 | if ((*p & 0xf4) == 0x44) | ||
100 | *rexr = 1; | ||
101 | #endif | ||
102 | p++; | ||
103 | goto restart; | ||
104 | } | ||
105 | } | ||
106 | |||
107 | return (p - addr); | ||
108 | } | ||
109 | |||
110 | static int get_opcode(unsigned char *addr, unsigned int *opcode) | ||
111 | { | ||
112 | int len; | ||
113 | |||
114 | if (*addr == 0x0F) { | ||
115 | /* 0x0F is extension instruction */ | ||
116 | *opcode = *(unsigned short *)addr; | ||
117 | len = 2; | ||
118 | } else { | ||
119 | *opcode = *addr; | ||
120 | len = 1; | ||
121 | } | ||
122 | |||
123 | return len; | ||
124 | } | ||
125 | |||
126 | #define CHECK_OP_TYPE(opcode, array, type) \ | ||
127 | for (i = 0; i < ARRAY_SIZE(array); i++) { \ | ||
128 | if (array[i] == opcode) { \ | ||
129 | rv = type; \ | ||
130 | goto exit; \ | ||
131 | } \ | ||
132 | } | ||
133 | |||
134 | enum reason_type get_ins_type(unsigned long ins_addr) | ||
135 | { | ||
136 | unsigned int opcode; | ||
137 | unsigned char *p; | ||
138 | int shorted, enlarged, rexr; | ||
139 | int i; | ||
140 | enum reason_type rv = OTHERS; | ||
141 | |||
142 | p = (unsigned char *)ins_addr; | ||
143 | p += skip_prefix(p, &shorted, &enlarged, &rexr); | ||
144 | p += get_opcode(p, &opcode); | ||
145 | |||
146 | CHECK_OP_TYPE(opcode, reg_rop, REG_READ); | ||
147 | CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE); | ||
148 | CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE); | ||
149 | |||
150 | exit: | ||
151 | return rv; | ||
152 | } | ||
153 | #undef CHECK_OP_TYPE | ||
154 | |||
155 | static unsigned int get_ins_reg_width(unsigned long ins_addr) | ||
156 | { | ||
157 | unsigned int opcode; | ||
158 | unsigned char *p; | ||
159 | int i, shorted, enlarged, rexr; | ||
160 | |||
161 | p = (unsigned char *)ins_addr; | ||
162 | p += skip_prefix(p, &shorted, &enlarged, &rexr); | ||
163 | p += get_opcode(p, &opcode); | ||
164 | |||
165 | for (i = 0; i < ARRAY_SIZE(rw8); i++) | ||
166 | if (rw8[i] == opcode) | ||
167 | return 1; | ||
168 | |||
169 | for (i = 0; i < ARRAY_SIZE(rw32); i++) | ||
170 | if (rw32[i] == opcode) | ||
171 | return (shorted ? 2 : (enlarged ? 8 : 4)); | ||
172 | |||
173 | printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); | ||
174 | return 0; | ||
175 | } | ||
176 | |||
177 | unsigned int get_ins_mem_width(unsigned long ins_addr) | ||
178 | { | ||
179 | unsigned int opcode; | ||
180 | unsigned char *p; | ||
181 | int i, shorted, enlarged, rexr; | ||
182 | |||
183 | p = (unsigned char *)ins_addr; | ||
184 | p += skip_prefix(p, &shorted, &enlarged, &rexr); | ||
185 | p += get_opcode(p, &opcode); | ||
186 | |||
187 | for (i = 0; i < ARRAY_SIZE(mw8); i++) | ||
188 | if (mw8[i] == opcode) | ||
189 | return 1; | ||
190 | |||
191 | for (i = 0; i < ARRAY_SIZE(mw16); i++) | ||
192 | if (mw16[i] == opcode) | ||
193 | return 2; | ||
194 | |||
195 | for (i = 0; i < ARRAY_SIZE(mw32); i++) | ||
196 | if (mw32[i] == opcode) | ||
197 | return shorted ? 2 : 4; | ||
198 | |||
199 | for (i = 0; i < ARRAY_SIZE(mw64); i++) | ||
200 | if (mw64[i] == opcode) | ||
201 | return shorted ? 2 : (enlarged ? 8 : 4); | ||
202 | |||
203 | printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); | ||
204 | return 0; | ||
205 | } | ||
206 | |||
207 | /* | ||
208 | * Define register ident in mod/rm byte. | ||
209 | * Note: these are NOT the same as in ptrace-abi.h. | ||
210 | */ | ||
211 | enum { | ||
212 | arg_AL = 0, | ||
213 | arg_CL = 1, | ||
214 | arg_DL = 2, | ||
215 | arg_BL = 3, | ||
216 | arg_AH = 4, | ||
217 | arg_CH = 5, | ||
218 | arg_DH = 6, | ||
219 | arg_BH = 7, | ||
220 | |||
221 | arg_AX = 0, | ||
222 | arg_CX = 1, | ||
223 | arg_DX = 2, | ||
224 | arg_BX = 3, | ||
225 | arg_SP = 4, | ||
226 | arg_BP = 5, | ||
227 | arg_SI = 6, | ||
228 | arg_DI = 7, | ||
229 | #ifdef __amd64__ | ||
230 | arg_R8 = 8, | ||
231 | arg_R9 = 9, | ||
232 | arg_R10 = 10, | ||
233 | arg_R11 = 11, | ||
234 | arg_R12 = 12, | ||
235 | arg_R13 = 13, | ||
236 | arg_R14 = 14, | ||
237 | arg_R15 = 15 | ||
238 | #endif | ||
239 | }; | ||
240 | |||
241 | static unsigned char *get_reg_w8(int no, struct pt_regs *regs) | ||
242 | { | ||
243 | unsigned char *rv = NULL; | ||
244 | |||
245 | switch (no) { | ||
246 | case arg_AL: | ||
247 | rv = (unsigned char *)®s->ax; | ||
248 | break; | ||
249 | case arg_BL: | ||
250 | rv = (unsigned char *)®s->bx; | ||
251 | break; | ||
252 | case arg_CL: | ||
253 | rv = (unsigned char *)®s->cx; | ||
254 | break; | ||
255 | case arg_DL: | ||
256 | rv = (unsigned char *)®s->dx; | ||
257 | break; | ||
258 | case arg_AH: | ||
259 | rv = 1 + (unsigned char *)®s->ax; | ||
260 | break; | ||
261 | case arg_BH: | ||
262 | rv = 1 + (unsigned char *)®s->bx; | ||
263 | break; | ||
264 | case arg_CH: | ||
265 | rv = 1 + (unsigned char *)®s->cx; | ||
266 | break; | ||
267 | case arg_DH: | ||
268 | rv = 1 + (unsigned char *)®s->dx; | ||
269 | break; | ||
270 | #ifdef __amd64__ | ||
271 | case arg_R8: | ||
272 | rv = (unsigned char *)®s->r8; | ||
273 | break; | ||
274 | case arg_R9: | ||
275 | rv = (unsigned char *)®s->r9; | ||
276 | break; | ||
277 | case arg_R10: | ||
278 | rv = (unsigned char *)®s->r10; | ||
279 | break; | ||
280 | case arg_R11: | ||
281 | rv = (unsigned char *)®s->r11; | ||
282 | break; | ||
283 | case arg_R12: | ||
284 | rv = (unsigned char *)®s->r12; | ||
285 | break; | ||
286 | case arg_R13: | ||
287 | rv = (unsigned char *)®s->r13; | ||
288 | break; | ||
289 | case arg_R14: | ||
290 | rv = (unsigned char *)®s->r14; | ||
291 | break; | ||
292 | case arg_R15: | ||
293 | rv = (unsigned char *)®s->r15; | ||
294 | break; | ||
295 | #endif | ||
296 | default: | ||
297 | printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); | ||
298 | break; | ||
299 | } | ||
300 | return rv; | ||
301 | } | ||
302 | |||
303 | static unsigned long *get_reg_w32(int no, struct pt_regs *regs) | ||
304 | { | ||
305 | unsigned long *rv = NULL; | ||
306 | |||
307 | switch (no) { | ||
308 | case arg_AX: | ||
309 | rv = ®s->ax; | ||
310 | break; | ||
311 | case arg_BX: | ||
312 | rv = ®s->bx; | ||
313 | break; | ||
314 | case arg_CX: | ||
315 | rv = ®s->cx; | ||
316 | break; | ||
317 | case arg_DX: | ||
318 | rv = ®s->dx; | ||
319 | break; | ||
320 | case arg_SP: | ||
321 | rv = ®s->sp; | ||
322 | break; | ||
323 | case arg_BP: | ||
324 | rv = ®s->bp; | ||
325 | break; | ||
326 | case arg_SI: | ||
327 | rv = ®s->si; | ||
328 | break; | ||
329 | case arg_DI: | ||
330 | rv = ®s->di; | ||
331 | break; | ||
332 | #ifdef __amd64__ | ||
333 | case arg_R8: | ||
334 | rv = ®s->r8; | ||
335 | break; | ||
336 | case arg_R9: | ||
337 | rv = ®s->r9; | ||
338 | break; | ||
339 | case arg_R10: | ||
340 | rv = ®s->r10; | ||
341 | break; | ||
342 | case arg_R11: | ||
343 | rv = ®s->r11; | ||
344 | break; | ||
345 | case arg_R12: | ||
346 | rv = ®s->r12; | ||
347 | break; | ||
348 | case arg_R13: | ||
349 | rv = ®s->r13; | ||
350 | break; | ||
351 | case arg_R14: | ||
352 | rv = ®s->r14; | ||
353 | break; | ||
354 | case arg_R15: | ||
355 | rv = ®s->r15; | ||
356 | break; | ||
357 | #endif | ||
358 | default: | ||
359 | printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); | ||
360 | } | ||
361 | |||
362 | return rv; | ||
363 | } | ||
364 | |||
365 | unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) | ||
366 | { | ||
367 | unsigned int opcode; | ||
368 | unsigned char mod_rm; | ||
369 | int reg; | ||
370 | unsigned char *p; | ||
371 | int i, shorted, enlarged, rexr; | ||
372 | unsigned long rv; | ||
373 | |||
374 | p = (unsigned char *)ins_addr; | ||
375 | p += skip_prefix(p, &shorted, &enlarged, &rexr); | ||
376 | p += get_opcode(p, &opcode); | ||
377 | for (i = 0; i < ARRAY_SIZE(reg_rop); i++) | ||
378 | if (reg_rop[i] == opcode) { | ||
379 | rv = REG_READ; | ||
380 | goto do_work; | ||
381 | } | ||
382 | |||
383 | for (i = 0; i < ARRAY_SIZE(reg_wop); i++) | ||
384 | if (reg_wop[i] == opcode) { | ||
385 | rv = REG_WRITE; | ||
386 | goto do_work; | ||
387 | } | ||
388 | |||
389 | printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " | ||
390 | "0x%02x\n", opcode); | ||
391 | goto err; | ||
392 | |||
393 | do_work: | ||
394 | mod_rm = *p; | ||
395 | reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); | ||
396 | switch (get_ins_reg_width(ins_addr)) { | ||
397 | case 1: | ||
398 | return *get_reg_w8(reg, regs); | ||
399 | |||
400 | case 2: | ||
401 | return *(unsigned short *)get_reg_w32(reg, regs); | ||
402 | |||
403 | case 4: | ||
404 | return *(unsigned int *)get_reg_w32(reg, regs); | ||
405 | |||
406 | #ifdef __amd64__ | ||
407 | case 8: | ||
408 | return *(unsigned long *)get_reg_w32(reg, regs); | ||
409 | #endif | ||
410 | |||
411 | default: | ||
412 | printk(KERN_ERR "mmiotrace: Error width# %d\n", reg); | ||
413 | } | ||
414 | |||
415 | err: | ||
416 | return 0; | ||
417 | } | ||
418 | |||
419 | unsigned long get_ins_imm_val(unsigned long ins_addr) | ||
420 | { | ||
421 | unsigned int opcode; | ||
422 | unsigned char mod_rm; | ||
423 | unsigned char mod; | ||
424 | unsigned char *p; | ||
425 | int i, shorted, enlarged, rexr; | ||
426 | unsigned long rv; | ||
427 | |||
428 | p = (unsigned char *)ins_addr; | ||
429 | p += skip_prefix(p, &shorted, &enlarged, &rexr); | ||
430 | p += get_opcode(p, &opcode); | ||
431 | for (i = 0; i < ARRAY_SIZE(imm_wop); i++) | ||
432 | if (imm_wop[i] == opcode) { | ||
433 | rv = IMM_WRITE; | ||
434 | goto do_work; | ||
435 | } | ||
436 | |||
437 | printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " | ||
438 | "0x%02x\n", opcode); | ||
439 | goto err; | ||
440 | |||
441 | do_work: | ||
442 | mod_rm = *p; | ||
443 | mod = mod_rm >> 6; | ||
444 | p++; | ||
445 | switch (mod) { | ||
446 | case 0: | ||
447 | /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */ | ||
448 | /* AMD64: XXX Check for address size prefix? */ | ||
449 | if ((mod_rm & 0x7) == 0x5) | ||
450 | p += 4; | ||
451 | break; | ||
452 | |||
453 | case 1: | ||
454 | p += 1; | ||
455 | break; | ||
456 | |||
457 | case 2: | ||
458 | p += 4; | ||
459 | break; | ||
460 | |||
461 | case 3: | ||
462 | default: | ||
463 | printk(KERN_ERR "mmiotrace: not a memory access instruction " | ||
464 | "at 0x%lx, rm_mod=0x%02x\n", | ||
465 | ins_addr, mod_rm); | ||
466 | } | ||
467 | |||
468 | switch (get_ins_reg_width(ins_addr)) { | ||
469 | case 1: | ||
470 | return *(unsigned char *)p; | ||
471 | |||
472 | case 2: | ||
473 | return *(unsigned short *)p; | ||
474 | |||
475 | case 4: | ||
476 | return *(unsigned int *)p; | ||
477 | |||
478 | #ifdef __amd64__ | ||
479 | case 8: | ||
480 | return *(unsigned long *)p; | ||
481 | #endif | ||
482 | |||
483 | default: | ||
484 | printk(KERN_ERR "mmiotrace: Error: width.\n"); | ||
485 | } | ||
486 | |||
487 | err: | ||
488 | return 0; | ||
489 | } | ||
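pf_in.c only decodes; the intended caller is the mmiotrace fault hook, which first classifies the faulting instruction and then recovers the value being moved either from a register or from the immediate operand. A hedged sketch of such a caller (the function name and the surrounding kmmio plumbing are assumptions, not part of this file):

#include <linux/ptrace.h>
#include "pf_in.h"

static void sketch_trace_access(unsigned long ip, struct pt_regs *regs)
{
	unsigned int width = get_ins_mem_width(ip);
	unsigned long val = 0;

	switch (get_ins_type(ip)) {
	case REG_WRITE:
		val = get_ins_reg_val(ip, regs);	/* value comes from a register */
		break;
	case IMM_WRITE:
		val = get_ins_imm_val(ip);		/* value is an immediate */
		break;
	case REG_READ:
		/* read value is only known after single-stepping the access */
		break;
	default:
		return;					/* not something we trace */
	}
	/* ... record width and val in the trace buffer here ... */
	(void)width;
	(void)val;
}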
diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h new file mode 100644 index 000000000000..e05341a51a27 --- /dev/null +++ b/arch/x86/mm/pf_in.h | |||
@@ -0,0 +1,39 @@ | |||
1 | /* | ||
2 | * Fault Injection Test harness (FI) | ||
3 | * Copyright (C) Intel Corp. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License | ||
7 | * as published by the Free Software Foundation; either version 2 | ||
8 | * of the License, or (at your option) any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, | ||
18 | * USA. | ||
19 | * | ||
20 | */ | ||
21 | |||
22 | #ifndef __PF_H_ | ||
23 | #define __PF_H_ | ||
24 | |||
25 | enum reason_type { | ||
26 | NOT_ME, /* page fault is not in regions */ | ||
27 | NOTHING, /* access others point in regions */ | ||
28 | REG_READ, /* read from addr to reg */ | ||
29 | REG_WRITE, /* write from reg to addr */ | ||
30 | IMM_WRITE, /* write from imm to addr */ | ||
31 | OTHERS /* Other instructions can not intercept */ | ||
32 | }; | ||
33 | |||
34 | enum reason_type get_ins_type(unsigned long ins_addr); | ||
35 | unsigned int get_ins_mem_width(unsigned long ins_addr); | ||
36 | unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs); | ||
37 | unsigned long get_ins_imm_val(unsigned long ins_addr); | ||
38 | |||
39 | #endif /* __PF_H_ */ | ||
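As a concrete decode example, worked out by hand from the opcode tables in pf_in.c rather than taken from the patch: the byte sequence 0x88 0x07 is `mov %al,(%edi)` on i386. get_ins_type() finds 0x88 in reg_wop and returns REG_WRITE, get_ins_mem_width() finds it in mw8 and returns 1, and get_ins_reg_val() reads reg field 0 from the ModRM byte 0x07 and returns the low byte of regs->ax, i.e. AL.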
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 50159764f694..86f2ffc43c3d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -2,6 +2,7 @@ | |||
2 | #include <asm/pgalloc.h> | 2 | #include <asm/pgalloc.h> |
3 | #include <asm/pgtable.h> | 3 | #include <asm/pgtable.h> |
4 | #include <asm/tlb.h> | 4 | #include <asm/tlb.h> |
5 | #include <asm/fixmap.h> | ||
5 | 6 | ||
6 | pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) | 7 | pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) |
7 | { | 8 | { |
@@ -62,16 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd) | |||
62 | #define UNSHARED_PTRS_PER_PGD \ | 63 | #define UNSHARED_PTRS_PER_PGD \ |
63 | (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) | 64 | (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) |
64 | 65 | ||
65 | static void pgd_ctor(void *p) | 66 | static void pgd_ctor(pgd_t *pgd) |
66 | { | 67 | { |
67 | pgd_t *pgd = p; | ||
68 | unsigned long flags; | ||
69 | |||
70 | /* Clear usermode parts of PGD */ | ||
71 | memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t)); | ||
72 | |||
73 | spin_lock_irqsave(&pgd_lock, flags); | ||
74 | |||
75 | /* If the pgd points to a shared pagetable level (either the | 68 | /* If the pgd points to a shared pagetable level (either the |
76 | ptes in non-PAE, or shared PMD in PAE), then just copy the | 69 | ptes in non-PAE, or shared PMD in PAE), then just copy the |
77 | references from swapper_pg_dir. */ | 70 | references from swapper_pg_dir. */ |
@@ -90,11 +83,9 @@ static void pgd_ctor(void *p) | |||
90 | /* list required to sync kernel mapping updates */ | 83 | /* list required to sync kernel mapping updates */ |
91 | if (!SHARED_KERNEL_PMD) | 84 | if (!SHARED_KERNEL_PMD) |
92 | pgd_list_add(pgd); | 85 | pgd_list_add(pgd); |
93 | |||
94 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
95 | } | 86 | } |
96 | 87 | ||
97 | static void pgd_dtor(void *pgd) | 88 | static void pgd_dtor(pgd_t *pgd) |
98 | { | 89 | { |
99 | unsigned long flags; /* can be called from interrupt context */ | 90 | unsigned long flags; /* can be called from interrupt context */ |
100 | 91 | ||
@@ -119,6 +110,72 @@ static void pgd_dtor(void *pgd) | |||
119 | 110 | ||
120 | #ifdef CONFIG_X86_PAE | 111 | #ifdef CONFIG_X86_PAE |
121 | /* | 112 | /* |
113 | * In PAE mode, we need to do a cr3 reload (=tlb flush) when | ||
114 | * updating the top-level pagetable entries to guarantee the | ||
115 | * processor notices the update. Since this is expensive, and | ||
116 | * all 4 top-level entries are used almost immediately in a | ||
117 | * new process's life, we just pre-populate them here. | ||
118 | * | ||
119 | * Also, if we're in a paravirt environment where the kernel pmd is | ||
120 | * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate | ||
121 | * and initialize the kernel pmds here. | ||
122 | */ | ||
123 | #define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD | ||
124 | |||
125 | void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) | ||
126 | { | ||
127 | paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); | ||
128 | |||
129 | /* Note: almost everything apart from _PAGE_PRESENT is | ||
130 | reserved at the pmd (PDPT) level. */ | ||
131 | set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); | ||
132 | |||
133 | /* | ||
134 | * According to Intel App note "TLBs, Paging-Structure Caches, | ||
135 | * and Their Invalidation", April 2007, document 317080-001, | ||
136 | * section 8.1: in PAE mode we explicitly have to flush the | ||
137 | * TLB via cr3 if the top-level pgd is changed... | ||
138 | */ | ||
139 | if (mm == current->active_mm) | ||
140 | write_cr3(read_cr3()); | ||
141 | } | ||
142 | #else /* !CONFIG_X86_PAE */ | ||
143 | |||
144 | /* No need to prepopulate any pagetable entries in non-PAE modes. */ | ||
145 | #define PREALLOCATED_PMDS 0 | ||
146 | |||
147 | #endif /* CONFIG_X86_PAE */ | ||
148 | |||
149 | static void free_pmds(pmd_t *pmds[]) | ||
150 | { | ||
151 | int i; | ||
152 | |||
153 | for(i = 0; i < PREALLOCATED_PMDS; i++) | ||
154 | if (pmds[i]) | ||
155 | free_page((unsigned long)pmds[i]); | ||
156 | } | ||
157 | |||
158 | static int preallocate_pmds(pmd_t *pmds[]) | ||
159 | { | ||
160 | int i; | ||
161 | bool failed = false; | ||
162 | |||
163 | for(i = 0; i < PREALLOCATED_PMDS; i++) { | ||
164 | pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); | ||
165 | if (pmd == NULL) | ||
166 | failed = true; | ||
167 | pmds[i] = pmd; | ||
168 | } | ||
169 | |||
170 | if (failed) { | ||
171 | free_pmds(pmds); | ||
172 | return -ENOMEM; | ||
173 | } | ||
174 | |||
175 | return 0; | ||
176 | } | ||
177 | |||
178 | /* | ||
122 | * Mop up any pmd pages which may still be attached to the pgd. | 179 | * Mop up any pmd pages which may still be attached to the pgd. |
123 | * Normally they will be freed by munmap/exit_mmap, but any pmd we | 180 | * Normally they will be freed by munmap/exit_mmap, but any pmd we |
124 | * preallocate which never got a corresponding vma will need to be | 181 | * preallocate which never got a corresponding vma will need to be |
@@ -128,7 +185,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) | |||
128 | { | 185 | { |
129 | int i; | 186 | int i; |
130 | 187 | ||
131 | for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { | 188 | for(i = 0; i < PREALLOCATED_PMDS; i++) { |
132 | pgd_t pgd = pgdp[i]; | 189 | pgd_t pgd = pgdp[i]; |
133 | 190 | ||
134 | if (pgd_val(pgd) != 0) { | 191 | if (pgd_val(pgd) != 0) { |
@@ -142,32 +199,20 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) | |||
142 | } | 199 | } |
143 | } | 200 | } |
144 | 201 | ||
145 | /* | 202 | static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) |
146 | * In PAE mode, we need to do a cr3 reload (=tlb flush) when | ||
147 | * updating the top-level pagetable entries to guarantee the | ||
148 | * processor notices the update. Since this is expensive, and | ||
149 | * all 4 top-level entries are used almost immediately in a | ||
150 | * new process's life, we just pre-populate them here. | ||
151 | * | ||
152 | * Also, if we're in a paravirt environment where the kernel pmd is | ||
153 | * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate | ||
154 | * and initialize the kernel pmds here. | ||
155 | */ | ||
156 | static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) | ||
157 | { | 203 | { |
158 | pud_t *pud; | 204 | pud_t *pud; |
159 | unsigned long addr; | 205 | unsigned long addr; |
160 | int i; | 206 | int i; |
161 | 207 | ||
208 | if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ | ||
209 | return; | ||
210 | |||
162 | pud = pud_offset(pgd, 0); | 211 | pud = pud_offset(pgd, 0); |
163 | for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; | ||
164 | i++, pud++, addr += PUD_SIZE) { | ||
165 | pmd_t *pmd = pmd_alloc_one(mm, addr); | ||
166 | 212 | ||
167 | if (!pmd) { | 213 | for (addr = i = 0; i < PREALLOCATED_PMDS; |
168 | pgd_mop_up_pmds(mm, pgd); | 214 | i++, pud++, addr += PUD_SIZE) { |
169 | return 0; | 215 | pmd_t *pmd = pmds[i]; |
170 | } | ||
171 | 216 | ||
172 | if (i >= KERNEL_PGD_BOUNDARY) | 217 | if (i >= KERNEL_PGD_BOUNDARY) |
173 | memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), | 218 | memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), |
@@ -175,61 +220,54 @@ static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) | |||
175 | 220 | ||
176 | pud_populate(mm, pud, pmd); | 221 | pud_populate(mm, pud, pmd); |
177 | } | 222 | } |
178 | |||
179 | return 1; | ||
180 | } | 223 | } |
181 | 224 | ||
182 | void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) | 225 | pgd_t *pgd_alloc(struct mm_struct *mm) |
183 | { | 226 | { |
184 | paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); | 227 | pgd_t *pgd; |
228 | pmd_t *pmds[PREALLOCATED_PMDS]; | ||
229 | unsigned long flags; | ||
185 | 230 | ||
186 | /* Note: almost everything apart from _PAGE_PRESENT is | 231 | pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); |
187 | reserved at the pmd (PDPT) level. */ | ||
188 | set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); | ||
189 | 232 | ||
190 | /* | 233 | if (pgd == NULL) |
191 | * According to Intel App note "TLBs, Paging-Structure Caches, | 234 | goto out; |
192 | * and Their Invalidation", April 2007, document 317080-001, | ||
193 | * section 8.1: in PAE mode we explicitly have to flush the | ||
194 | * TLB via cr3 if the top-level pgd is changed... | ||
195 | */ | ||
196 | if (mm == current->active_mm) | ||
197 | write_cr3(read_cr3()); | ||
198 | } | ||
199 | #else /* !CONFIG_X86_PAE */ | ||
200 | /* No need to prepopulate any pagetable entries in non-PAE modes. */ | ||
201 | static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) | ||
202 | { | ||
203 | return 1; | ||
204 | } | ||
205 | 235 | ||
206 | static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) | 236 | mm->pgd = pgd; |
207 | { | ||
208 | } | ||
209 | #endif /* CONFIG_X86_PAE */ | ||
210 | 237 | ||
211 | pgd_t *pgd_alloc(struct mm_struct *mm) | 238 | if (preallocate_pmds(pmds) != 0) |
212 | { | 239 | goto out_free_pgd; |
213 | pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); | ||
214 | 240 | ||
215 | /* so that alloc_pmd can use it */ | 241 | if (paravirt_pgd_alloc(mm) != 0) |
216 | mm->pgd = pgd; | 242 | goto out_free_pmds; |
217 | if (pgd) | ||
218 | pgd_ctor(pgd); | ||
219 | 243 | ||
220 | if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { | 244 | /* |
221 | pgd_dtor(pgd); | 245 | * Make sure that pre-populating the pmds is atomic with |
222 | free_page((unsigned long)pgd); | 246 | * respect to anything walking the pgd_list, so that they |
223 | pgd = NULL; | 247 | * never see a partially populated pgd. |
224 | } | 248 | */ |
249 | spin_lock_irqsave(&pgd_lock, flags); | ||
250 | |||
251 | pgd_ctor(pgd); | ||
252 | pgd_prepopulate_pmd(mm, pgd, pmds); | ||
253 | |||
254 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
225 | 255 | ||
226 | return pgd; | 256 | return pgd; |
257 | |||
258 | out_free_pmds: | ||
259 | free_pmds(pmds); | ||
260 | out_free_pgd: | ||
261 | free_page((unsigned long)pgd); | ||
262 | out: | ||
263 | return NULL; | ||
227 | } | 264 | } |
228 | 265 | ||
229 | void pgd_free(struct mm_struct *mm, pgd_t *pgd) | 266 | void pgd_free(struct mm_struct *mm, pgd_t *pgd) |
230 | { | 267 | { |
231 | pgd_mop_up_pmds(mm, pgd); | 268 | pgd_mop_up_pmds(mm, pgd); |
232 | pgd_dtor(pgd); | 269 | pgd_dtor(pgd); |
270 | paravirt_pgd_free(mm, pgd); | ||
233 | free_page((unsigned long)pgd); | 271 | free_page((unsigned long)pgd); |
234 | } | 272 | } |
235 | 273 | ||
@@ -255,7 +293,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, | |||
255 | 293 | ||
256 | if (pte_young(*ptep)) | 294 | if (pte_young(*ptep)) |
257 | ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, | 295 | ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, |
258 | &ptep->pte); | 296 | (unsigned long *) &ptep->pte); |
259 | 297 | ||
260 | if (ret) | 298 | if (ret) |
261 | pte_update(vma->vm_mm, addr, ptep); | 299 | pte_update(vma->vm_mm, addr, ptep); |
@@ -274,3 +312,22 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, | |||
274 | 312 | ||
275 | return young; | 313 | return young; |
276 | } | 314 | } |
315 | |||
316 | int fixmaps_set; | ||
317 | |||
318 | void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) | ||
319 | { | ||
320 | unsigned long address = __fix_to_virt(idx); | ||
321 | |||
322 | if (idx >= __end_of_fixed_addresses) { | ||
323 | BUG(); | ||
324 | return; | ||
325 | } | ||
326 | set_pte_vaddr(address, pte); | ||
327 | fixmaps_set++; | ||
328 | } | ||
329 | |||
330 | void native_set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags) | ||
331 | { | ||
332 | __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); | ||
333 | } | ||
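The pgd_alloc() rework above splits allocation into two phases: everything that may sleep (the GFP_KERNEL pmd preallocation and paravirt_pgd_alloc()) happens before pgd_lock is taken, and only the constructor plus pmd prepopulation run inside the critical section, so anything walking pgd_list never observes a half-built pgd. A hedged sketch of that preallocate-then-commit shape, using hypothetical names (struct thing, publish_lock) rather than the pgtable.c ones:

#include <linux/slab.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(publish_lock);

struct thing { void *parts[4]; };

static struct thing *thing_alloc(void)
{
	struct thing *t = kzalloc(sizeof(*t), GFP_KERNEL);
	int i;

	if (!t)
		return NULL;

	for (i = 0; i < 4; i++) {		/* may sleep: do it unlocked */
		t->parts[i] = kzalloc(64, GFP_KERNEL);
		if (!t->parts[i])
			goto out_free;
	}

	spin_lock(&publish_lock);		/* short, atomic publish step */
	/* ... link t into a global list, copy shared state, etc. ... */
	spin_unlock(&publish_lock);
	return t;

out_free:
	while (i--)
		kfree(t->parts[i]);
	kfree(t);
	return NULL;
}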
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 369cf065b6a4..0951db9ee519 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c | |||
@@ -20,58 +20,11 @@ | |||
20 | #include <asm/tlb.h> | 20 | #include <asm/tlb.h> |
21 | #include <asm/tlbflush.h> | 21 | #include <asm/tlbflush.h> |
22 | 22 | ||
23 | void show_mem(void) | ||
24 | { | ||
25 | int total = 0, reserved = 0; | ||
26 | int shared = 0, cached = 0; | ||
27 | int highmem = 0; | ||
28 | struct page *page; | ||
29 | pg_data_t *pgdat; | ||
30 | unsigned long i; | ||
31 | unsigned long flags; | ||
32 | |||
33 | printk(KERN_INFO "Mem-info:\n"); | ||
34 | show_free_areas(); | ||
35 | for_each_online_pgdat(pgdat) { | ||
36 | pgdat_resize_lock(pgdat, &flags); | ||
37 | for (i = 0; i < pgdat->node_spanned_pages; ++i) { | ||
38 | if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) | ||
39 | touch_nmi_watchdog(); | ||
40 | page = pgdat_page_nr(pgdat, i); | ||
41 | total++; | ||
42 | if (PageHighMem(page)) | ||
43 | highmem++; | ||
44 | if (PageReserved(page)) | ||
45 | reserved++; | ||
46 | else if (PageSwapCache(page)) | ||
47 | cached++; | ||
48 | else if (page_count(page)) | ||
49 | shared += page_count(page) - 1; | ||
50 | } | ||
51 | pgdat_resize_unlock(pgdat, &flags); | ||
52 | } | ||
53 | printk(KERN_INFO "%d pages of RAM\n", total); | ||
54 | printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); | ||
55 | printk(KERN_INFO "%d reserved pages\n", reserved); | ||
56 | printk(KERN_INFO "%d pages shared\n", shared); | ||
57 | printk(KERN_INFO "%d pages swap cached\n", cached); | ||
58 | |||
59 | printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY)); | ||
60 | printk(KERN_INFO "%lu pages writeback\n", | ||
61 | global_page_state(NR_WRITEBACK)); | ||
62 | printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED)); | ||
63 | printk(KERN_INFO "%lu pages slab\n", | ||
64 | global_page_state(NR_SLAB_RECLAIMABLE) + | ||
65 | global_page_state(NR_SLAB_UNRECLAIMABLE)); | ||
66 | printk(KERN_INFO "%lu pages pagetables\n", | ||
67 | global_page_state(NR_PAGETABLE)); | ||
68 | } | ||
69 | |||
70 | /* | 23 | /* |
71 | * Associate a virtual page frame with a given physical page frame | 24 | * Associate a virtual page frame with a given physical page frame |
72 | * and protection flags for that frame. | 25 | * and protection flags for that frame. |
73 | */ | 26 | */ |
74 | static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) | 27 | void set_pte_vaddr(unsigned long vaddr, pte_t pteval) |
75 | { | 28 | { |
76 | pgd_t *pgd; | 29 | pgd_t *pgd; |
77 | pud_t *pud; | 30 | pud_t *pud; |
@@ -94,8 +47,8 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) | |||
94 | return; | 47 | return; |
95 | } | 48 | } |
96 | pte = pte_offset_kernel(pmd, vaddr); | 49 | pte = pte_offset_kernel(pmd, vaddr); |
97 | if (pgprot_val(flags)) | 50 | if (pte_val(pteval)) |
98 | set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags)); | 51 | set_pte_present(&init_mm, vaddr, pte, pteval); |
99 | else | 52 | else |
100 | pte_clear(&init_mm, vaddr, pte); | 53 | pte_clear(&init_mm, vaddr, pte); |
101 | 54 | ||
@@ -141,22 +94,9 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) | |||
141 | __flush_tlb_one(vaddr); | 94 | __flush_tlb_one(vaddr); |
142 | } | 95 | } |
143 | 96 | ||
144 | static int fixmaps; | ||
145 | unsigned long __FIXADDR_TOP = 0xfffff000; | 97 | unsigned long __FIXADDR_TOP = 0xfffff000; |
146 | EXPORT_SYMBOL(__FIXADDR_TOP); | 98 | EXPORT_SYMBOL(__FIXADDR_TOP); |
147 | 99 | ||
148 | void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) | ||
149 | { | ||
150 | unsigned long address = __fix_to_virt(idx); | ||
151 | |||
152 | if (idx >= __end_of_fixed_addresses) { | ||
153 | BUG(); | ||
154 | return; | ||
155 | } | ||
156 | set_pte_pfn(address, phys >> PAGE_SHIFT, flags); | ||
157 | fixmaps++; | ||
158 | } | ||
159 | |||
160 | /** | 100 | /** |
161 | * reserve_top_address - reserves a hole in the top of kernel address space | 101 | * reserve_top_address - reserves a hole in the top of kernel address space |
162 | * @reserve - size of hole to reserve | 102 | * @reserve - size of hole to reserve |
@@ -164,11 +104,45 @@ void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) | |||
164 | * Can be used to relocate the fixmap area and poke a hole in the top | 104 | * Can be used to relocate the fixmap area and poke a hole in the top |
165 | * of kernel address space to make room for a hypervisor. | 105 | * of kernel address space to make room for a hypervisor. |
166 | */ | 106 | */ |
167 | void reserve_top_address(unsigned long reserve) | 107 | void __init reserve_top_address(unsigned long reserve) |
168 | { | 108 | { |
169 | BUG_ON(fixmaps > 0); | 109 | BUG_ON(fixmaps_set > 0); |
170 | printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", | 110 | printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", |
171 | (int)-reserve); | 111 | (int)-reserve); |
172 | __FIXADDR_TOP = -reserve - PAGE_SIZE; | 112 | __FIXADDR_TOP = -reserve - PAGE_SIZE; |
173 | __VMALLOC_RESERVE += reserve; | 113 | __VMALLOC_RESERVE += reserve; |
174 | } | 114 | } |
115 | |||
116 | /* | ||
117 | * vmalloc=size forces the vmalloc area to be exactly 'size' | ||
118 | * bytes. This can be used to increase (or decrease) the | ||
119 | * vmalloc area - the default is 128m. | ||
120 | */ | ||
121 | static int __init parse_vmalloc(char *arg) | ||
122 | { | ||
123 | if (!arg) | ||
124 | return -EINVAL; | ||
125 | |||
126 | /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/ | ||
127 | __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET; | ||
128 | return 0; | ||
129 | } | ||
130 | early_param("vmalloc", parse_vmalloc); | ||
131 | |||
132 | /* | ||
133 | * reservetop=size reserves a hole at the top of the kernel address space which | ||
134 | * a hypervisor can load into later. Needed for dynamically loaded hypervisors, | ||
135 | * so relocating the fixmap can be done before paging initialization. | ||
136 | */ | ||
137 | static int __init parse_reservetop(char *arg) | ||
138 | { | ||
139 | unsigned long address; | ||
140 | |||
141 | if (!arg) | ||
142 | return -EINVAL; | ||
143 | |||
144 | address = memparse(arg, &arg); | ||
145 | reserve_top_address(address); | ||
146 | return 0; | ||
147 | } | ||
148 | early_param("reservetop", parse_reservetop); | ||
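The two new early parameters give 32-bit kernels boot-time control over the layout. As an illustration with made-up sizes (not values from the patch), booting with "vmalloc=192M" asks for a 192 MB vmalloc area (the VMALLOC_OFFSET guard gap is added on top of the parsed size), while "reservetop=16M" calls reserve_top_address() before paging is initialized, reserving a 16 MB hole at the top of the kernel address space that a hypervisor can claim later.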
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c new file mode 100644 index 000000000000..16ae70fc57e7 --- /dev/null +++ b/arch/x86/mm/srat_32.c | |||
@@ -0,0 +1,283 @@ | |||
1 | /* | ||
2 | * Some of the code in this file has been gleaned from the 64 bit | ||
3 | * discontigmem support code base. | ||
4 | * | ||
5 | * Copyright (C) 2002, IBM Corp. | ||
6 | * | ||
7 | * All rights reserved. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2 of the License, or | ||
12 | * (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, but | ||
15 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
17 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
18 | * details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * along with this program; if not, write to the Free Software | ||
22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | * | ||
24 | * Send feedback to Pat Gaughen <gone@us.ibm.com> | ||
25 | */ | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/bootmem.h> | ||
28 | #include <linux/mmzone.h> | ||
29 | #include <linux/acpi.h> | ||
30 | #include <linux/nodemask.h> | ||
31 | #include <asm/srat.h> | ||
32 | #include <asm/topology.h> | ||
33 | #include <asm/smp.h> | ||
34 | #include <asm/e820.h> | ||
35 | |||
36 | /* | ||
37 | * proximity macros and definitions | ||
38 | */ | ||
39 | #define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ | ||
40 | #define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ | ||
41 | #define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) | ||
42 | #define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) | ||
43 | /* bitmap length; _PXM is at most 255 */ | ||
44 | #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) | ||
45 | static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ | ||
46 | |||
47 | #define MAX_CHUNKS_PER_NODE 3 | ||
48 | #define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) | ||
49 | struct node_memory_chunk_s { | ||
50 | unsigned long start_pfn; | ||
51 | unsigned long end_pfn; | ||
52 | u8 pxm; // proximity domain of node | ||
53 | u8 nid; // which cnode contains this chunk? | ||
54 | u8 bank; // which mem bank on this node | ||
55 | }; | ||
56 | static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; | ||
57 | |||
58 | static int __initdata num_memory_chunks; /* total number of memory chunks */ | ||
59 | static u8 __initdata apicid_to_pxm[MAX_APICID]; | ||
60 | |||
61 | int numa_off __initdata; | ||
62 | int acpi_numa __initdata; | ||
63 | |||
64 | static __init void bad_srat(void) | ||
65 | { | ||
66 | printk(KERN_ERR "SRAT: SRAT not used.\n"); | ||
67 | acpi_numa = -1; | ||
68 | num_memory_chunks = 0; | ||
69 | } | ||
70 | |||
71 | static __init inline int srat_disabled(void) | ||
72 | { | ||
73 | return numa_off || acpi_numa < 0; | ||
74 | } | ||
75 | |||
76 | /* Identify CPU proximity domains */ | ||
77 | void __init | ||
78 | acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) | ||
79 | { | ||
80 | if (srat_disabled()) | ||
81 | return; | ||
82 | if (cpu_affinity->header.length != | ||
83 | sizeof(struct acpi_srat_cpu_affinity)) { | ||
84 | bad_srat(); | ||
85 | return; | ||
86 | } | ||
87 | |||
88 | if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) | ||
89 | return; /* empty entry */ | ||
90 | |||
91 | /* mark this node as "seen" in node bitmap */ | ||
92 | BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); | ||
93 | |||
94 | apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; | ||
95 | |||
96 | printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", | ||
97 | cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * Identify memory proximity domains and hot-remove capabilities. | ||
102 | * Fill node memory chunk list structure. | ||
103 | */ | ||
104 | void __init | ||
105 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) | ||
106 | { | ||
107 | unsigned long long paddr, size; | ||
108 | unsigned long start_pfn, end_pfn; | ||
109 | u8 pxm; | ||
110 | struct node_memory_chunk_s *p, *q, *pend; | ||
111 | |||
112 | if (srat_disabled()) | ||
113 | return; | ||
114 | if (memory_affinity->header.length != | ||
115 | sizeof(struct acpi_srat_mem_affinity)) { | ||
116 | bad_srat(); | ||
117 | return; | ||
118 | } | ||
119 | |||
120 | if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) | ||
121 | return; /* empty entry */ | ||
122 | |||
123 | pxm = memory_affinity->proximity_domain & 0xff; | ||
124 | |||
125 | /* mark this node as "seen" in node bitmap */ | ||
126 | BMAP_SET(pxm_bitmap, pxm); | ||
127 | |||
128 | /* calculate info for memory chunk structure */ | ||
129 | paddr = memory_affinity->base_address; | ||
130 | size = memory_affinity->length; | ||
131 | |||
132 | start_pfn = paddr >> PAGE_SHIFT; | ||
133 | end_pfn = (paddr + size) >> PAGE_SHIFT; | ||
134 | |||
135 | |||
136 | if (num_memory_chunks >= MAXCHUNKS) { | ||
137 | printk(KERN_WARNING "Too many mem chunks in SRAT." | ||
138 | " Ignoring %lld MBytes at %llx\n", | ||
139 | size/(1024*1024), paddr); | ||
140 | return; | ||
141 | } | ||
142 | |||
143 | /* Insertion sort based on base address */ | ||
144 | pend = &node_memory_chunk[num_memory_chunks]; | ||
145 | for (p = &node_memory_chunk[0]; p < pend; p++) { | ||
146 | if (start_pfn < p->start_pfn) | ||
147 | break; | ||
148 | } | ||
149 | if (p < pend) { | ||
150 | for (q = pend; q >= p; q--) | ||
151 | *(q + 1) = *q; | ||
152 | } | ||
153 | p->start_pfn = start_pfn; | ||
154 | p->end_pfn = end_pfn; | ||
155 | p->pxm = pxm; | ||
156 | |||
157 | num_memory_chunks++; | ||
158 | |||
159 | printk(KERN_DEBUG "Memory range %08lx to %08lx" | ||
160 | " in proximity domain %02x %s\n", | ||
161 | start_pfn, end_pfn, | ||
162 | pxm, | ||
163 | ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? | ||
164 | "enabled and removable" : "enabled" ) ); | ||
165 | } | ||
166 | |||
167 | /* Callback for SLIT parsing */ | ||
168 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) | ||
169 | { | ||
170 | } | ||
171 | |||
172 | void acpi_numa_arch_fixup(void) | ||
173 | { | ||
174 | } | ||
175 | /* | ||
176 | * The SRAT table always lists ascending addresses, so can always | ||
177 | * assume that the first "start" address that you see is the real | ||
178 | * start of the node, and that the current "end" address is after | ||
179 | * the previous one. | ||
180 | */ | ||
181 | static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) | ||
182 | { | ||
183 | /* | ||
184 | * Only add present memory as told by the e820. | ||
185 | * There is no guarantee from the SRAT that the memory it | ||
186 | * enumerates is present at boot time because it represents | ||
187 | * *possible* memory hotplug areas the same as normal RAM. | ||
188 | */ | ||
189 | if (memory_chunk->start_pfn >= max_pfn) { | ||
190 | printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", | ||
191 | memory_chunk->start_pfn, memory_chunk->end_pfn); | ||
192 | return -1; | ||
193 | } | ||
194 | if (memory_chunk->nid != nid) | ||
195 | return -1; | ||
196 | |||
197 | if (!node_has_online_mem(nid)) | ||
198 | node_start_pfn[nid] = memory_chunk->start_pfn; | ||
199 | |||
200 | if (node_start_pfn[nid] > memory_chunk->start_pfn) | ||
201 | node_start_pfn[nid] = memory_chunk->start_pfn; | ||
202 | |||
203 | if (node_end_pfn[nid] < memory_chunk->end_pfn) | ||
204 | node_end_pfn[nid] = memory_chunk->end_pfn; | ||
205 | |||
206 | return 0; | ||
207 | } | ||
208 | |||
209 | int __init get_memcfg_from_srat(void) | ||
210 | { | ||
211 | int i, j, nid; | ||
212 | |||
213 | |||
214 | if (srat_disabled()) | ||
215 | goto out_fail; | ||
216 | |||
217 | if (num_memory_chunks == 0) { | ||
218 | printk(KERN_WARNING | ||
219 | "could not finy any ACPI SRAT memory areas.\n"); | ||
220 | goto out_fail; | ||
221 | } | ||
222 | |||
223 | /* Calculate total number of nodes in system from PXM bitmap and create | ||
224 | * a set of sequential node IDs starting at zero. (ACPI doesn't seem | ||
225 | * to specify the range of _PXM values.) | ||
226 | */ | ||
227 | /* | ||
228 | * MCD - we no longer HAVE to number nodes sequentially. PXM domain | ||
229 | * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically | ||
230 | * 32, so we will continue numbering them in this manner until MAX_NUMNODES | ||
231 | * approaches MAX_PXM_DOMAINS for i386. | ||
232 | */ | ||
233 | nodes_clear(node_online_map); | ||
234 | for (i = 0; i < MAX_PXM_DOMAINS; i++) { | ||
235 | if (BMAP_TEST(pxm_bitmap, i)) { | ||
236 | int nid = acpi_map_pxm_to_node(i); | ||
237 | node_set_online(nid); | ||
238 | } | ||
239 | } | ||
240 | BUG_ON(num_online_nodes() == 0); | ||
241 | |||
242 | /* set cnode id in memory chunk structure */ | ||
243 | for (i = 0; i < num_memory_chunks; i++) | ||
244 | node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); | ||
245 | |||
246 | printk(KERN_DEBUG "pxm bitmap: "); | ||
247 | for (i = 0; i < sizeof(pxm_bitmap); i++) { | ||
248 | printk(KERN_CONT "%02x ", pxm_bitmap[i]); | ||
249 | } | ||
250 | printk(KERN_CONT "\n"); | ||
251 | printk(KERN_DEBUG "Number of logical nodes in system = %d\n", | ||
252 | num_online_nodes()); | ||
253 | printk(KERN_DEBUG "Number of memory chunks in system = %d\n", | ||
254 | num_memory_chunks); | ||
255 | |||
256 | for (i = 0; i < MAX_APICID; i++) | ||
257 | apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); | ||
258 | |||
259 | for (j = 0; j < num_memory_chunks; j++){ | ||
260 | struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; | ||
261 | printk(KERN_DEBUG | ||
262 | "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", | ||
263 | j, chunk->nid, chunk->start_pfn, chunk->end_pfn); | ||
264 | if (node_read_chunk(chunk->nid, chunk)) | ||
265 | continue; | ||
266 | |||
267 | e820_register_active_regions(chunk->nid, chunk->start_pfn, | ||
268 | min(chunk->end_pfn, max_pfn)); | ||
269 | } | ||
270 | |||
271 | for_each_online_node(nid) { | ||
272 | unsigned long start = node_start_pfn[nid]; | ||
273 | unsigned long end = min(node_end_pfn[nid], max_pfn); | ||
274 | |||
275 | memory_present(nid, start, end); | ||
276 | node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); | ||
277 | } | ||
278 | return 1; | ||
279 | out_fail: | ||
280 | printk(KERN_ERR "failed to get NUMA memory information from SRAT" | ||
281 | " table\n"); | ||
282 | return 0; | ||
283 | } | ||
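To make the proximity-domain bitmap arithmetic concrete (example value, not from the patch): for _PXM 0x1a, NODE_ARRAY_INDEX(0x1a) is 3 and NODE_ARRAY_OFFSET(0x1a) is 2, so BMAP_SET(pxm_bitmap, 0x1a) sets bit 2 of pxm_bitmap[3]. get_memcfg_from_srat() later walks all MAX_PXM_DOMAINS bits with BMAP_TEST() and calls acpi_map_pxm_to_node() for each one that is set, which is what turns sparse firmware domain numbers into the dense node ids used by the rest of the code.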
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 99649dccad28..1b4763e26ea9 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c | |||
@@ -100,7 +100,19 @@ static __init inline int srat_disabled(void) | |||
100 | /* Callback for SLIT parsing */ | 100 | /* Callback for SLIT parsing */ |
101 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) | 101 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) |
102 | { | 102 | { |
103 | acpi_slit = slit; | 103 | unsigned length; |
104 | unsigned long phys; | ||
105 | |||
106 | length = slit->header.length; | ||
107 | phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length, | ||
108 | PAGE_SIZE); | ||
109 | |||
110 | if (phys == -1L) | ||
111 | panic(" Can not save slit!\n"); | ||
112 | |||
113 | acpi_slit = __va(phys); | ||
114 | memcpy(acpi_slit, slit, length); | ||
115 | reserve_early(phys, phys + length, "ACPI SLIT"); | ||
104 | } | 116 | } |
105 | 117 | ||
106 | /* Callback for Proximity Domain -> LAPIC mapping */ | 118 | /* Callback for Proximity Domain -> LAPIC mapping */ |
@@ -299,7 +311,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) | |||
299 | pxmram = 0; | 311 | pxmram = 0; |
300 | } | 312 | } |
301 | 313 | ||
302 | e820ram = end_pfn - absent_pages_in_range(0, end_pfn); | 314 | e820ram = max_pfn - absent_pages_in_range(0, max_pfn); |
303 | /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ | 315 | /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ |
304 | if ((long)(e820ram - pxmram) >= 1*1024*1024) { | 316 | if ((long)(e820ram - pxmram) >= 1*1024*1024) { |
305 | printk(KERN_ERR | 317 | printk(KERN_ERR |
@@ -376,7 +388,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) | |||
376 | if (node == NUMA_NO_NODE) | 388 | if (node == NUMA_NO_NODE) |
377 | continue; | 389 | continue; |
378 | if (!node_isset(node, node_possible_map)) | 390 | if (!node_isset(node, node_possible_map)) |
379 | numa_set_node(i, NUMA_NO_NODE); | 391 | numa_clear_node(i); |
380 | } | 392 | } |
381 | numa_init_array(); | 393 | numa_init_array(); |
382 | return 0; | 394 | return 0; |
@@ -495,6 +507,7 @@ int __node_distance(int a, int b) | |||
495 | 507 | ||
496 | EXPORT_SYMBOL(__node_distance); | 508 | EXPORT_SYMBOL(__node_distance); |
497 | 509 | ||
510 | #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) | ||
498 | int memory_add_physaddr_to_nid(u64 start) | 511 | int memory_add_physaddr_to_nid(u64 start) |
499 | { | 512 | { |
500 | int i, ret = 0; | 513 | int i, ret = 0; |
@@ -506,4 +519,4 @@ int memory_add_physaddr_to_nid(u64 start) | |||
506 | return ret; | 519 | return ret; |
507 | } | 520 | } |
508 | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | 521 | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); |
509 | 522 | #endif | |
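The SLIT hunk copies the table out of the ACPI mapping because the parser only borrows the firmware table during parsing, while __node_distance() keeps dereferencing acpi_slit for the lifetime of the system; find_e820_area() plus reserve_early() give the copy a permanent early-boot home. A hedged sketch of the lookup the saved copy serves (slit_distance() is a hypothetical simplification; the real __node_distance() translates node ids to proximity domains first, and the struct layout is the standard ACPICA one):

#include <linux/acpi.h>

static int slit_distance(struct acpi_table_slit *slit, int pxm_a, int pxm_b)
{
	/* entry[] is a locality_count x locality_count matrix of u8 distances */
	return slit->entry[pxm_a * slit->locality_count + pxm_b];
}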
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c new file mode 100644 index 000000000000..d877c5b423ef --- /dev/null +++ b/arch/x86/mm/testmmiotrace.c | |||
@@ -0,0 +1,71 @@ | |||
1 | /* | ||
2 | * Written by Pekka Paalanen, 2008 <pq@iki.fi> | ||
3 | */ | ||
4 | #include <linux/module.h> | ||
5 | #include <linux/io.h> | ||
6 | |||
7 | #define MODULE_NAME "testmmiotrace" | ||
8 | |||
9 | static unsigned long mmio_address; | ||
10 | module_param(mmio_address, ulong, 0); | ||
11 | MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); | ||
12 | |||
13 | static void do_write_test(void __iomem *p) | ||
14 | { | ||
15 | unsigned int i; | ||
16 | for (i = 0; i < 256; i++) | ||
17 | iowrite8(i, p + i); | ||
18 | for (i = 1024; i < (5 * 1024); i += 2) | ||
19 | iowrite16(i * 12 + 7, p + i); | ||
20 | for (i = (5 * 1024); i < (16 * 1024); i += 4) | ||
21 | iowrite32(i * 212371 + 13, p + i); | ||
22 | } | ||
23 | |||
24 | static void do_read_test(void __iomem *p) | ||
25 | { | ||
26 | unsigned int i; | ||
27 | for (i = 0; i < 256; i++) | ||
28 | ioread8(p + i); | ||
29 | for (i = 1024; i < (5 * 1024); i += 2) | ||
30 | ioread16(p + i); | ||
31 | for (i = (5 * 1024); i < (16 * 1024); i += 4) | ||
32 | ioread32(p + i); | ||
33 | } | ||
34 | |||
35 | static void do_test(void) | ||
36 | { | ||
37 | void __iomem *p = ioremap_nocache(mmio_address, 0x4000); | ||
38 | if (!p) { | ||
39 | pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); | ||
40 | return; | ||
41 | } | ||
42 | do_write_test(p); | ||
43 | do_read_test(p); | ||
44 | iounmap(p); | ||
45 | } | ||
46 | |||
47 | static int __init init(void) | ||
48 | { | ||
49 | if (mmio_address == 0) { | ||
50 | pr_err(MODULE_NAME ": you have to use the module argument " | ||
51 | "mmio_address.\n"); | ||
52 | pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" | ||
53 | " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); | ||
54 | return -ENXIO; | ||
55 | } | ||
56 | |||
57 | pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " | ||
58 | "in PCI address space, and writing " | ||
59 | "rubbish in there.\n", mmio_address); | ||
60 | do_test(); | ||
61 | return 0; | ||
62 | } | ||
63 | |||
64 | static void __exit cleanup(void) | ||
65 | { | ||
66 | pr_debug(MODULE_NAME ": unloaded.\n"); | ||
67 | } | ||
68 | |||
69 | module_init(init); | ||
70 | module_exit(cleanup); | ||
71 | MODULE_LICENSE("GPL"); | ||
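For completeness, a typical (hypothetical) way to exercise the test module once mmiotrace is enabled would be `insmod testmmiotrace.ko mmio_address=0xfbbff000`, where the address is the start of a 16 kB MMIO window that is safe to clobber, for example an unused BAR of an otherwise idle device. The module runs its write and read sweeps once at load time and does nothing further until it is removed.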