author     Yinghai Lu <yinghai@kernel.org>        2012-11-16 22:38:58 -0500
committer  H. Peter Anvin <hpa@linux.intel.com>   2012-11-17 14:59:19 -0500
commit     8d57470d8f859635deffe3919d7d4867b488b85a
tree       8576bb911f3f671952a6d2310b787e1ed55e4456 /arch/x86
parent     f763ad1d3870abb811ec7520b4c1adc56471a3a4
x86, mm: setup page table in top-down
Get the initial pgt_buf early from BRK and use it to map PMD_SIZE from the
top of RAM first. Then use the newly mapped pages to map further ranges
below them, and keep looping until all memory is mapped.
alloc_low_page() will use pages from BRK at first; after that buffer is used
up, it will use memblock to find and reserve pages for page table usage.
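
For illustration only (not part of the patch), here is a small stand-alone
C sketch of the allocation order alloc_low_page() now follows. The pfn
values, the buffer location, and the sequential "next_free" search are made
up stand-ins for the real BRK buffer and memblock_find_in_range():

	/*
	 * Stand-alone illustration, not kernel code: a few pages come from
	 * the BRK-backed pgt_buf first, then pages are taken out of the
	 * already-mapped window [min_pfn_mapped, max_pfn_mapped).
	 */
	#include <stdio.h>

	#define INIT_PGT_BUF_PAGES 5	/* mirrors INIT_PGT_BUF_SIZE / PAGE_SIZE */

	static unsigned long pgt_buf_end = 100;	/* pretend BRK gave us pfns 100..104 */
	static unsigned long pgt_buf_top = 100 + INIT_PGT_BUF_PAGES;

	static unsigned long min_pfn_mapped = 900;	/* already-mapped window */
	static unsigned long max_pfn_mapped = 1000;
	static unsigned long next_free      = 999;	/* stand-in for memblock search */

	static unsigned long alloc_low_pfn(void)
	{
		if (pgt_buf_end + 1 >= pgt_buf_top) {
			/* BRK buffer used up: take a page from mapped RAM */
			if (next_free < min_pfn_mapped)
				return 0;	/* the kernel would panic here */
			return next_free--;	/* "memblock_find_in_range" */
		}
		return pgt_buf_end++;		/* page from the BRK buffer */
	}

	int main(void)
	{
		printf("mapped window: [%lu, %lu)\n", min_pfn_mapped, max_pfn_mapped);
		for (int i = 0; i < 8; i++)
			printf("page-table page %d -> pfn %lu\n", i, alloc_low_pfn());
		return 0;
	}

Note that the fallback kicks in while one page is still left in the BRK
buffer, matching the (pgt_buf_end + 1) >= pgt_buf_top check in the hunks
below.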
Introduce min_pfn_mapped to make sure new page-table pages are only taken
from already-mapped ranges; it is updated as lower ranges get mapped.
Also add step_size to make sure we don't try to map too big a range with
the limited mapped pages available initially, and increase step_size once
we have more mapped pages on hand.
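
For illustration only (not part of the patch), a stand-alone C sketch of
how the top-down walk and step_size growth interact; the 3 GiB RAM size is
made up, while PMD_SIZE, STEP_SIZE_SHIFT and the 1 MiB ISA boundary mirror
the values used in the patch:

	/* Stand-alone illustration, not kernel code. */
	#include <stdio.h>

	#define MB		(1UL << 20)
	#define PMD_SIZE	(2 * MB)
	#define STEP_SIZE_SHIFT	5	/* (PUD_SHIFT - PMD_SHIFT) / 2 */
	#define ISA_END		MB	/* stand-in for ISA_END_ADDRESS */

	int main(void)
	{
		unsigned long end = 3072 * MB;	/* pretend 3 GiB of RAM */
		unsigned long step_size = PMD_SIZE;
		unsigned long last_start = end, start;
		unsigned long mapped = 0, new_mapped;

		while (last_start > ISA_END) {
			if (last_start > step_size) {
				/* round_down(last_start - 1, step_size) */
				start = (last_start - 1) & ~(step_size - 1);
				if (start < ISA_END)
					start = ISA_END;
			} else {
				start = ISA_END;
			}
			new_mapped = last_start - start;
			printf("map [%6lu MB, %6lu MB)\n", start / MB, last_start / MB);
			last_start = start;
			/* only grow the step after a bigger chunk got mapped */
			if (new_mapped > mapped)
				step_size <<= STEP_SIZE_SHIFT;
			mapped += new_mapped;
		}
		return 0;
	}

Run on the pretend 3 GiB machine it maps 2 MiB at the very top, then 62 MiB,
then progressively larger chunks working downwards to the 1 MiB boundary.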
We don't need to call pagetable_reserve() anymore; the reserve work is done
in alloc_low_page() directly.
At last we can get rid of the early page-table size calculation and
find_early_table_space() related code.
-v2: update to apply after the fix_xen change;
     also use a macro for the initial pgt_buf size and add a comment with it.
-v3: skip the big reserved range in memblock.reserved near the end of RAM.
-v4: the fix_xen change is not needed now.
-v5: add a changelog note about moving the pagetable reserving into alloc_low_page().
Suggested-by: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/1353123563-3103-22-git-send-email-yinghai@kernel.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/page_types.h |   1
-rw-r--r--  arch/x86/include/asm/pgtable.h    |   1
-rw-r--r--  arch/x86/kernel/setup.c           |   3
-rw-r--r--  arch/x86/mm/init.c                | 210
-rw-r--r--  arch/x86/mm/init_32.c             |  17
-rw-r--r--  arch/x86/mm/init_64.c             |  17
6 files changed, 94 insertions, 155 deletions
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 54c97879195e..9f6f3e66e84d 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -45,6 +45,7 @@ extern int devmem_is_allowed(unsigned long pagenr);
 
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
+extern unsigned long min_pfn_mapped;
 
 static inline phys_addr_t get_max_mapped(void)
 {
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index dd1a88832d25..6991a3e1bf81 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -603,6 +603,7 @@ static inline int pgd_none(pgd_t pgd)
 
 extern int direct_gbpages;
 void init_mem_mapping(void);
+void early_alloc_pgt_buf(void);
 
 /* local pte updates need not use xchg for locking */
 static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 94f922a73c54..f7634092931b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -124,6 +124,7 @@
  */
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
+unsigned long min_pfn_mapped;
 
 #ifdef CONFIG_DMI
 RESERVE_BRK(dmi_alloc, 65536);
@@ -900,6 +901,8 @@ void __init setup_arch(char **cmdline_p)
 
 	reserve_ibft_region();
 
+	early_alloc_pgt_buf();
+
 	/*
 	 * Need to conclude brk, before memblock_x86_fill()
 	 * it could use memblock_find_in_range, could overlap with
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c688ea3887f2..2393d0099e7f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -21,6 +21,21 @@ unsigned long __initdata pgt_buf_start;
 unsigned long __meminitdata pgt_buf_end;
 unsigned long __meminitdata pgt_buf_top;
 
+/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
+#define INIT_PGT_BUF_SIZE	(5 * PAGE_SIZE)
+RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
+void __init early_alloc_pgt_buf(void)
+{
+	unsigned long tables = INIT_PGT_BUF_SIZE;
+	phys_addr_t base;
+
+	base = __pa(extend_brk(tables, PAGE_SIZE));
+
+	pgt_buf_start = base >> PAGE_SHIFT;
+	pgt_buf_end = pgt_buf_start;
+	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
+}
+
 int after_bootmem;
 
 int direct_gbpages
@@ -228,105 +243,6 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range,
 	return nr_range;
 }
 
-/*
- * First calculate space needed for kernel direct mapping page tables to cover
- * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB
- * pages. Then find enough contiguous space for those page tables.
- */
-static unsigned long __init calculate_table_space_size(unsigned long start, unsigned long end)
-{
-	int i;
-	unsigned long puds = 0, pmds = 0, ptes = 0, tables;
-	struct map_range mr[NR_RANGE_MR];
-	int nr_range;
-
-	memset(mr, 0, sizeof(mr));
-	nr_range = 0;
-	nr_range = split_mem_range(mr, nr_range, start, end);
-
-	for (i = 0; i < nr_range; i++) {
-		unsigned long range, extra;
-
-		range = mr[i].end - mr[i].start;
-		puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
-
-		if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
-			extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
-			pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
-		} else {
-			pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
-		}
-
-		if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
-			extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
-#ifdef CONFIG_X86_32
-			extra += PMD_SIZE;
-#endif
-			ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		} else {
-			ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		}
-	}
-
-	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
-#ifdef CONFIG_X86_32
-	/* for fixmap */
-	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
-
-	return tables;
-}
-
-static unsigned long __init calculate_all_table_space_size(void)
-{
-	unsigned long start_pfn, end_pfn;
-	unsigned long tables;
-	int i;
-
-	/* the ISA range is always mapped regardless of memory holes */
-	tables = calculate_table_space_size(0, ISA_END_ADDRESS);
-
-	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
-		u64 start = start_pfn << PAGE_SHIFT;
-		u64 end = end_pfn << PAGE_SHIFT;
-
-		if (end <= ISA_END_ADDRESS)
-			continue;
-
-		if (start < ISA_END_ADDRESS)
-			start = ISA_END_ADDRESS;
-#ifdef CONFIG_X86_32
-		/* on 32 bit, we only map up to max_low_pfn */
-		if ((start >> PAGE_SHIFT) >= max_low_pfn)
-			continue;
-
-		if ((end >> PAGE_SHIFT) > max_low_pfn)
-			end = max_low_pfn << PAGE_SHIFT;
-#endif
-		tables += calculate_table_space_size(start, end);
-	}
-
-	return tables;
-}
-
-static void __init find_early_table_space(unsigned long start,
-					  unsigned long good_end,
-					  unsigned long tables)
-{
-	phys_addr_t base;
-
-	base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
-	if (!base)
-		panic("Cannot find space for the kernel page tables");
-
-	pgt_buf_start = base >> PAGE_SHIFT;
-	pgt_buf_end = pgt_buf_start;
-	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
-}
-
 static struct range pfn_mapped[E820_X_MAX];
 static int nr_pfn_mapped;
 
@@ -391,17 +307,14 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 }
 
 /*
- * Iterate through E820 memory map and create direct mappings for only E820_RAM
- * regions. We cannot simply create direct mappings for all pfns from
- * [0 to max_low_pfn) and [4GB to max_pfn) because of possible memory holes in
- * high addresses that cannot be marked as UC by fixed/variable range MTRRs.
- * Depending on the alignment of E820 ranges, this may possibly result in using
- * smaller size (i.e. 4K instead of 2M or 1G) page tables.
+ * would have hole in the middle or ends, and only ram parts will be mapped.
  */
-static void __init init_range_memory_mapping(unsigned long range_start,
+static unsigned long __init init_range_memory_mapping(
+					   unsigned long range_start,
 					   unsigned long range_end)
 {
 	unsigned long start_pfn, end_pfn;
+	unsigned long mapped_ram_size = 0;
 	int i;
 
 	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
@@ -421,71 +334,70 @@ static void __init init_range_memory_mapping(unsigned long range_start,
 			end = range_end;
 
 		init_memory_mapping(start, end);
+
+		mapped_ram_size += end - start;
 	}
+
+	return mapped_ram_size;
 }
 
+/* (PUD_SHIFT-PMD_SHIFT)/2 */
+#define STEP_SIZE_SHIFT 5
 void __init init_mem_mapping(void)
 {
-	unsigned long tables, good_end, end;
+	unsigned long end, real_end, start, last_start;
+	unsigned long step_size;
+	unsigned long addr;
+	unsigned long mapped_ram_size = 0;
+	unsigned long new_mapped_ram_size;
 
 	probe_page_size_mask();
 
-	/*
-	 * Find space for the kernel direct mapping tables.
-	 *
-	 * Later we should allocate these tables in the local node of the
-	 * memory mapped. Unfortunately this is done currently before the
-	 * nodes are discovered.
-	 */
 #ifdef CONFIG_X86_64
 	end = max_pfn << PAGE_SHIFT;
-	good_end = end;
 #else
 	end = max_low_pfn << PAGE_SHIFT;
-	good_end = max_pfn_mapped << PAGE_SHIFT;
 #endif
-	tables = calculate_all_table_space_size();
-	find_early_table_space(0, good_end, tables);
-	printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] prealloc\n",
-		end - 1, pgt_buf_start << PAGE_SHIFT,
-		(pgt_buf_top << PAGE_SHIFT) - 1);
 
-	max_pfn_mapped = 0; /* will get exact value next */
 	/* the ISA range is always mapped regardless of memory holes */
 	init_memory_mapping(0, ISA_END_ADDRESS);
-	init_range_memory_mapping(ISA_END_ADDRESS, end);
+
+	/* xen has big range in reserved near end of ram, skip it at first */
+	addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE,
+			 PAGE_SIZE);
+	real_end = addr + PMD_SIZE;
+
+	/* step_size need to be small so pgt_buf from BRK could cover it */
+	step_size = PMD_SIZE;
+	max_pfn_mapped = 0; /* will get exact value next */
+	min_pfn_mapped = real_end >> PAGE_SHIFT;
+	last_start = start = real_end;
+	while (last_start > ISA_END_ADDRESS) {
+		if (last_start > step_size) {
+			start = round_down(last_start - 1, step_size);
+			if (start < ISA_END_ADDRESS)
+				start = ISA_END_ADDRESS;
+		} else
+			start = ISA_END_ADDRESS;
+		new_mapped_ram_size = init_range_memory_mapping(start,
+							last_start);
+		last_start = start;
+		min_pfn_mapped = last_start >> PAGE_SHIFT;
+		/* only increase step_size after big range get mapped */
+		if (new_mapped_ram_size > mapped_ram_size)
+			step_size <<= STEP_SIZE_SHIFT;
+		mapped_ram_size += new_mapped_ram_size;
+	}
+
+	if (real_end < end)
+		init_range_memory_mapping(real_end, end);
+
 #ifdef CONFIG_X86_64
 	if (max_pfn > max_low_pfn) {
 		/* can we preseve max_low_pfn ?*/
 		max_low_pfn = max_pfn;
 	}
 #endif
-	/*
-	 * Reserve the kernel pagetable pages we used (pgt_buf_start -
-	 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
-	 * so that they can be reused for other purposes.
-	 *
-	 * On native it just means calling memblock_reserve, on Xen it also
-	 * means marking RW the pagetable pages that we allocated before
-	 * but that haven't been used.
-	 *
-	 * In fact on xen we mark RO the whole range pgt_buf_start -
-	 * pgt_buf_top, because we have to make sure that when
-	 * init_memory_mapping reaches the pagetable pages area, it maps
-	 * RO all the pagetable pages, including the ones that are beyond
-	 * pgt_buf_end at that time.
-	 */
-	if (pgt_buf_end > pgt_buf_start) {
-		printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] final\n",
-			end - 1, pgt_buf_start << PAGE_SHIFT,
-			(pgt_buf_end << PAGE_SHIFT) - 1);
-		x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
-				PFN_PHYS(pgt_buf_end));
-	}
-
-	/* stop the wrong using */
-	pgt_buf_top = 0;
-
 	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
 }
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 27f7fc69cf8a..7bb11064a9e1 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -61,11 +61,22 @@ bool __read_mostly __vmalloc_start_set = false;
 
 static __init void *alloc_low_page(void)
 {
-	unsigned long pfn = pgt_buf_end++;
+	unsigned long pfn;
 	void *adr;
 
-	if (pfn >= pgt_buf_top)
-		panic("alloc_low_page: ran out of memory");
+	if ((pgt_buf_end + 1) >= pgt_buf_top) {
+		unsigned long ret;
+		if (min_pfn_mapped >= max_pfn_mapped)
+			panic("alloc_low_page: ran out of memory");
+		ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+					max_pfn_mapped << PAGE_SHIFT,
+					PAGE_SIZE, PAGE_SIZE);
+		if (!ret)
+			panic("alloc_low_page: can not alloc memory");
+		memblock_reserve(ret, PAGE_SIZE);
+		pfn = ret >> PAGE_SHIFT;
+	} else
+		pfn = pgt_buf_end++;
 
 	adr = __va(pfn * PAGE_SIZE);
 	clear_page(adr);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index fa28e3e29741..eefaea637b3d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -316,7 +316,7 @@ void __init cleanup_highmap(void)
 
 static __ref void *alloc_low_page(unsigned long *phys)
 {
-	unsigned long pfn = pgt_buf_end++;
+	unsigned long pfn;
 	void *adr;
 
 	if (after_bootmem) {
@@ -326,8 +326,19 @@ static __ref void *alloc_low_page(unsigned long *phys)
 		return adr;
 	}
 
-	if (pfn >= pgt_buf_top)
-		panic("alloc_low_page: ran out of memory");
+	if ((pgt_buf_end + 1) >= pgt_buf_top) {
+		unsigned long ret;
+		if (min_pfn_mapped >= max_pfn_mapped)
+			panic("alloc_low_page: ran out of memory");
+		ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+					max_pfn_mapped << PAGE_SHIFT,
+					PAGE_SIZE, PAGE_SIZE);
+		if (!ret)
+			panic("alloc_low_page: can not alloc memory");
+		memblock_reserve(ret, PAGE_SIZE);
+		pfn = ret >> PAGE_SHIFT;
+	} else
+		pfn = pgt_buf_end++;
 
 	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
 	clear_page(adr);