author     Yinghai Lu <yinghai@kernel.org>        2012-11-16 22:38:58 -0500
committer  H. Peter Anvin <hpa@linux.intel.com>   2012-11-17 14:59:19 -0500
commit     8d57470d8f859635deffe3919d7d4867b488b85a (patch)
tree       8576bb911f3f671952a6d2310b787e1ed55e4456 /arch/x86
parent     f763ad1d3870abb811ec7520b4c1adc56471a3a4 (diff)
x86, mm: setup page table in top-down
Get pgt_buf early from the BRK, and use it to map PMD_SIZE from the top at first. Then use the already-mapped pages to map more ranges below, and keep looping until all pages get mapped.

alloc_low_page() will use pages from the BRK at first; after that buffer is used up, it will use memblock to find and reserve pages for page table usage.

Introduce min_pfn_mapped to make sure new pages are only taken from already-mapped ranges; it is updated as lower pages get mapped.

Also add step_size to make sure we don't try to map too big a range while only a limited number of mapped pages is available initially, and increase step_size as more mapped pages are on hand.

We don't need to call pagetable_reserve() anymore; the reserve work is done directly in alloc_low_page().

At last we can get rid of the code that calculated and searched for the early pgt buffer.

-v2: update to after the fix_xen change; also use a macro for the initial pgt_buf size and add comments for it.
-v3: skip the big reserved range in memblock.reserved near the end.
-v4: the fix_xen change is not needed now.
-v5: add a changelog note about moving the pagetable reservation into alloc_low_page().

Suggested-by: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/1353123563-3103-22-git-send-email-yinghai@kernel.org
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
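(For orientation, the stepping logic this changelog describes can be sketched in plain userspace C. Everything below is illustrative only: the constants mirror the kernel's ISA_END_ADDRESS, PMD_SIZE and STEP_SIZE_SHIFT values, round_down_to() and map_range() are made-up stand-ins for round_down() and init_range_memory_mapping(), and the 4 GiB real_end is an assumed example; none of this is the patch itself.)

/*
 * Illustrative sketch only -- not the kernel implementation.  It models the
 * top-down stepping loop of init_mem_mapping(): map one step_size-bounded
 * chunk from the top first, then step downward, growing step_size once a
 * larger chunk has been mapped.  Assumes a 64-bit unsigned long.
 */
#include <stdio.h>

#define ISA_END_ADDRESS		0x100000UL	/* 1 MiB */
#define PMD_SIZE		(2UL << 20)	/* 2 MiB */
#define STEP_SIZE_SHIFT		5		/* (PUD_SHIFT-PMD_SHIFT)/2 */

static unsigned long round_down_to(unsigned long x, unsigned long align)
{
	return x & ~(align - 1);	/* align is always a power of two here */
}

/* Stand-in for init_range_memory_mapping(): report the chunk and pretend
 * all of it was usable RAM. */
static unsigned long map_range(unsigned long start, unsigned long end)
{
	printf("map [mem %#018lx-%#018lx]\n", start, end - 1);
	return end - start;
}

int main(void)
{
	unsigned long real_end = 0x100000000UL;	/* assume 4 GiB of RAM */
	unsigned long step_size = PMD_SIZE;
	unsigned long mapped_ram_size = 0, new_mapped_ram_size;
	unsigned long last_start = real_end, start;

	while (last_start > ISA_END_ADDRESS) {
		if (last_start > step_size) {
			start = round_down_to(last_start - 1, step_size);
			if (start < ISA_END_ADDRESS)
				start = ISA_END_ADDRESS;
		} else
			start = ISA_END_ADDRESS;
		new_mapped_ram_size = map_range(start, last_start);
		last_start = start;
		/* only grow step_size after a bigger chunk has been mapped */
		if (new_mapped_ram_size > mapped_ram_size)
			step_size <<= STEP_SIZE_SHIFT;
		mapped_ram_size += new_mapped_ram_size;
	}
	return 0;
}

Run against the assumed 4 GiB layout, this maps a 2 MiB chunk at the very top, then roughly 62 MiB, then about 2 GiB, and finally everything down to 1 MiB, mirroring how the kernel keeps each step small enough that the already-mapped pages (plus the initial BRK buffer) can always supply the page tables for the next, lower chunk.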
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/page_types.h |   1
-rw-r--r--  arch/x86/include/asm/pgtable.h    |   1
-rw-r--r--  arch/x86/kernel/setup.c           |   3
-rw-r--r--  arch/x86/mm/init.c                | 210
-rw-r--r--  arch/x86/mm/init_32.c             |  17
-rw-r--r--  arch/x86/mm/init_64.c             |  17
6 files changed, 94 insertions, 155 deletions
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 54c97879195e..9f6f3e66e84d 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -45,6 +45,7 @@ extern int devmem_is_allowed(unsigned long pagenr);
 
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
+extern unsigned long min_pfn_mapped;
 
 static inline phys_addr_t get_max_mapped(void)
 {
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index dd1a88832d25..6991a3e1bf81 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -603,6 +603,7 @@ static inline int pgd_none(pgd_t pgd)
 
 extern int direct_gbpages;
 void init_mem_mapping(void);
+void early_alloc_pgt_buf(void);
 
 /* local pte updates need not use xchg for locking */
 static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 94f922a73c54..f7634092931b 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -124,6 +124,7 @@
  */
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
+unsigned long min_pfn_mapped;
 
 #ifdef CONFIG_DMI
 RESERVE_BRK(dmi_alloc, 65536);
@@ -900,6 +901,8 @@ void __init setup_arch(char **cmdline_p)
 
 	reserve_ibft_region();
 
+	early_alloc_pgt_buf();
+
 	/*
 	 * Need to conclude brk, before memblock_x86_fill()
 	 * it could use memblock_find_in_range, could overlap with
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c688ea3887f2..2393d0099e7f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -21,6 +21,21 @@ unsigned long __initdata pgt_buf_start;
 unsigned long __meminitdata pgt_buf_end;
 unsigned long __meminitdata pgt_buf_top;
 
+/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
+#define INIT_PGT_BUF_SIZE	(5 * PAGE_SIZE)
+RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
+void __init early_alloc_pgt_buf(void)
+{
+	unsigned long tables = INIT_PGT_BUF_SIZE;
+	phys_addr_t base;
+
+	base = __pa(extend_brk(tables, PAGE_SIZE));
+
+	pgt_buf_start = base >> PAGE_SHIFT;
+	pgt_buf_end = pgt_buf_start;
+	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
+}
+
 int after_bootmem;
 
 int direct_gbpages
@@ -228,105 +243,6 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range,
 	return nr_range;
 }
 
-/*
- * First calculate space needed for kernel direct mapping page tables to cover
- * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB
- * pages. Then find enough contiguous space for those page tables.
- */
-static unsigned long __init calculate_table_space_size(unsigned long start, unsigned long end)
-{
-	int i;
-	unsigned long puds = 0, pmds = 0, ptes = 0, tables;
-	struct map_range mr[NR_RANGE_MR];
-	int nr_range;
-
-	memset(mr, 0, sizeof(mr));
-	nr_range = 0;
-	nr_range = split_mem_range(mr, nr_range, start, end);
-
-	for (i = 0; i < nr_range; i++) {
-		unsigned long range, extra;
-
-		range = mr[i].end - mr[i].start;
-		puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
-
-		if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
-			extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
-			pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
-		} else {
-			pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
-		}
-
-		if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
-			extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
-#ifdef CONFIG_X86_32
-			extra += PMD_SIZE;
-#endif
-			ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		} else {
-			ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		}
-	}
-
-	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
-#ifdef CONFIG_X86_32
-	/* for fixmap */
-	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
-
-	return tables;
-}
-
-static unsigned long __init calculate_all_table_space_size(void)
-{
-	unsigned long start_pfn, end_pfn;
-	unsigned long tables;
-	int i;
-
-	/* the ISA range is always mapped regardless of memory holes */
-	tables = calculate_table_space_size(0, ISA_END_ADDRESS);
-
-	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
-		u64 start = start_pfn << PAGE_SHIFT;
-		u64 end = end_pfn << PAGE_SHIFT;
-
-		if (end <= ISA_END_ADDRESS)
-			continue;
-
-		if (start < ISA_END_ADDRESS)
-			start = ISA_END_ADDRESS;
-#ifdef CONFIG_X86_32
-		/* on 32 bit, we only map up to max_low_pfn */
-		if ((start >> PAGE_SHIFT) >= max_low_pfn)
-			continue;
-
-		if ((end >> PAGE_SHIFT) > max_low_pfn)
-			end = max_low_pfn << PAGE_SHIFT;
-#endif
-		tables += calculate_table_space_size(start, end);
-	}
-
-	return tables;
-}
-
-static void __init find_early_table_space(unsigned long start,
-					  unsigned long good_end,
-					  unsigned long tables)
-{
-	phys_addr_t base;
-
-	base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
-	if (!base)
-		panic("Cannot find space for the kernel page tables");
-
-	pgt_buf_start = base >> PAGE_SHIFT;
-	pgt_buf_end = pgt_buf_start;
-	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
-}
-
 static struct range pfn_mapped[E820_X_MAX];
 static int nr_pfn_mapped;
 
@@ -391,17 +307,14 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 }
 
 /*
- * Iterate through E820 memory map and create direct mappings for only E820_RAM
- * regions. We cannot simply create direct mappings for all pfns from
- * [0 to max_low_pfn) and [4GB to max_pfn) because of possible memory holes in
- * high addresses that cannot be marked as UC by fixed/variable range MTRRs.
- * Depending on the alignment of E820 ranges, this may possibly result in using
- * smaller size (i.e. 4K instead of 2M or 1G) page tables.
+ * would have hole in the middle or ends, and only ram parts will be mapped.
  */
-static void __init init_range_memory_mapping(unsigned long range_start,
+static unsigned long __init init_range_memory_mapping(
+					   unsigned long range_start,
 					   unsigned long range_end)
 {
 	unsigned long start_pfn, end_pfn;
+	unsigned long mapped_ram_size = 0;
 	int i;
 
 	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
@@ -421,71 +334,70 @@ static void __init init_range_memory_mapping(unsigned long range_start,
 			end = range_end;
 
 		init_memory_mapping(start, end);
+
+		mapped_ram_size += end - start;
 	}
+
+	return mapped_ram_size;
 }
 
+/* (PUD_SHIFT-PMD_SHIFT)/2 */
+#define STEP_SIZE_SHIFT 5
 void __init init_mem_mapping(void)
 {
-	unsigned long tables, good_end, end;
+	unsigned long end, real_end, start, last_start;
+	unsigned long step_size;
+	unsigned long addr;
+	unsigned long mapped_ram_size = 0;
+	unsigned long new_mapped_ram_size;
 
 	probe_page_size_mask();
 
-	/*
-	 * Find space for the kernel direct mapping tables.
-	 *
-	 * Later we should allocate these tables in the local node of the
-	 * memory mapped. Unfortunately this is done currently before the
-	 * nodes are discovered.
-	 */
 #ifdef CONFIG_X86_64
 	end = max_pfn << PAGE_SHIFT;
-	good_end = end;
 #else
 	end = max_low_pfn << PAGE_SHIFT;
-	good_end = max_pfn_mapped << PAGE_SHIFT;
 #endif
-	tables = calculate_all_table_space_size();
-	find_early_table_space(0, good_end, tables);
-	printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] prealloc\n",
-		end - 1, pgt_buf_start << PAGE_SHIFT,
-		(pgt_buf_top << PAGE_SHIFT) - 1);
 
-	max_pfn_mapped = 0; /* will get exact value next */
 	/* the ISA range is always mapped regardless of memory holes */
 	init_memory_mapping(0, ISA_END_ADDRESS);
-	init_range_memory_mapping(ISA_END_ADDRESS, end);
+
+	/* xen has big range in reserved near end of ram, skip it at first */
+	addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE,
+			 PAGE_SIZE);
+	real_end = addr + PMD_SIZE;
+
+	/* step_size need to be small so pgt_buf from BRK could cover it */
+	step_size = PMD_SIZE;
+	max_pfn_mapped = 0; /* will get exact value next */
+	min_pfn_mapped = real_end >> PAGE_SHIFT;
+	last_start = start = real_end;
+	while (last_start > ISA_END_ADDRESS) {
+		if (last_start > step_size) {
+			start = round_down(last_start - 1, step_size);
+			if (start < ISA_END_ADDRESS)
+				start = ISA_END_ADDRESS;
+		} else
+			start = ISA_END_ADDRESS;
+		new_mapped_ram_size = init_range_memory_mapping(start,
+							last_start);
+		last_start = start;
+		min_pfn_mapped = last_start >> PAGE_SHIFT;
+		/* only increase step_size after big range get mapped */
+		if (new_mapped_ram_size > mapped_ram_size)
+			step_size <<= STEP_SIZE_SHIFT;
+		mapped_ram_size += new_mapped_ram_size;
+	}
+
+	if (real_end < end)
+		init_range_memory_mapping(real_end, end);
+
 #ifdef CONFIG_X86_64
 	if (max_pfn > max_low_pfn) {
 		/* can we preseve max_low_pfn ?*/
 		max_low_pfn = max_pfn;
 	}
 #endif
-	/*
-	 * Reserve the kernel pagetable pages we used (pgt_buf_start -
-	 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
-	 * so that they can be reused for other purposes.
-	 *
-	 * On native it just means calling memblock_reserve, on Xen it also
-	 * means marking RW the pagetable pages that we allocated before
-	 * but that haven't been used.
-	 *
-	 * In fact on xen we mark RO the whole range pgt_buf_start -
-	 * pgt_buf_top, because we have to make sure that when
-	 * init_memory_mapping reaches the pagetable pages area, it maps
-	 * RO all the pagetable pages, including the ones that are beyond
-	 * pgt_buf_end at that time.
-	 */
-	if (pgt_buf_end > pgt_buf_start) {
-		printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] final\n",
-			end - 1, pgt_buf_start << PAGE_SHIFT,
-			(pgt_buf_end << PAGE_SHIFT) - 1);
-		x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
-				PFN_PHYS(pgt_buf_end));
-	}
-
-	/* stop the wrong using */
-	pgt_buf_top = 0;
-
 	early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
 }
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 27f7fc69cf8a..7bb11064a9e1 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -61,11 +61,22 @@ bool __read_mostly __vmalloc_start_set = false;
 
 static __init void *alloc_low_page(void)
 {
-	unsigned long pfn = pgt_buf_end++;
+	unsigned long pfn;
 	void *adr;
 
-	if (pfn >= pgt_buf_top)
-		panic("alloc_low_page: ran out of memory");
+	if ((pgt_buf_end + 1) >= pgt_buf_top) {
+		unsigned long ret;
+		if (min_pfn_mapped >= max_pfn_mapped)
+			panic("alloc_low_page: ran out of memory");
+		ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+					max_pfn_mapped << PAGE_SHIFT,
+					PAGE_SIZE, PAGE_SIZE);
+		if (!ret)
+			panic("alloc_low_page: can not alloc memory");
+		memblock_reserve(ret, PAGE_SIZE);
+		pfn = ret >> PAGE_SHIFT;
+	} else
+		pfn = pgt_buf_end++;
 
 	adr = __va(pfn * PAGE_SIZE);
 	clear_page(adr);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index fa28e3e29741..eefaea637b3d 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -316,7 +316,7 @@ void __init cleanup_highmap(void)
 
 static __ref void *alloc_low_page(unsigned long *phys)
 {
-	unsigned long pfn = pgt_buf_end++;
+	unsigned long pfn;
 	void *adr;
 
 	if (after_bootmem) {
@@ -326,8 +326,19 @@ static __ref void *alloc_low_page(unsigned long *phys)
 		return adr;
 	}
 
-	if (pfn >= pgt_buf_top)
-		panic("alloc_low_page: ran out of memory");
+	if ((pgt_buf_end + 1) >= pgt_buf_top) {
+		unsigned long ret;
+		if (min_pfn_mapped >= max_pfn_mapped)
+			panic("alloc_low_page: ran out of memory");
+		ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+					max_pfn_mapped << PAGE_SHIFT,
+					PAGE_SIZE, PAGE_SIZE);
+		if (!ret)
+			panic("alloc_low_page: can not alloc memory");
+		memblock_reserve(ret, PAGE_SIZE);
+		pfn = ret >> PAGE_SHIFT;
+	} else
+		pfn = pgt_buf_end++;
 
 	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
 	clear_page(adr);
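(The reworked alloc_low_page() in the two hunks above follows one policy on both 32-bit and 64-bit: hand out pages from the small BRK-backed pgt_buf while it lasts, then fall back to memory inside the already-mapped pfn window. The sketch below models only that decision in userspace; PGT_BUF_PAGES, the window bounds and reserve_page_in_window() are made-up example values and a stand-in for memblock_find_in_range() plus memblock_reserve(), not the kernel code.)

/* Illustrative sketch only -- models the alloc_low_page() fallback policy. */
#include <stdbool.h>
#include <stdio.h>

#define PGT_BUF_PAGES	5UL	/* mirrors INIT_PGT_BUF_SIZE / PAGE_SIZE */

static unsigned long pgt_buf_end;			/* next free BRK page */
static unsigned long pgt_buf_top = PGT_BUF_PAGES;	/* end of BRK buffer */
static unsigned long min_pfn_mapped = 0x1000;		/* assumed example window */
static unsigned long max_pfn_mapped = 0x2000;
static unsigned long window_cursor;

/* Stand-in for memblock_find_in_range() + memblock_reserve(): hand out the
 * next unused pfn inside the already-mapped window. */
static bool reserve_page_in_window(unsigned long *pfn)
{
	if (min_pfn_mapped + window_cursor >= max_pfn_mapped)
		return false;
	*pfn = min_pfn_mapped + window_cursor++;
	return true;
}

static unsigned long alloc_pgt_page(void)
{
	unsigned long pfn;

	if ((pgt_buf_end + 1) >= pgt_buf_top) {
		/* BRK buffer exhausted: take a page that is already mapped */
		if (!reserve_page_in_window(&pfn)) {
			fprintf(stderr, "alloc_pgt_page: out of pages\n");
			return -1UL;
		}
	} else {
		pfn = pgt_buf_end++;	/* cheap path: BRK-backed buffer */
	}
	return pfn;
}

int main(void)
{
	for (int i = 0; i < 8; i++)
		printf("page-table page at pfn %#lx\n", alloc_pgt_page());
	return 0;
}

The first few calls come from the BRK buffer, the rest from the mapped window, which is exactly why min_pfn_mapped must already be up to date before a lower range is mapped.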