diff options
author     Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
committer  Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
commit     c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree       ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /mm/page_alloc.c
parent     ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent     6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts:
litmus/sched_cedf.c
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  629
1 file changed, 433 insertions, 196 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f12ad1836abe..4e8985acdab8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/jiffies.h> | 22 | #include <linux/jiffies.h> |
23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
24 | #include <linux/memblock.h> | ||
24 | #include <linux/compiler.h> | 25 | #include <linux/compiler.h> |
25 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
26 | #include <linux/kmemcheck.h> | 27 | #include <linux/kmemcheck.h> |
@@ -29,6 +30,7 @@ | |||
29 | #include <linux/pagevec.h> | 30 | #include <linux/pagevec.h> |
30 | #include <linux/blkdev.h> | 31 | #include <linux/blkdev.h> |
31 | #include <linux/slab.h> | 32 | #include <linux/slab.h> |
33 | #include <linux/ratelimit.h> | ||
32 | #include <linux/oom.h> | 34 | #include <linux/oom.h> |
33 | #include <linux/notifier.h> | 35 | #include <linux/notifier.h> |
34 | #include <linux/topology.h> | 36 | #include <linux/topology.h> |
@@ -38,6 +40,7 @@ | |||
38 | #include <linux/memory_hotplug.h> | 40 | #include <linux/memory_hotplug.h> |
39 | #include <linux/nodemask.h> | 41 | #include <linux/nodemask.h> |
40 | #include <linux/vmalloc.h> | 42 | #include <linux/vmalloc.h> |
43 | #include <linux/vmstat.h> | ||
41 | #include <linux/mempolicy.h> | 44 | #include <linux/mempolicy.h> |
42 | #include <linux/stop_machine.h> | 45 | #include <linux/stop_machine.h> |
43 | #include <linux/sort.h> | 46 | #include <linux/sort.h> |
@@ -52,6 +55,8 @@ | |||
52 | #include <linux/compaction.h> | 55 | #include <linux/compaction.h> |
53 | #include <trace/events/kmem.h> | 56 | #include <trace/events/kmem.h> |
54 | #include <linux/ftrace_event.h> | 57 | #include <linux/ftrace_event.h> |
58 | #include <linux/memcontrol.h> | ||
59 | #include <linux/prefetch.h> | ||
55 | 60 | ||
56 | #include <asm/tlbflush.h> | 61 | #include <asm/tlbflush.h> |
57 | #include <asm/div64.h> | 62 | #include <asm/div64.h> |
@@ -103,19 +108,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | |||
103 | * only be modified with pm_mutex held, unless the suspend/hibernate code is | 108 | * only be modified with pm_mutex held, unless the suspend/hibernate code is |
104 | * guaranteed not to run in parallel with that modification). | 109 | * guaranteed not to run in parallel with that modification). |
105 | */ | 110 | */ |
106 | void set_gfp_allowed_mask(gfp_t mask) | 111 | |
112 | static gfp_t saved_gfp_mask; | ||
113 | |||
114 | void pm_restore_gfp_mask(void) | ||
107 | { | 115 | { |
108 | WARN_ON(!mutex_is_locked(&pm_mutex)); | 116 | WARN_ON(!mutex_is_locked(&pm_mutex)); |
109 | gfp_allowed_mask = mask; | 117 | if (saved_gfp_mask) { |
118 | gfp_allowed_mask = saved_gfp_mask; | ||
119 | saved_gfp_mask = 0; | ||
120 | } | ||
110 | } | 121 | } |
111 | 122 | ||
112 | gfp_t clear_gfp_allowed_mask(gfp_t mask) | 123 | void pm_restrict_gfp_mask(void) |
113 | { | 124 | { |
114 | gfp_t ret = gfp_allowed_mask; | ||
115 | |||
116 | WARN_ON(!mutex_is_locked(&pm_mutex)); | 125 | WARN_ON(!mutex_is_locked(&pm_mutex)); |
117 | gfp_allowed_mask &= ~mask; | 126 | WARN_ON(saved_gfp_mask); |
118 | return ret; | 127 | saved_gfp_mask = gfp_allowed_mask; |
128 | gfp_allowed_mask &= ~GFP_IOFS; | ||
119 | } | 129 | } |
120 | #endif /* CONFIG_PM_SLEEP */ | 130 | #endif /* CONFIG_PM_SLEEP */ |
121 | 131 | ||
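
The hunk above replaces the old set_gfp_allowed_mask()/clear_gfp_allowed_mask() pair with an explicit save/restore protocol: pm_restrict_gfp_mask() stashes the current mask and clears the I/O and FS bits, and pm_restore_gfp_mask() only restores if something was actually saved. A stand-alone sketch of that protocol (not kernel code; the bit values and the boot-mask stand-in below are invented for illustration):

    /*
     * Stand-alone illustration (not kernel code) of the save/restore pattern
     * above. The bit values and the boot mask are invented for the example.
     */
    #include <assert.h>
    #include <stdio.h>

    #define GFP_IO   0x40u
    #define GFP_FS   0x80u
    #define GFP_IOFS (GFP_IO | GFP_FS)

    static unsigned int gfp_allowed_mask = 0xffu;   /* stand-in for GFP_BOOT_MASK */
    static unsigned int saved_gfp_mask;

    static void pm_restrict_gfp_mask(void)
    {
            assert(!saved_gfp_mask);                /* mirrors WARN_ON(saved_gfp_mask) */
            saved_gfp_mask = gfp_allowed_mask;
            gfp_allowed_mask &= ~GFP_IOFS;
    }

    static void pm_restore_gfp_mask(void)
    {
            if (saved_gfp_mask) {                   /* a no-op unless restrict ran first */
                    gfp_allowed_mask = saved_gfp_mask;
                    saved_gfp_mask = 0;
            }
    }

    int main(void)
    {
            pm_restrict_gfp_mask();
            printf("restricted: %#x\n", gfp_allowed_mask);  /* 0x3f */
            pm_restore_gfp_mask();
            printf("restored:   %#x\n", gfp_allowed_mask);  /* 0xff */
            return 0;
    }
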
@@ -280,7 +290,7 @@ static void bad_page(struct page *page) | |||
280 | 290 | ||
281 | /* Don't complain about poisoned pages */ | 291 | /* Don't complain about poisoned pages */ |
282 | if (PageHWPoison(page)) { | 292 | if (PageHWPoison(page)) { |
283 | __ClearPageBuddy(page); | 293 | reset_page_mapcount(page); /* remove PageBuddy */ |
284 | return; | 294 | return; |
285 | } | 295 | } |
286 | 296 | ||
@@ -311,7 +321,7 @@ static void bad_page(struct page *page) | |||
311 | dump_stack(); | 321 | dump_stack(); |
312 | out: | 322 | out: |
313 | /* Leave bad fields for debug, except PageBuddy could make trouble */ | 323 | /* Leave bad fields for debug, except PageBuddy could make trouble */ |
314 | __ClearPageBuddy(page); | 324 | reset_page_mapcount(page); /* remove PageBuddy */ |
315 | add_taint(TAINT_BAD_PAGE); | 325 | add_taint(TAINT_BAD_PAGE); |
316 | } | 326 | } |
317 | 327 | ||
@@ -351,6 +361,7 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
351 | } | 361 | } |
352 | } | 362 | } |
353 | 363 | ||
364 | /* update __split_huge_page_refcount if you change this function */ | ||
354 | static int destroy_compound_page(struct page *page, unsigned long order) | 365 | static int destroy_compound_page(struct page *page, unsigned long order) |
355 | { | 366 | { |
356 | int i; | 367 | int i; |
@@ -420,18 +431,10 @@ static inline void rmv_page_order(struct page *page) | |||
420 | * | 431 | * |
421 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER | 432 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER |
422 | */ | 433 | */ |
423 | static inline struct page * | ||
424 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) | ||
425 | { | ||
426 | unsigned long buddy_idx = page_idx ^ (1 << order); | ||
427 | |||
428 | return page + (buddy_idx - page_idx); | ||
429 | } | ||
430 | |||
431 | static inline unsigned long | 434 | static inline unsigned long |
432 | __find_combined_index(unsigned long page_idx, unsigned int order) | 435 | __find_buddy_index(unsigned long page_idx, unsigned int order) |
433 | { | 436 | { |
434 | return (page_idx & ~(1 << order)); | 437 | return page_idx ^ (1 << order); |
435 | } | 438 | } |
436 | 439 | ||
437 | /* | 440 | /* |
@@ -442,8 +445,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order) | |||
442 | * (c) a page and its buddy have the same order && | 445 | * (c) a page and its buddy have the same order && |
443 | * (d) a page and its buddy are in the same zone. | 446 | * (d) a page and its buddy are in the same zone. |
444 | * | 447 | * |
445 | * For recording whether a page is in the buddy system, we use PG_buddy. | 448 | * For recording whether a page is in the buddy system, we set ->_mapcount -2. |
446 | * Setting, clearing, and testing PG_buddy is serialized by zone->lock. | 449 | * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. |
447 | * | 450 | * |
448 | * For recording page's order, we use page_private(page). | 451 | * For recording page's order, we use page_private(page). |
449 | */ | 452 | */ |
@@ -476,7 +479,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
476 | * as necessary, plus some accounting needed to play nicely with other | 479 | * as necessary, plus some accounting needed to play nicely with other |
477 | * parts of the VM system. | 480 | * parts of the VM system. |
478 | * At each level, we keep a list of pages, which are heads of continuous | 481 | * At each level, we keep a list of pages, which are heads of continuous |
479 | * free pages of length of (1 << order) and marked with PG_buddy. Page's | 482 | * free pages of length of (1 << order) and marked with _mapcount -2. Page's |
480 | * order is recorded in page_private(page) field. | 483 | * order is recorded in page_private(page) field. |
481 | * So when we are allocating or freeing one, we can derive the state of the | 484 | * So when we are allocating or freeing one, we can derive the state of the |
482 | * other. That is, if we allocate a small block, and both were | 485 | * other. That is, if we allocate a small block, and both were |
@@ -493,6 +496,7 @@ static inline void __free_one_page(struct page *page, | |||
493 | { | 496 | { |
494 | unsigned long page_idx; | 497 | unsigned long page_idx; |
495 | unsigned long combined_idx; | 498 | unsigned long combined_idx; |
499 | unsigned long uninitialized_var(buddy_idx); | ||
496 | struct page *buddy; | 500 | struct page *buddy; |
497 | 501 | ||
498 | if (unlikely(PageCompound(page))) | 502 | if (unlikely(PageCompound(page))) |
@@ -507,7 +511,8 @@ static inline void __free_one_page(struct page *page, | |||
507 | VM_BUG_ON(bad_range(zone, page)); | 511 | VM_BUG_ON(bad_range(zone, page)); |
508 | 512 | ||
509 | while (order < MAX_ORDER-1) { | 513 | while (order < MAX_ORDER-1) { |
510 | buddy = __page_find_buddy(page, page_idx, order); | 514 | buddy_idx = __find_buddy_index(page_idx, order); |
515 | buddy = page + (buddy_idx - page_idx); | ||
511 | if (!page_is_buddy(page, buddy, order)) | 516 | if (!page_is_buddy(page, buddy, order)) |
512 | break; | 517 | break; |
513 | 518 | ||
@@ -515,7 +520,7 @@ static inline void __free_one_page(struct page *page, | |||
515 | list_del(&buddy->lru); | 520 | list_del(&buddy->lru); |
516 | zone->free_area[order].nr_free--; | 521 | zone->free_area[order].nr_free--; |
517 | rmv_page_order(buddy); | 522 | rmv_page_order(buddy); |
518 | combined_idx = __find_combined_index(page_idx, order); | 523 | combined_idx = buddy_idx & page_idx; |
519 | page = page + (combined_idx - page_idx); | 524 | page = page + (combined_idx - page_idx); |
520 | page_idx = combined_idx; | 525 | page_idx = combined_idx; |
521 | order++; | 526 | order++; |
@@ -530,11 +535,12 @@ static inline void __free_one_page(struct page *page, | |||
530 | * so it's less likely to be used soon and more likely to be merged | 535 | * so it's less likely to be used soon and more likely to be merged |
531 | * as a higher order page | 536 | * as a higher order page |
532 | */ | 537 | */ |
533 | if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { | 538 | if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { |
534 | struct page *higher_page, *higher_buddy; | 539 | struct page *higher_page, *higher_buddy; |
535 | combined_idx = __find_combined_index(page_idx, order); | 540 | combined_idx = buddy_idx & page_idx; |
536 | higher_page = page + combined_idx - page_idx; | 541 | higher_page = page + (combined_idx - page_idx); |
537 | higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); | 542 | buddy_idx = __find_buddy_index(combined_idx, order + 1); |
543 | higher_buddy = page + (buddy_idx - combined_idx); | ||
538 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { | 544 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { |
539 | list_add_tail(&page->lru, | 545 | list_add_tail(&page->lru, |
540 | &zone->free_area[order].free_list[migratetype]); | 546 | &zone->free_area[order].free_list[migratetype]); |
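
The index arithmetic consolidated in this hunk is the whole trick: a block's buddy differs from it only in bit 'order', so XOR finds the buddy and AND (which clears that bit) gives the start of the merged block, replacing the old __find_combined_index(). A small stand-alone check of those identities (the indices are chosen arbitrarily):

    #include <assert.h>
    #include <stdio.h>

    /* Same arithmetic as __find_buddy_index() and combined_idx above,
     * taken out of the kernel. */
    static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
    {
            return page_idx ^ (1UL << order);
    }

    int main(void)
    {
            unsigned long page_idx = 12;    /* 0b1100: an order-2 aligned block */
            unsigned int order = 2;

            unsigned long buddy_idx = find_buddy_index(page_idx, order);    /* 8 */
            unsigned long combined_idx = buddy_idx & page_idx;              /* 8 */

            /* The merged pair is the order-3 block starting at the lower index,
             * i.e. the index with bit 'order' cleared (the old __find_combined_index). */
            assert(combined_idx == (page_idx & ~(1UL << order)));
            printf("page %lu and buddy %lu merge into an order-%u block at %lu\n",
                   page_idx, buddy_idx, order + 1, combined_idx);
            return 0;
    }
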
@@ -563,7 +569,8 @@ static inline int free_pages_check(struct page *page) | |||
563 | if (unlikely(page_mapcount(page) | | 569 | if (unlikely(page_mapcount(page) | |
564 | (page->mapping != NULL) | | 570 | (page->mapping != NULL) | |
565 | (atomic_read(&page->_count) != 0) | | 571 | (atomic_read(&page->_count) != 0) | |
566 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { | 572 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | |
573 | (mem_cgroup_bad_page_check(page)))) { | ||
567 | bad_page(page); | 574 | bad_page(page); |
568 | return 1; | 575 | return 1; |
569 | } | 576 | } |
@@ -612,6 +619,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
612 | list = &pcp->lists[migratetype]; | 619 | list = &pcp->lists[migratetype]; |
613 | } while (list_empty(list)); | 620 | } while (list_empty(list)); |
614 | 621 | ||
622 | /* This is the only non-empty list. Free them all. */ | ||
623 | if (batch_free == MIGRATE_PCPTYPES) | ||
624 | batch_free = to_free; | ||
625 | |||
615 | do { | 626 | do { |
616 | page = list_entry(list->prev, struct page, lru); | 627 | page = list_entry(list->prev, struct page, lru); |
617 | /* must delete as __free_one_page list manipulates */ | 628 | /* must delete as __free_one_page list manipulates */ |
@@ -645,13 +656,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
645 | trace_mm_page_free_direct(page, order); | 656 | trace_mm_page_free_direct(page, order); |
646 | kmemcheck_free_shadow(page, order); | 657 | kmemcheck_free_shadow(page, order); |
647 | 658 | ||
648 | for (i = 0; i < (1 << order); i++) { | 659 | if (PageAnon(page)) |
649 | struct page *pg = page + i; | 660 | page->mapping = NULL; |
650 | 661 | for (i = 0; i < (1 << order); i++) | |
651 | if (PageAnon(pg)) | 662 | bad += free_pages_check(page + i); |
652 | pg->mapping = NULL; | ||
653 | bad += free_pages_check(pg); | ||
654 | } | ||
655 | if (bad) | 663 | if (bad) |
656 | return false; | 664 | return false; |
657 | 665 | ||
@@ -751,7 +759,8 @@ static inline int check_new_page(struct page *page) | |||
751 | if (unlikely(page_mapcount(page) | | 759 | if (unlikely(page_mapcount(page) | |
752 | (page->mapping != NULL) | | 760 | (page->mapping != NULL) | |
753 | (atomic_read(&page->_count) != 0) | | 761 | (atomic_read(&page->_count) != 0) | |
754 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { | 762 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | |
763 | (mem_cgroup_bad_page_check(page)))) { | ||
755 | bad_page(page); | 764 | bad_page(page); |
756 | return 1; | 765 | return 1; |
757 | } | 766 | } |
@@ -864,9 +873,8 @@ static int move_freepages(struct zone *zone, | |||
864 | } | 873 | } |
865 | 874 | ||
866 | order = page_order(page); | 875 | order = page_order(page); |
867 | list_del(&page->lru); | 876 | list_move(&page->lru, |
868 | list_add(&page->lru, | 877 | &zone->free_area[order].free_list[migratetype]); |
869 | &zone->free_area[order].free_list[migratetype]); | ||
870 | page += 1 << order; | 878 | page += 1 << order; |
871 | pages_moved += 1 << order; | 879 | pages_moved += 1 << order; |
872 | } | 880 | } |
@@ -937,7 +945,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
937 | * If breaking a large block of pages, move all free | 945 | * If breaking a large block of pages, move all free |
938 | * pages to the preferred allocation list. If falling | 946 | * pages to the preferred allocation list. If falling |
939 | * back for a reclaimable kernel allocation, be more | 947 | * back for a reclaimable kernel allocation, be more |
940 | * agressive about taking ownership of free pages | 948 | * aggressive about taking ownership of free pages |
941 | */ | 949 | */ |
942 | if (unlikely(current_order >= (pageblock_order >> 1)) || | 950 | if (unlikely(current_order >= (pageblock_order >> 1)) || |
943 | start_migratetype == MIGRATE_RECLAIMABLE || | 951 | start_migratetype == MIGRATE_RECLAIMABLE || |
@@ -1089,8 +1097,10 @@ static void drain_pages(unsigned int cpu) | |||
1089 | pset = per_cpu_ptr(zone->pageset, cpu); | 1097 | pset = per_cpu_ptr(zone->pageset, cpu); |
1090 | 1098 | ||
1091 | pcp = &pset->pcp; | 1099 | pcp = &pset->pcp; |
1092 | free_pcppages_bulk(zone, pcp->count, pcp); | 1100 | if (pcp->count) { |
1093 | pcp->count = 0; | 1101 | free_pcppages_bulk(zone, pcp->count, pcp); |
1102 | pcp->count = 0; | ||
1103 | } | ||
1094 | local_irq_restore(flags); | 1104 | local_irq_restore(flags); |
1095 | } | 1105 | } |
1096 | } | 1106 | } |
@@ -1332,7 +1342,7 @@ again: | |||
1332 | } | 1342 | } |
1333 | 1343 | ||
1334 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1344 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1335 | zone_statistics(preferred_zone, zone); | 1345 | zone_statistics(preferred_zone, zone, gfp_flags); |
1336 | local_irq_restore(flags); | 1346 | local_irq_restore(flags); |
1337 | 1347 | ||
1338 | VM_BUG_ON(bad_range(zone, page)); | 1348 | VM_BUG_ON(bad_range(zone, page)); |
@@ -1454,24 +1464,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | |||
1454 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 1464 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
1455 | 1465 | ||
1456 | /* | 1466 | /* |
1457 | * Return 1 if free pages are above 'mark'. This takes into account the order | 1467 | * Return true if free pages are above 'mark'. This takes into account the order |
1458 | * of the allocation. | 1468 | * of the allocation. |
1459 | */ | 1469 | */ |
1460 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1470 | static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1461 | int classzone_idx, int alloc_flags) | 1471 | int classzone_idx, int alloc_flags, long free_pages) |
1462 | { | 1472 | { |
1463 | /* free_pages my go negative - that's OK */ | 1473 | /* free_pages my go negative - that's OK */ |
1464 | long min = mark; | 1474 | long min = mark; |
1465 | long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; | ||
1466 | int o; | 1475 | int o; |
1467 | 1476 | ||
1477 | free_pages -= (1 << order) + 1; | ||
1468 | if (alloc_flags & ALLOC_HIGH) | 1478 | if (alloc_flags & ALLOC_HIGH) |
1469 | min -= min / 2; | 1479 | min -= min / 2; |
1470 | if (alloc_flags & ALLOC_HARDER) | 1480 | if (alloc_flags & ALLOC_HARDER) |
1471 | min -= min / 4; | 1481 | min -= min / 4; |
1472 | 1482 | ||
1473 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 1483 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) |
1474 | return 0; | 1484 | return false; |
1475 | for (o = 0; o < order; o++) { | 1485 | for (o = 0; o < order; o++) { |
1476 | /* At the next order, this order's pages become unavailable */ | 1486 | /* At the next order, this order's pages become unavailable */ |
1477 | free_pages -= z->free_area[o].nr_free << o; | 1487 | free_pages -= z->free_area[o].nr_free << o; |
@@ -1480,9 +1490,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1480 | min >>= 1; | 1490 | min >>= 1; |
1481 | 1491 | ||
1482 | if (free_pages <= min) | 1492 | if (free_pages <= min) |
1483 | return 0; | 1493 | return false; |
1484 | } | 1494 | } |
1485 | return 1; | 1495 | return true; |
1496 | } | ||
1497 | |||
1498 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | ||
1499 | int classzone_idx, int alloc_flags) | ||
1500 | { | ||
1501 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | ||
1502 | zone_page_state(z, NR_FREE_PAGES)); | ||
1503 | } | ||
1504 | |||
1505 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | ||
1506 | int classzone_idx, int alloc_flags) | ||
1507 | { | ||
1508 | long free_pages = zone_page_state(z, NR_FREE_PAGES); | ||
1509 | |||
1510 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) | ||
1511 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); | ||
1512 | |||
1513 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | ||
1514 | free_pages); | ||
1486 | } | 1515 | } |
1487 | 1516 | ||
1488 | #ifdef CONFIG_NUMA | 1517 | #ifdef CONFIG_NUMA |
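
__zone_watermark_ok() walks the free lists below the requested order, discounting pages that are too small to serve the request while halving the required reserve at each step; zone_watermark_ok_safe() additionally re-reads an exact free-page count when per-cpu counter drift could matter. A stand-alone sketch of the per-order loop (free counts and marks are invented, and the adjustment for the pages consumed by the request itself is left out) shows how a zone can pass an order-0 check yet fail an order-3 one purely through fragmentation:

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_ORDER 11

    /* Same shape as the __zone_watermark_ok() loop above, with the zone
     * replaced by a plain array of per-order free block counts. */
    static bool watermark_ok(int order, long mark, long lowmem_reserve,
                             long free_pages, const unsigned long *nr_free)
    {
            long min = mark;

            if (free_pages <= min + lowmem_reserve)
                    return false;
            for (int o = 0; o < order; o++) {
                    /* Blocks of this order are too small for the request... */
                    free_pages -= nr_free[o] << o;
                    /* ...but the requirement is relaxed for the larger orders. */
                    min >>= 1;
                    if (free_pages <= min)
                            return false;
            }
            return true;
    }

    int main(void)
    {
            unsigned long nr_free[MAX_ORDER] = { 600, 40, 6, 2 }; /* orders 0..3 */
            long total = 600 + 40 * 2 + 6 * 4 + 2 * 8;            /* 720 free pages */

            printf("order-0 ok: %d\n", watermark_ok(0, 128, 0, total, nr_free)); /* 1 */
            printf("order-3 ok: %d\n", watermark_ok(3, 128, 0, total, nr_free)); /* 0 */
            return 0;
    }
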
@@ -1694,6 +1723,59 @@ try_next_zone: | |||
1694 | return page; | 1723 | return page; |
1695 | } | 1724 | } |
1696 | 1725 | ||
1726 | /* | ||
1727 | * Large machines with many possible nodes should not always dump per-node | ||
1728 | * meminfo in irq context. | ||
1729 | */ | ||
1730 | static inline bool should_suppress_show_mem(void) | ||
1731 | { | ||
1732 | bool ret = false; | ||
1733 | |||
1734 | #if NODES_SHIFT > 8 | ||
1735 | ret = in_interrupt(); | ||
1736 | #endif | ||
1737 | return ret; | ||
1738 | } | ||
1739 | |||
1740 | static DEFINE_RATELIMIT_STATE(nopage_rs, | ||
1741 | DEFAULT_RATELIMIT_INTERVAL, | ||
1742 | DEFAULT_RATELIMIT_BURST); | ||
1743 | |||
1744 | void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | ||
1745 | { | ||
1746 | va_list args; | ||
1747 | unsigned int filter = SHOW_MEM_FILTER_NODES; | ||
1748 | |||
1749 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) | ||
1750 | return; | ||
1751 | |||
1752 | /* | ||
1753 | * This documents exceptions given to allocations in certain | ||
1754 | * contexts that are allowed to allocate outside current's set | ||
1755 | * of allowed nodes. | ||
1756 | */ | ||
1757 | if (!(gfp_mask & __GFP_NOMEMALLOC)) | ||
1758 | if (test_thread_flag(TIF_MEMDIE) || | ||
1759 | (current->flags & (PF_MEMALLOC | PF_EXITING))) | ||
1760 | filter &= ~SHOW_MEM_FILTER_NODES; | ||
1761 | if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) | ||
1762 | filter &= ~SHOW_MEM_FILTER_NODES; | ||
1763 | |||
1764 | if (fmt) { | ||
1765 | printk(KERN_WARNING); | ||
1766 | va_start(args, fmt); | ||
1767 | vprintk(fmt, args); | ||
1768 | va_end(args); | ||
1769 | } | ||
1770 | |||
1771 | pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n", | ||
1772 | current->comm, order, gfp_mask); | ||
1773 | |||
1774 | dump_stack(); | ||
1775 | if (!should_suppress_show_mem()) | ||
1776 | show_mem(filter); | ||
1777 | } | ||
1778 | |||
1697 | static inline int | 1779 | static inline int |
1698 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, | 1780 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, |
1699 | unsigned long pages_reclaimed) | 1781 | unsigned long pages_reclaimed) |
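
The new warn_alloc_failed() throttles the "page allocation failure" report with the <linux/ratelimit.h> helpers added to the include list above: at most a burst of messages per interval, everything else is dropped. A rough stand-alone model of that policy (the windowing here is a simplification and the tick values are invented):

    #include <stdbool.h>
    #include <stdio.h>

    /* Roughly what DEFINE_RATELIMIT_STATE()/__ratelimit() provide: allow at
     * most 'burst' events per 'interval'. Time is counted in fake ticks. */
    struct ratelimit_state {
            long interval;
            int burst;
            long begin;
            int printed;
    };

    static bool ratelimit(struct ratelimit_state *rs, long now)
    {
            if (now - rs->begin >= rs->interval) {  /* new window: reset the budget */
                    rs->begin = now;
                    rs->printed = 0;
            }
            if (rs->printed >= rs->burst)
                    return false;                   /* message suppressed */
            rs->printed++;
            return true;
    }

    int main(void)
    {
            struct ratelimit_state rs = { .interval = 5, .burst = 2 };

            for (long t = 0; t < 12; t++)
                    if (ratelimit(&rs, t))
                            printf("tick %ld: page allocation failure warning\n", t);
            /* Only two warnings per 5-tick window are emitted. */
            return 0;
    }
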
@@ -1787,15 +1869,18 @@ static struct page * | |||
1787 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 1869 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
1788 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 1870 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
1789 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 1871 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
1790 | int migratetype, unsigned long *did_some_progress) | 1872 | int migratetype, unsigned long *did_some_progress, |
1873 | bool sync_migration) | ||
1791 | { | 1874 | { |
1792 | struct page *page; | 1875 | struct page *page; |
1793 | 1876 | ||
1794 | if (!order || compaction_deferred(preferred_zone)) | 1877 | if (!order || compaction_deferred(preferred_zone)) |
1795 | return NULL; | 1878 | return NULL; |
1796 | 1879 | ||
1880 | current->flags |= PF_MEMALLOC; | ||
1797 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 1881 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
1798 | nodemask); | 1882 | nodemask, sync_migration); |
1883 | current->flags &= ~PF_MEMALLOC; | ||
1799 | if (*did_some_progress != COMPACT_SKIPPED) { | 1884 | if (*did_some_progress != COMPACT_SKIPPED) { |
1800 | 1885 | ||
1801 | /* Page migration frees to the PCP lists but we want merging */ | 1886 | /* Page migration frees to the PCP lists but we want merging */ |
@@ -1831,7 +1916,8 @@ static inline struct page * | |||
1831 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 1916 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
1832 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 1917 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
1833 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 1918 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
1834 | int migratetype, unsigned long *did_some_progress) | 1919 | int migratetype, unsigned long *did_some_progress, |
1920 | bool sync_migration) | ||
1835 | { | 1921 | { |
1836 | return NULL; | 1922 | return NULL; |
1837 | } | 1923 | } |
@@ -1846,23 +1932,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
1846 | { | 1932 | { |
1847 | struct page *page = NULL; | 1933 | struct page *page = NULL; |
1848 | struct reclaim_state reclaim_state; | 1934 | struct reclaim_state reclaim_state; |
1849 | struct task_struct *p = current; | ||
1850 | bool drained = false; | 1935 | bool drained = false; |
1851 | 1936 | ||
1852 | cond_resched(); | 1937 | cond_resched(); |
1853 | 1938 | ||
1854 | /* We now go into synchronous reclaim */ | 1939 | /* We now go into synchronous reclaim */ |
1855 | cpuset_memory_pressure_bump(); | 1940 | cpuset_memory_pressure_bump(); |
1856 | p->flags |= PF_MEMALLOC; | 1941 | current->flags |= PF_MEMALLOC; |
1857 | lockdep_set_current_reclaim_state(gfp_mask); | 1942 | lockdep_set_current_reclaim_state(gfp_mask); |
1858 | reclaim_state.reclaimed_slab = 0; | 1943 | reclaim_state.reclaimed_slab = 0; |
1859 | p->reclaim_state = &reclaim_state; | 1944 | current->reclaim_state = &reclaim_state; |
1860 | 1945 | ||
1861 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | 1946 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); |
1862 | 1947 | ||
1863 | p->reclaim_state = NULL; | 1948 | current->reclaim_state = NULL; |
1864 | lockdep_clear_current_reclaim_state(); | 1949 | lockdep_clear_current_reclaim_state(); |
1865 | p->flags &= ~PF_MEMALLOC; | 1950 | current->flags &= ~PF_MEMALLOC; |
1866 | 1951 | ||
1867 | cond_resched(); | 1952 | cond_resched(); |
1868 | 1953 | ||
@@ -1906,7 +1991,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | |||
1906 | preferred_zone, migratetype); | 1991 | preferred_zone, migratetype); |
1907 | 1992 | ||
1908 | if (!page && gfp_mask & __GFP_NOFAIL) | 1993 | if (!page && gfp_mask & __GFP_NOFAIL) |
1909 | congestion_wait(BLK_RW_ASYNC, HZ/50); | 1994 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
1910 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | 1995 | } while (!page && (gfp_mask & __GFP_NOFAIL)); |
1911 | 1996 | ||
1912 | return page; | 1997 | return page; |
@@ -1914,24 +1999,24 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | |||
1914 | 1999 | ||
1915 | static inline | 2000 | static inline |
1916 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, | 2001 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, |
1917 | enum zone_type high_zoneidx) | 2002 | enum zone_type high_zoneidx, |
2003 | enum zone_type classzone_idx) | ||
1918 | { | 2004 | { |
1919 | struct zoneref *z; | 2005 | struct zoneref *z; |
1920 | struct zone *zone; | 2006 | struct zone *zone; |
1921 | 2007 | ||
1922 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 2008 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) |
1923 | wakeup_kswapd(zone, order); | 2009 | wakeup_kswapd(zone, order, classzone_idx); |
1924 | } | 2010 | } |
1925 | 2011 | ||
1926 | static inline int | 2012 | static inline int |
1927 | gfp_to_alloc_flags(gfp_t gfp_mask) | 2013 | gfp_to_alloc_flags(gfp_t gfp_mask) |
1928 | { | 2014 | { |
1929 | struct task_struct *p = current; | ||
1930 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; | 2015 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; |
1931 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 2016 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
1932 | 2017 | ||
1933 | /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ | 2018 | /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ |
1934 | BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); | 2019 | BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); |
1935 | 2020 | ||
1936 | /* | 2021 | /* |
1937 | * The caller may dip into page reserves a bit more if the caller | 2022 | * The caller may dip into page reserves a bit more if the caller |
@@ -1939,21 +2024,26 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
1939 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | 2024 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will |
1940 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | 2025 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). |
1941 | */ | 2026 | */ |
1942 | alloc_flags |= (gfp_mask & __GFP_HIGH); | 2027 | alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); |
1943 | 2028 | ||
1944 | if (!wait) { | 2029 | if (!wait) { |
1945 | alloc_flags |= ALLOC_HARDER; | 2030 | /* |
2031 | * Not worth trying to allocate harder for | ||
2032 | * __GFP_NOMEMALLOC even if it can't schedule. | ||
2033 | */ | ||
2034 | if (!(gfp_mask & __GFP_NOMEMALLOC)) | ||
2035 | alloc_flags |= ALLOC_HARDER; | ||
1946 | /* | 2036 | /* |
1947 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | 2037 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
1948 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 2038 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1949 | */ | 2039 | */ |
1950 | alloc_flags &= ~ALLOC_CPUSET; | 2040 | alloc_flags &= ~ALLOC_CPUSET; |
1951 | } else if (unlikely(rt_task(p)) && !in_interrupt()) | 2041 | } else if (unlikely(rt_task(current)) && !in_interrupt()) |
1952 | alloc_flags |= ALLOC_HARDER; | 2042 | alloc_flags |= ALLOC_HARDER; |
1953 | 2043 | ||
1954 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | 2044 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { |
1955 | if (!in_interrupt() && | 2045 | if (!in_interrupt() && |
1956 | ((p->flags & PF_MEMALLOC) || | 2046 | ((current->flags & PF_MEMALLOC) || |
1957 | unlikely(test_thread_flag(TIF_MEMDIE)))) | 2047 | unlikely(test_thread_flag(TIF_MEMDIE)))) |
1958 | alloc_flags |= ALLOC_NO_WATERMARKS; | 2048 | alloc_flags |= ALLOC_NO_WATERMARKS; |
1959 | } | 2049 | } |
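
Part of the gfp_to_alloc_flags() hunk is purely about types: __GFP_HIGH is a gfp_t bit while ALLOC_HIGH is a plain int of the same numeric value, so copying the bit across needs a cast and the equality itself is only a build-time convention. A compile-time check of that convention in isolation (the flag value 0x20 is assumed here for the example, not taken from the patch):

    #include <assert.h>

    typedef unsigned int gfp_t;     /* simplified; the kernel uses a __bitwise type */

    #define __GFP_HIGH      ((gfp_t)0x20u)  /* assumed value for the example */
    #define ALLOC_HIGH      0x20            /* must stay numerically equal   */

    static int gfp_to_alloc_flags(gfp_t gfp_mask)
    {
            int alloc_flags = 0;

            /* Relies on the numeric equality checked below, saving a branch. */
            static_assert(__GFP_HIGH == (gfp_t)ALLOC_HIGH, "flag values must match");
            alloc_flags |= (int)(gfp_mask & __GFP_HIGH);
            return alloc_flags;
    }

    int main(void)
    {
            assert(gfp_to_alloc_flags(__GFP_HIGH) == ALLOC_HIGH);
            assert(gfp_to_alloc_flags(0) == 0);
            return 0;
    }
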
@@ -1972,7 +2062,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
1972 | int alloc_flags; | 2062 | int alloc_flags; |
1973 | unsigned long pages_reclaimed = 0; | 2063 | unsigned long pages_reclaimed = 0; |
1974 | unsigned long did_some_progress; | 2064 | unsigned long did_some_progress; |
1975 | struct task_struct *p = current; | 2065 | bool sync_migration = false; |
1976 | 2066 | ||
1977 | /* | 2067 | /* |
1978 | * In the slowpath, we sanity check order to avoid ever trying to | 2068 | * In the slowpath, we sanity check order to avoid ever trying to |
@@ -1997,7 +2087,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
1997 | goto nopage; | 2087 | goto nopage; |
1998 | 2088 | ||
1999 | restart: | 2089 | restart: |
2000 | wake_all_kswapd(order, zonelist, high_zoneidx); | 2090 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
2091 | wake_all_kswapd(order, zonelist, high_zoneidx, | ||
2092 | zone_idx(preferred_zone)); | ||
2001 | 2093 | ||
2002 | /* | 2094 | /* |
2003 | * OK, we're below the kswapd watermark and have kicked background | 2095 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2006,6 +2098,15 @@ restart: | |||
2006 | */ | 2098 | */ |
2007 | alloc_flags = gfp_to_alloc_flags(gfp_mask); | 2099 | alloc_flags = gfp_to_alloc_flags(gfp_mask); |
2008 | 2100 | ||
2101 | /* | ||
2102 | * Find the true preferred zone if the allocation is unconstrained by | ||
2103 | * cpusets. | ||
2104 | */ | ||
2105 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) | ||
2106 | first_zones_zonelist(zonelist, high_zoneidx, NULL, | ||
2107 | &preferred_zone); | ||
2108 | |||
2109 | rebalance: | ||
2009 | /* This is the last chance, in general, before the goto nopage. */ | 2110 | /* This is the last chance, in general, before the goto nopage. */ |
2010 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2111 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
2011 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2112 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, |
@@ -2013,7 +2114,6 @@ restart: | |||
2013 | if (page) | 2114 | if (page) |
2014 | goto got_pg; | 2115 | goto got_pg; |
2015 | 2116 | ||
2016 | rebalance: | ||
2017 | /* Allocate without watermarks if the context allows */ | 2117 | /* Allocate without watermarks if the context allows */ |
2018 | if (alloc_flags & ALLOC_NO_WATERMARKS) { | 2118 | if (alloc_flags & ALLOC_NO_WATERMARKS) { |
2019 | page = __alloc_pages_high_priority(gfp_mask, order, | 2119 | page = __alloc_pages_high_priority(gfp_mask, order, |
@@ -2028,21 +2128,26 @@ rebalance: | |||
2028 | goto nopage; | 2128 | goto nopage; |
2029 | 2129 | ||
2030 | /* Avoid recursion of direct reclaim */ | 2130 | /* Avoid recursion of direct reclaim */ |
2031 | if (p->flags & PF_MEMALLOC) | 2131 | if (current->flags & PF_MEMALLOC) |
2032 | goto nopage; | 2132 | goto nopage; |
2033 | 2133 | ||
2034 | /* Avoid allocations with no watermarks from looping endlessly */ | 2134 | /* Avoid allocations with no watermarks from looping endlessly */ |
2035 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) | 2135 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) |
2036 | goto nopage; | 2136 | goto nopage; |
2037 | 2137 | ||
2038 | /* Try direct compaction */ | 2138 | /* |
2139 | * Try direct compaction. The first pass is asynchronous. Subsequent | ||
2140 | * attempts after direct reclaim are synchronous | ||
2141 | */ | ||
2039 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2142 | page = __alloc_pages_direct_compact(gfp_mask, order, |
2040 | zonelist, high_zoneidx, | 2143 | zonelist, high_zoneidx, |
2041 | nodemask, | 2144 | nodemask, |
2042 | alloc_flags, preferred_zone, | 2145 | alloc_flags, preferred_zone, |
2043 | migratetype, &did_some_progress); | 2146 | migratetype, &did_some_progress, |
2147 | sync_migration); | ||
2044 | if (page) | 2148 | if (page) |
2045 | goto got_pg; | 2149 | goto got_pg; |
2150 | sync_migration = true; | ||
2046 | 2151 | ||
2047 | /* Try direct reclaim and then allocating */ | 2152 | /* Try direct reclaim and then allocating */ |
2048 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2153 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
@@ -2094,18 +2199,26 @@ rebalance: | |||
2094 | pages_reclaimed += did_some_progress; | 2199 | pages_reclaimed += did_some_progress; |
2095 | if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { | 2200 | if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { |
2096 | /* Wait for some write requests to complete then retry */ | 2201 | /* Wait for some write requests to complete then retry */ |
2097 | congestion_wait(BLK_RW_ASYNC, HZ/50); | 2202 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
2098 | goto rebalance; | 2203 | goto rebalance; |
2204 | } else { | ||
2205 | /* | ||
2206 | * High-order allocations do not necessarily loop after | ||
2207 | * direct reclaim and reclaim/compaction depends on compaction | ||
2208 | * being called after reclaim so call directly if necessary | ||
2209 | */ | ||
2210 | page = __alloc_pages_direct_compact(gfp_mask, order, | ||
2211 | zonelist, high_zoneidx, | ||
2212 | nodemask, | ||
2213 | alloc_flags, preferred_zone, | ||
2214 | migratetype, &did_some_progress, | ||
2215 | sync_migration); | ||
2216 | if (page) | ||
2217 | goto got_pg; | ||
2099 | } | 2218 | } |
2100 | 2219 | ||
2101 | nopage: | 2220 | nopage: |
2102 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { | 2221 | warn_alloc_failed(gfp_mask, order, NULL); |
2103 | printk(KERN_WARNING "%s: page allocation failure." | ||
2104 | " order:%d, mode:0x%x\n", | ||
2105 | p->comm, order, gfp_mask); | ||
2106 | dump_stack(); | ||
2107 | show_mem(); | ||
2108 | } | ||
2109 | return page; | 2222 | return page; |
2110 | got_pg: | 2223 | got_pg: |
2111 | if (kmemcheck_enabled) | 2224 | if (kmemcheck_enabled) |
@@ -2145,7 +2258,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2145 | 2258 | ||
2146 | get_mems_allowed(); | 2259 | get_mems_allowed(); |
2147 | /* The preferred zone is used for statistics later */ | 2260 | /* The preferred zone is used for statistics later */ |
2148 | first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); | 2261 | first_zones_zonelist(zonelist, high_zoneidx, |
2262 | nodemask ? : &cpuset_current_mems_allowed, | ||
2263 | &preferred_zone); | ||
2149 | if (!preferred_zone) { | 2264 | if (!preferred_zone) { |
2150 | put_mems_allowed(); | 2265 | put_mems_allowed(); |
2151 | return NULL; | 2266 | return NULL; |
@@ -2224,6 +2339,21 @@ void free_pages(unsigned long addr, unsigned int order) | |||
2224 | 2339 | ||
2225 | EXPORT_SYMBOL(free_pages); | 2340 | EXPORT_SYMBOL(free_pages); |
2226 | 2341 | ||
2342 | static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) | ||
2343 | { | ||
2344 | if (addr) { | ||
2345 | unsigned long alloc_end = addr + (PAGE_SIZE << order); | ||
2346 | unsigned long used = addr + PAGE_ALIGN(size); | ||
2347 | |||
2348 | split_page(virt_to_page((void *)addr), order); | ||
2349 | while (used < alloc_end) { | ||
2350 | free_page(used); | ||
2351 | used += PAGE_SIZE; | ||
2352 | } | ||
2353 | } | ||
2354 | return (void *)addr; | ||
2355 | } | ||
2356 | |||
2227 | /** | 2357 | /** |
2228 | * alloc_pages_exact - allocate an exact number physically-contiguous pages. | 2358 | * alloc_pages_exact - allocate an exact number physically-contiguous pages. |
2229 | * @size: the number of bytes to allocate | 2359 | * @size: the number of bytes to allocate |
@@ -2243,22 +2373,33 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) | |||
2243 | unsigned long addr; | 2373 | unsigned long addr; |
2244 | 2374 | ||
2245 | addr = __get_free_pages(gfp_mask, order); | 2375 | addr = __get_free_pages(gfp_mask, order); |
2246 | if (addr) { | 2376 | return make_alloc_exact(addr, order, size); |
2247 | unsigned long alloc_end = addr + (PAGE_SIZE << order); | ||
2248 | unsigned long used = addr + PAGE_ALIGN(size); | ||
2249 | |||
2250 | split_page(virt_to_page((void *)addr), order); | ||
2251 | while (used < alloc_end) { | ||
2252 | free_page(used); | ||
2253 | used += PAGE_SIZE; | ||
2254 | } | ||
2255 | } | ||
2256 | |||
2257 | return (void *)addr; | ||
2258 | } | 2377 | } |
2259 | EXPORT_SYMBOL(alloc_pages_exact); | 2378 | EXPORT_SYMBOL(alloc_pages_exact); |
2260 | 2379 | ||
2261 | /** | 2380 | /** |
2381 | * alloc_pages_exact_nid - allocate an exact number of physically-contiguous | ||
2382 | * pages on a node. | ||
2383 | * @nid: the preferred node ID where memory should be allocated | ||
2384 | * @size: the number of bytes to allocate | ||
2385 | * @gfp_mask: GFP flags for the allocation | ||
2386 | * | ||
2387 | * Like alloc_pages_exact(), but try to allocate on node nid first before falling | ||
2388 | * back. | ||
2389 | * Note this is not alloc_pages_exact_node() which allocates on a specific node, | ||
2390 | * but is not exact. | ||
2391 | */ | ||
2392 | void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) | ||
2393 | { | ||
2394 | unsigned order = get_order(size); | ||
2395 | struct page *p = alloc_pages_node(nid, gfp_mask, order); | ||
2396 | if (!p) | ||
2397 | return NULL; | ||
2398 | return make_alloc_exact((unsigned long)page_address(p), order, size); | ||
2399 | } | ||
2400 | EXPORT_SYMBOL(alloc_pages_exact_nid); | ||
2401 | |||
2402 | /** | ||
2262 | * free_pages_exact - release memory allocated via alloc_pages_exact() | 2403 | * free_pages_exact - release memory allocated via alloc_pages_exact() |
2263 | * @virt: the value returned by alloc_pages_exact. | 2404 | * @virt: the value returned by alloc_pages_exact. |
2264 | * @size: size of allocation, same value as passed to alloc_pages_exact(). | 2405 | * @size: size of allocation, same value as passed to alloc_pages_exact(). |
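
make_alloc_exact() factors out the "round up to a power-of-two order, split the allocation, hand back the unused tail pages" trick so that the new alloc_pages_exact_nid() can share it with alloc_pages_exact(). A small arithmetic-only sketch of how many pages that trick returns to the allocator (PAGE_SIZE and get_order() are re-implemented here just for the example):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* Smallest order such that (PAGE_SIZE << order) >= size; simplified
     * re-implementation for the example. */
    static unsigned int get_order(unsigned long size)
    {
            unsigned int order = 0;

            while ((PAGE_SIZE << order) < size)
                    order++;
            return order;
    }

    int main(void)
    {
            unsigned long sizes[] = { 5000, 3 * PAGE_SIZE, 9 * PAGE_SIZE };

            for (int i = 0; i < 3; i++) {
                    unsigned long size = sizes[i];
                    unsigned int order = get_order(size);
                    unsigned long alloc_pages = 1UL << order;
                    unsigned long used_pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;

                    /* Pages past PAGE_ALIGN(size) are freed back one by one. */
                    printf("size %lu: order %u, %lu pages allocated, %lu freed back\n",
                           size, order, alloc_pages, alloc_pages - used_pages);
            }
            return 0;
    }
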
@@ -2352,19 +2493,41 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
2352 | } | 2493 | } |
2353 | #endif | 2494 | #endif |
2354 | 2495 | ||
2496 | /* | ||
2497 | * Determine whether the node should be displayed or not, depending on whether | ||
2498 | * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). | ||
2499 | */ | ||
2500 | bool skip_free_areas_node(unsigned int flags, int nid) | ||
2501 | { | ||
2502 | bool ret = false; | ||
2503 | |||
2504 | if (!(flags & SHOW_MEM_FILTER_NODES)) | ||
2505 | goto out; | ||
2506 | |||
2507 | get_mems_allowed(); | ||
2508 | ret = !node_isset(nid, cpuset_current_mems_allowed); | ||
2509 | put_mems_allowed(); | ||
2510 | out: | ||
2511 | return ret; | ||
2512 | } | ||
2513 | |||
2355 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 2514 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
2356 | 2515 | ||
2357 | /* | 2516 | /* |
2358 | * Show free area list (used inside shift_scroll-lock stuff) | 2517 | * Show free area list (used inside shift_scroll-lock stuff) |
2359 | * We also calculate the percentage fragmentation. We do this by counting the | 2518 | * We also calculate the percentage fragmentation. We do this by counting the |
2360 | * memory on each free list with the exception of the first item on the list. | 2519 | * memory on each free list with the exception of the first item on the list. |
2520 | * Suppresses nodes that are not allowed by current's cpuset if | ||
2521 | * SHOW_MEM_FILTER_NODES is passed. | ||
2361 | */ | 2522 | */ |
2362 | void show_free_areas(void) | 2523 | void show_free_areas(unsigned int filter) |
2363 | { | 2524 | { |
2364 | int cpu; | 2525 | int cpu; |
2365 | struct zone *zone; | 2526 | struct zone *zone; |
2366 | 2527 | ||
2367 | for_each_populated_zone(zone) { | 2528 | for_each_populated_zone(zone) { |
2529 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | ||
2530 | continue; | ||
2368 | show_node(zone); | 2531 | show_node(zone); |
2369 | printk("%s per-cpu:\n", zone->name); | 2532 | printk("%s per-cpu:\n", zone->name); |
2370 | 2533 | ||
@@ -2406,6 +2569,8 @@ void show_free_areas(void) | |||
2406 | for_each_populated_zone(zone) { | 2569 | for_each_populated_zone(zone) { |
2407 | int i; | 2570 | int i; |
2408 | 2571 | ||
2572 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | ||
2573 | continue; | ||
2409 | show_node(zone); | 2574 | show_node(zone); |
2410 | printk("%s" | 2575 | printk("%s" |
2411 | " free:%lukB" | 2576 | " free:%lukB" |
@@ -2436,7 +2601,7 @@ void show_free_areas(void) | |||
2436 | " all_unreclaimable? %s" | 2601 | " all_unreclaimable? %s" |
2437 | "\n", | 2602 | "\n", |
2438 | zone->name, | 2603 | zone->name, |
2439 | K(zone_nr_free_pages(zone)), | 2604 | K(zone_page_state(zone, NR_FREE_PAGES)), |
2440 | K(min_wmark_pages(zone)), | 2605 | K(min_wmark_pages(zone)), |
2441 | K(low_wmark_pages(zone)), | 2606 | K(low_wmark_pages(zone)), |
2442 | K(high_wmark_pages(zone)), | 2607 | K(high_wmark_pages(zone)), |
@@ -2473,6 +2638,8 @@ void show_free_areas(void) | |||
2473 | for_each_populated_zone(zone) { | 2638 | for_each_populated_zone(zone) { |
2474 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 2639 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
2475 | 2640 | ||
2641 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | ||
2642 | continue; | ||
2476 | show_node(zone); | 2643 | show_node(zone); |
2477 | printk("%s: ", zone->name); | 2644 | printk("%s: ", zone->name); |
2478 | 2645 | ||
@@ -2579,9 +2746,16 @@ static int __parse_numa_zonelist_order(char *s) | |||
2579 | 2746 | ||
2580 | static __init int setup_numa_zonelist_order(char *s) | 2747 | static __init int setup_numa_zonelist_order(char *s) |
2581 | { | 2748 | { |
2582 | if (s) | 2749 | int ret; |
2583 | return __parse_numa_zonelist_order(s); | 2750 | |
2584 | return 0; | 2751 | if (!s) |
2752 | return 0; | ||
2753 | |||
2754 | ret = __parse_numa_zonelist_order(s); | ||
2755 | if (ret == 0) | ||
2756 | strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); | ||
2757 | |||
2758 | return ret; | ||
2585 | } | 2759 | } |
2586 | early_param("numa_zonelist_order", setup_numa_zonelist_order); | 2760 | early_param("numa_zonelist_order", setup_numa_zonelist_order); |
2587 | 2761 | ||
@@ -3007,14 +3181,6 @@ static __init_refok int __build_all_zonelists(void *data) | |||
3007 | build_zonelist_cache(pgdat); | 3181 | build_zonelist_cache(pgdat); |
3008 | } | 3182 | } |
3009 | 3183 | ||
3010 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
3011 | /* Setup real pagesets for the new zone */ | ||
3012 | if (data) { | ||
3013 | struct zone *zone = data; | ||
3014 | setup_zone_pageset(zone); | ||
3015 | } | ||
3016 | #endif | ||
3017 | |||
3018 | /* | 3184 | /* |
3019 | * Initialize the boot_pagesets that are going to be used | 3185 | * Initialize the boot_pagesets that are going to be used |
3020 | * for bootstrapping processors. The real pagesets for | 3186 | * for bootstrapping processors. The real pagesets for |
@@ -3052,7 +3218,7 @@ static __init_refok int __build_all_zonelists(void *data) | |||
3052 | * Called with zonelists_mutex held always | 3218 | * Called with zonelists_mutex held always |
3053 | * unless system_state == SYSTEM_BOOTING. | 3219 | * unless system_state == SYSTEM_BOOTING. |
3054 | */ | 3220 | */ |
3055 | void build_all_zonelists(void *data) | 3221 | void __ref build_all_zonelists(void *data) |
3056 | { | 3222 | { |
3057 | set_zonelist_order(); | 3223 | set_zonelist_order(); |
3058 | 3224 | ||
@@ -3063,7 +3229,11 @@ void build_all_zonelists(void *data) | |||
3063 | } else { | 3229 | } else { |
3064 | /* we have to stop all cpus to guarantee there is no user | 3230 | /* we have to stop all cpus to guarantee there is no user |
3065 | of zonelist */ | 3231 | of zonelist */ |
3066 | stop_machine(__build_all_zonelists, data, NULL); | 3232 | #ifdef CONFIG_MEMORY_HOTPLUG |
3233 | if (data) | ||
3234 | setup_zone_pageset((struct zone *)data); | ||
3235 | #endif | ||
3236 | stop_machine(__build_all_zonelists, NULL, NULL); | ||
3067 | /* cpuset refresh routine should be here */ | 3237 | /* cpuset refresh routine should be here */ |
3068 | } | 3238 | } |
3069 | vm_total_pages = nr_free_pagecache_pages(); | 3239 | vm_total_pages = nr_free_pagecache_pages(); |
@@ -3159,6 +3329,20 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
3159 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 3329 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
3160 | 3330 | ||
3161 | /* | 3331 | /* |
3332 | * Check if a pageblock contains reserved pages | ||
3333 | */ | ||
3334 | static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) | ||
3335 | { | ||
3336 | unsigned long pfn; | ||
3337 | |||
3338 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | ||
3339 | if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) | ||
3340 | return 1; | ||
3341 | } | ||
3342 | return 0; | ||
3343 | } | ||
3344 | |||
3345 | /* | ||
3162 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number | 3346 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number |
3163 | * of blocks reserved is based on min_wmark_pages(zone). The memory within | 3347 | * of blocks reserved is based on min_wmark_pages(zone). The memory within |
3164 | * the reserve will tend to store contiguous free pages. Setting min_free_kbytes | 3348 | * the reserve will tend to store contiguous free pages. Setting min_free_kbytes |
@@ -3167,7 +3351,7 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
3167 | */ | 3351 | */ |
3168 | static void setup_zone_migrate_reserve(struct zone *zone) | 3352 | static void setup_zone_migrate_reserve(struct zone *zone) |
3169 | { | 3353 | { |
3170 | unsigned long start_pfn, pfn, end_pfn; | 3354 | unsigned long start_pfn, pfn, end_pfn, block_end_pfn; |
3171 | struct page *page; | 3355 | struct page *page; |
3172 | unsigned long block_migratetype; | 3356 | unsigned long block_migratetype; |
3173 | int reserve; | 3357 | int reserve; |
@@ -3197,7 +3381,8 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3197 | continue; | 3381 | continue; |
3198 | 3382 | ||
3199 | /* Blocks with reserved pages will never free, skip them. */ | 3383 | /* Blocks with reserved pages will never free, skip them. */ |
3200 | if (PageReserved(page)) | 3384 | block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); |
3385 | if (pageblock_is_reserved(pfn, block_end_pfn)) | ||
3201 | continue; | 3386 | continue; |
3202 | 3387 | ||
3203 | block_migratetype = get_pageblock_migratetype(page); | 3388 | block_migratetype = get_pageblock_migratetype(page); |
@@ -3386,7 +3571,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
3386 | pcp->batch = PAGE_SHIFT * 8; | 3571 | pcp->batch = PAGE_SHIFT * 8; |
3387 | } | 3572 | } |
3388 | 3573 | ||
3389 | static __meminit void setup_zone_pageset(struct zone *zone) | 3574 | static void setup_zone_pageset(struct zone *zone) |
3390 | { | 3575 | { |
3391 | int cpu; | 3576 | int cpu; |
3392 | 3577 | ||
@@ -3436,7 +3621,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
3436 | 3621 | ||
3437 | if (!slab_is_available()) { | 3622 | if (!slab_is_available()) { |
3438 | zone->wait_table = (wait_queue_head_t *) | 3623 | zone->wait_table = (wait_queue_head_t *) |
3439 | alloc_bootmem_node(pgdat, alloc_size); | 3624 | alloc_bootmem_node_nopanic(pgdat, alloc_size); |
3440 | } else { | 3625 | } else { |
3441 | /* | 3626 | /* |
3442 | * This case means that a zone whose size was 0 gets new memory | 3627 | * This case means that a zone whose size was 0 gets new memory |
@@ -3636,68 +3821,87 @@ void __init free_bootmem_with_active_regions(int nid, | |||
3636 | } | 3821 | } |
3637 | } | 3822 | } |
3638 | 3823 | ||
3639 | int __init add_from_early_node_map(struct range *range, int az, | 3824 | #ifdef CONFIG_HAVE_MEMBLOCK |
3640 | int nr_range, int nid) | 3825 | /* |
3826 | * Basic iterator support. Return the last range of PFNs for a node | ||
3827 | * Note: nid == MAX_NUMNODES returns last region regardless of node | ||
3828 | */ | ||
3829 | static int __meminit last_active_region_index_in_nid(int nid) | ||
3641 | { | 3830 | { |
3642 | int i; | 3831 | int i; |
3643 | u64 start, end; | ||
3644 | 3832 | ||
3645 | /* need to go over early_node_map to find out good range for node */ | 3833 | for (i = nr_nodemap_entries - 1; i >= 0; i--) |
3646 | for_each_active_range_index_in_nid(i, nid) { | 3834 | if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) |
3647 | start = early_node_map[i].start_pfn; | 3835 | return i; |
3648 | end = early_node_map[i].end_pfn; | 3836 | |
3649 | nr_range = add_range(range, az, nr_range, start, end); | 3837 | return -1; |
3650 | } | ||
3651 | return nr_range; | ||
3652 | } | 3838 | } |
3653 | 3839 | ||
3654 | #ifdef CONFIG_NO_BOOTMEM | 3840 | /* |
3655 | void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | 3841 | * Basic iterator support. Return the previous active range of PFNs for a node |
3842 | * Note: nid == MAX_NUMNODES returns next region regardless of node | ||
3843 | */ | ||
3844 | static int __meminit previous_active_region_index_in_nid(int index, int nid) | ||
3845 | { | ||
3846 | for (index = index - 1; index >= 0; index--) | ||
3847 | if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) | ||
3848 | return index; | ||
3849 | |||
3850 | return -1; | ||
3851 | } | ||
3852 | |||
3853 | #define for_each_active_range_index_in_nid_reverse(i, nid) \ | ||
3854 | for (i = last_active_region_index_in_nid(nid); i != -1; \ | ||
3855 | i = previous_active_region_index_in_nid(i, nid)) | ||
3856 | |||
3857 | u64 __init find_memory_core_early(int nid, u64 size, u64 align, | ||
3656 | u64 goal, u64 limit) | 3858 | u64 goal, u64 limit) |
3657 | { | 3859 | { |
3658 | int i; | 3860 | int i; |
3659 | void *ptr; | ||
3660 | |||
3661 | if (limit > get_max_mapped()) | ||
3662 | limit = get_max_mapped(); | ||
3663 | 3861 | ||
3664 | /* need to go over early_node_map to find out good range for node */ | 3862 | /* Need to go over early_node_map to find out good range for node */ |
3665 | for_each_active_range_index_in_nid(i, nid) { | 3863 | for_each_active_range_index_in_nid_reverse(i, nid) { |
3666 | u64 addr; | 3864 | u64 addr; |
3667 | u64 ei_start, ei_last; | 3865 | u64 ei_start, ei_last; |
3866 | u64 final_start, final_end; | ||
3668 | 3867 | ||
3669 | ei_last = early_node_map[i].end_pfn; | 3868 | ei_last = early_node_map[i].end_pfn; |
3670 | ei_last <<= PAGE_SHIFT; | 3869 | ei_last <<= PAGE_SHIFT; |
3671 | ei_start = early_node_map[i].start_pfn; | 3870 | ei_start = early_node_map[i].start_pfn; |
3672 | ei_start <<= PAGE_SHIFT; | 3871 | ei_start <<= PAGE_SHIFT; |
3673 | addr = find_early_area(ei_start, ei_last, | ||
3674 | goal, limit, size, align); | ||
3675 | 3872 | ||
3676 | if (addr == -1ULL) | 3873 | final_start = max(ei_start, goal); |
3874 | final_end = min(ei_last, limit); | ||
3875 | |||
3876 | if (final_start >= final_end) | ||
3677 | continue; | 3877 | continue; |
3678 | 3878 | ||
3679 | #if 0 | 3879 | addr = memblock_find_in_range(final_start, final_end, size, align); |
3680 | printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n", | ||
3681 | nid, | ||
3682 | ei_start, ei_last, goal, limit, size, | ||
3683 | align, addr); | ||
3684 | #endif | ||
3685 | 3880 | ||
3686 | ptr = phys_to_virt(addr); | 3881 | if (addr == MEMBLOCK_ERROR) |
3687 | memset(ptr, 0, size); | 3882 | continue; |
3688 | reserve_early_without_check(addr, addr + size, "BOOTMEM"); | 3883 | |
3689 | /* | 3884 | return addr; |
3690 | * The min_count is set to 0 so that bootmem allocated blocks | ||
3691 | * are never reported as leaks. | ||
3692 | */ | ||
3693 | kmemleak_alloc(ptr, size, 0, 0); | ||
3694 | return ptr; | ||
3695 | } | 3885 | } |
3696 | 3886 | ||
3697 | return NULL; | 3887 | return MEMBLOCK_ERROR; |
3698 | } | 3888 | } |
3699 | #endif | 3889 | #endif |
3700 | 3890 | ||
3891 | int __init add_from_early_node_map(struct range *range, int az, | ||
3892 | int nr_range, int nid) | ||
3893 | { | ||
3894 | int i; | ||
3895 | u64 start, end; | ||
3896 | |||
3897 | /* need to go over early_node_map to find out good range for node */ | ||
3898 | for_each_active_range_index_in_nid(i, nid) { | ||
3899 | start = early_node_map[i].start_pfn; | ||
3900 | end = early_node_map[i].end_pfn; | ||
3901 | nr_range = add_range(range, az, nr_range, start, end); | ||
3902 | } | ||
3903 | return nr_range; | ||
3904 | } | ||
3701 | 3905 | ||
3702 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) | 3906 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) |
3703 | { | 3907 | { |
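
With CONFIG_HAVE_MEMBLOCK, find_memory_core_early() now walks early_node_map from the highest region down, clamps each region to [goal, limit] and asks memblock for a block inside the clamped range, returning MEMBLOCK_ERROR when nothing fits. A stand-alone model of that walk (the region table, the alignment helper and the "highest fit" stand-in for memblock_find_in_range() are all invented):

    #include <stdint.h>
    #include <stdio.h>

    #define ERR ((uint64_t)~0ULL)           /* stands in for MEMBLOCK_ERROR */

    struct region { uint64_t start, end; }; /* half-open [start, end) byte ranges */

    /* Highest 'align'-aligned address in [start, end) with room for 'size';
     * align must be a power of two. */
    static uint64_t find_in_range(uint64_t start, uint64_t end,
                                  uint64_t size, uint64_t align)
    {
            if (end < start + size)
                    return ERR;
            return (end - size) & ~(align - 1);
    }

    static uint64_t find_core_early(const struct region *map, int n,
                                    uint64_t size, uint64_t align,
                                    uint64_t goal, uint64_t limit)
    {
            /* Walk the map in reverse so high memory is preferred. */
            for (int i = n - 1; i >= 0; i--) {
                    uint64_t start = map[i].start > goal ? map[i].start : goal;
                    uint64_t end = map[i].end < limit ? map[i].end : limit;
                    uint64_t addr;

                    if (start >= end)
                            continue;
                    addr = find_in_range(start, end, size, align);
                    if (addr != ERR && addr >= start)
                            return addr;
            }
            return ERR;
    }

    int main(void)
    {
            struct region map[] = { { 0x1000, 0x80000 }, { 0x100000, 0x200000 } };
            uint64_t addr = find_core_early(map, 2, 0x4000, 0x1000, 0, ~0ULL);

            printf("allocated at %#llx\n", (unsigned long long)addr); /* 0x1fc000 */
            return 0;
    }
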
@@ -3779,7 +3983,7 @@ static void __init find_usable_zone_for_movable(void) | |||
3779 | 3983 | ||
3780 | /* | 3984 | /* |
3781 | * The zone ranges provided by the architecture do not include ZONE_MOVABLE | 3985 | * The zone ranges provided by the architecture do not include ZONE_MOVABLE |
3782 | * because it is sized independant of architecture. Unlike the other zones, | 3986 | * because it is sized independent of architecture. Unlike the other zones, |
3783 | * the starting point for ZONE_MOVABLE is not fixed. It may be different | 3987 | * the starting point for ZONE_MOVABLE is not fixed. It may be different |
3784 | * in each node depending on the size of each node and how evenly kernelcore | 3988 | * in each node depending on the size of each node and how evenly kernelcore |
3785 | * is distributed. This helper function adjusts the zone ranges | 3989 | * is distributed. This helper function adjusts the zone ranges |
@@ -3994,10 +4198,11 @@ static void __init setup_usemap(struct pglist_data *pgdat, | |||
3994 | unsigned long usemapsize = usemap_size(zonesize); | 4198 | unsigned long usemapsize = usemap_size(zonesize); |
3995 | zone->pageblock_flags = NULL; | 4199 | zone->pageblock_flags = NULL; |
3996 | if (usemapsize) | 4200 | if (usemapsize) |
3997 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); | 4201 | zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, |
4202 | usemapsize); | ||
3998 | } | 4203 | } |
3999 | #else | 4204 | #else |
4000 | static void inline setup_usemap(struct pglist_data *pgdat, | 4205 | static inline void setup_usemap(struct pglist_data *pgdat, |
4001 | struct zone *zone, unsigned long zonesize) {} | 4206 | struct zone *zone, unsigned long zonesize) {} |
4002 | #endif /* CONFIG_SPARSEMEM */ | 4207 | #endif /* CONFIG_SPARSEMEM */ |
4003 | 4208 | ||
@@ -4114,10 +4319,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4114 | zone->zone_pgdat = pgdat; | 4319 | zone->zone_pgdat = pgdat; |
4115 | 4320 | ||
4116 | zone_pcp_init(zone); | 4321 | zone_pcp_init(zone); |
4117 | for_each_lru(l) { | 4322 | for_each_lru(l) |
4118 | INIT_LIST_HEAD(&zone->lru[l].list); | 4323 | INIT_LIST_HEAD(&zone->lru[l].list); |
4119 | zone->reclaim_stat.nr_saved_scan[l] = 0; | ||
4120 | } | ||
4121 | zone->reclaim_stat.recent_rotated[0] = 0; | 4324 | zone->reclaim_stat.recent_rotated[0] = 0; |
4122 | zone->reclaim_stat.recent_rotated[1] = 0; | 4325 | zone->reclaim_stat.recent_rotated[1] = 0; |
4123 | zone->reclaim_stat.recent_scanned[0] = 0; | 4326 | zone->reclaim_stat.recent_scanned[0] = 0; |
@@ -4160,7 +4363,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | |||
4160 | size = (end - start) * sizeof(struct page); | 4363 | size = (end - start) * sizeof(struct page); |
4161 | map = alloc_remap(pgdat->node_id, size); | 4364 | map = alloc_remap(pgdat->node_id, size); |
4162 | if (!map) | 4365 | if (!map) |
4163 | map = alloc_bootmem_node(pgdat, size); | 4366 | map = alloc_bootmem_node_nopanic(pgdat, size); |
4164 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); | 4367 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); |
4165 | } | 4368 | } |
4166 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4369 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
@@ -4732,15 +4935,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) | |||
4732 | dma_reserve = new_dma_reserve; | 4935 | dma_reserve = new_dma_reserve; |
4733 | } | 4936 | } |
4734 | 4937 | ||
4735 | #ifndef CONFIG_NEED_MULTIPLE_NODES | ||
4736 | struct pglist_data __refdata contig_page_data = { | ||
4737 | #ifndef CONFIG_NO_BOOTMEM | ||
4738 | .bdata = &bootmem_node_data[0] | ||
4739 | #endif | ||
4740 | }; | ||
4741 | EXPORT_SYMBOL(contig_page_data); | ||
4742 | #endif | ||
4743 | |||
4744 | void __init free_area_init(unsigned long *zones_size) | 4938 | void __init free_area_init(unsigned long *zones_size) |
4745 | { | 4939 | { |
4746 | free_area_init_node(0, zones_size, | 4940 | free_area_init_node(0, zones_size, |
@@ -4934,7 +5128,7 @@ void setup_per_zone_wmarks(void) | |||
4934 | * 1TB 101 10GB | 5128 | * 1TB 101 10GB |
4935 | * 10TB 320 32GB | 5129 | * 10TB 320 32GB |
4936 | */ | 5130 | */ |
4937 | void calculate_zone_inactive_ratio(struct zone *zone) | 5131 | static void __meminit calculate_zone_inactive_ratio(struct zone *zone) |
4938 | { | 5132 | { |
4939 | unsigned int gb, ratio; | 5133 | unsigned int gb, ratio; |
4940 | 5134 | ||
@@ -4948,7 +5142,7 @@ void calculate_zone_inactive_ratio(struct zone *zone) | |||
4948 | zone->inactive_ratio = ratio; | 5142 | zone->inactive_ratio = ratio; |
4949 | } | 5143 | } |
4950 | 5144 | ||
4951 | static void __init setup_per_zone_inactive_ratio(void) | 5145 | static void __meminit setup_per_zone_inactive_ratio(void) |
4952 | { | 5146 | { |
4953 | struct zone *zone; | 5147 | struct zone *zone; |
4954 | 5148 | ||
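
The ratio table in the comment above (e.g. 1TB -> 101, 10TB -> 320) follows a square-root rule: the inactive ratio of a zone is the integer square root of ten times the zone size in gigabytes, with a floor of 1. A small standalone sketch reproducing those rows, under the assumption that this is the sizing rule calculate_zone_inactive_ratio() applies; the program below is illustrative, not kernel code:

#include <stdio.h>

/* Integer square root by linear search; fine for these magnitudes. */
static unsigned long isqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* zone sizes in GB: 10GB, 100GB, 1TB, 10TB */
	unsigned long gb[] = { 10, 100, 1024, 10240 };
	int i;

	for (i = 0; i < 4; i++) {
		unsigned long ratio = isqrt(10 * gb[i]);

		if (!ratio)
			ratio = 1;
		/* 1024GB -> 101 and 10240GB -> 320, matching the table */
		printf("%6luGB: ratio %3lu, inactive anon capped near %luGB\n",
		       gb[i], ratio, gb[i] / ratio);
	}
	return 0;
}
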
@@ -4980,7 +5174,7 @@ static void __init setup_per_zone_inactive_ratio(void) | |||
4980 | * 8192MB: 11584k | 5174 | * 8192MB: 11584k |
4981 | * 16384MB: 16384k | 5175 | * 16384MB: 16384k |
4982 | */ | 5176 | */ |
4983 | static int __init init_per_zone_wmark_min(void) | 5177 | int __meminit init_per_zone_wmark_min(void) |
4984 | { | 5178 | { |
4985 | unsigned long lowmem_kbytes; | 5179 | unsigned long lowmem_kbytes; |
4986 | 5180 | ||
@@ -4992,6 +5186,7 @@ static int __init init_per_zone_wmark_min(void) | |||
4992 | if (min_free_kbytes > 65536) | 5186 | if (min_free_kbytes > 65536) |
4993 | min_free_kbytes = 65536; | 5187 | min_free_kbytes = 65536; |
4994 | setup_per_zone_wmarks(); | 5188 | setup_per_zone_wmarks(); |
5189 | refresh_zone_stat_thresholds(); | ||
4995 | setup_per_zone_lowmem_reserve(); | 5190 | setup_per_zone_lowmem_reserve(); |
4996 | setup_per_zone_inactive_ratio(); | 5191 | setup_per_zone_inactive_ratio(); |
4997 | return 0; | 5192 | return 0; |
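
For scale on the clamp visible in this hunk: the comment table above comes from sizing min_free_kbytes as the square root of (lowmem in kilobytes times 16), then pinning the result between 128k and 65536k, so the 64MB upper clamp is only reached once lowmem passes roughly 256GB. A standalone sketch of the rule, assuming the sqrt(lowmem_kbytes * 16) formula described in the surrounding comment block; the program is illustrative only:

#include <stdio.h>

/* Integer square root by linear search; adequate for these sizes. */
static unsigned long isqrt(unsigned long long x)
{
	unsigned long long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return (unsigned long)r;
}

int main(void)
{
	/* lowmem sizes in MB: 512MB, 16GB, 256GB, 1TB */
	unsigned long long mb[] = { 512, 16384, 262144, 1048576 };
	int i;

	for (i = 0; i < 4; i++) {
		unsigned long long lowmem_kbytes = mb[i] * 1024;
		unsigned long wmark = isqrt(lowmem_kbytes * 16);

		if (wmark < 128)
			wmark = 128;
		if (wmark > 65536)
			wmark = 65536;	/* the cap applied in the hunk above */
		printf("%8lluMB lowmem -> min_free_kbytes = %luk\n",
		       mb[i], wmark);
	}
	return 0;
}
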
@@ -5281,26 +5476,71 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5281 | * page allocator never allocates memory from ISOLATE blocks. | 5476 | * page allocator never allocates memory from ISOLATE blocks. |
5282 | */ | 5477 | */ |
5283 | 5478 | ||
5479 | static int | ||
5480 | __count_immobile_pages(struct zone *zone, struct page *page, int count) | ||
5481 | { | ||
5482 | unsigned long pfn, iter, found; | ||
5483 | /* | ||
5484 | * To avoid noisy data, lru_add_drain_all() should be called first. | ||
5485 | * If the zone is ZONE_MOVABLE, it never contains immobile pages. | ||
5486 | */ | ||
5487 | if (zone_idx(zone) == ZONE_MOVABLE) | ||
5488 | return true; | ||
5489 | |||
5490 | if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) | ||
5491 | return true; | ||
5492 | |||
5493 | pfn = page_to_pfn(page); | ||
5494 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { | ||
5495 | unsigned long check = pfn + iter; | ||
5496 | |||
5497 | if (!pfn_valid_within(check)) | ||
5498 | continue; | ||
5499 | |||
5500 | page = pfn_to_page(check); | ||
5501 | if (!page_count(page)) { | ||
5502 | if (PageBuddy(page)) | ||
5503 | iter += (1 << page_order(page)) - 1; | ||
5504 | continue; | ||
5505 | } | ||
5506 | if (!PageLRU(page)) | ||
5507 | found++; | ||
5508 | /* | ||
5509 | * If there are RECLAIMABLE pages, we need to check them too. | ||
5510 | * But for now, memory offlining itself doesn't call shrink_slab(), | ||
5511 | * and that still needs to be fixed. | ||
5512 | */ | ||
5513 | /* | ||
5514 | * If the page is not RAM, page_count() should be 0; we don't need | ||
5515 | * any further check. This is a _used_, non-movable page. | ||
5516 | * | ||
5517 | * The problematic thing here is PG_reserved pages. PG_reserved | ||
5518 | * is set on both memory hole pages and _used_ kernel | ||
5519 | * pages at boot. | ||
5520 | */ | ||
5521 | if (found > count) | ||
5522 | return false; | ||
5523 | } | ||
5524 | return true; | ||
5525 | } | ||
5526 | |||
5527 | bool is_pageblock_removable_nolock(struct page *page) | ||
5528 | { | ||
5529 | struct zone *zone = page_zone(page); | ||
5530 | return __count_immobile_pages(zone, page, 0); | ||
5531 | } | ||
5532 | |||
5284 | int set_migratetype_isolate(struct page *page) | 5533 | int set_migratetype_isolate(struct page *page) |
5285 | { | 5534 | { |
5286 | struct zone *zone; | 5535 | struct zone *zone; |
5287 | struct page *curr_page; | 5536 | unsigned long flags, pfn; |
5288 | unsigned long flags, pfn, iter; | ||
5289 | unsigned long immobile = 0; | ||
5290 | struct memory_isolate_notify arg; | 5537 | struct memory_isolate_notify arg; |
5291 | int notifier_ret; | 5538 | int notifier_ret; |
5292 | int ret = -EBUSY; | 5539 | int ret = -EBUSY; |
5293 | int zone_idx; | ||
5294 | 5540 | ||
5295 | zone = page_zone(page); | 5541 | zone = page_zone(page); |
5296 | zone_idx = zone_idx(zone); | ||
5297 | 5542 | ||
5298 | spin_lock_irqsave(&zone->lock, flags); | 5543 | spin_lock_irqsave(&zone->lock, flags); |
5299 | if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE || | ||
5300 | zone_idx == ZONE_MOVABLE) { | ||
5301 | ret = 0; | ||
5302 | goto out; | ||
5303 | } | ||
5304 | 5544 | ||
5305 | pfn = page_to_pfn(page); | 5545 | pfn = page_to_pfn(page); |
5306 | arg.start_pfn = pfn; | 5546 | arg.start_pfn = pfn; |
@@ -5320,23 +5560,20 @@ int set_migratetype_isolate(struct page *page) | |||
5320 | */ | 5560 | */ |
5321 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); | 5561 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); |
5322 | notifier_ret = notifier_to_errno(notifier_ret); | 5562 | notifier_ret = notifier_to_errno(notifier_ret); |
5323 | if (notifier_ret || !arg.pages_found) | 5563 | if (notifier_ret) |
5324 | goto out; | 5564 | goto out; |
5325 | 5565 | /* | |
5326 | for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { | 5566 | * FIXME: For now, memory hotplug doesn't call shrink_slab() by itself. |
5327 | if (!pfn_valid_within(pfn)) | 5567 | * We just check MOVABLE pages. |
5328 | continue; | 5568 | */ |
5329 | 5569 | if (__count_immobile_pages(zone, page, arg.pages_found)) | |
5330 | curr_page = pfn_to_page(iter); | ||
5331 | if (!page_count(curr_page) || PageLRU(curr_page)) | ||
5332 | continue; | ||
5333 | |||
5334 | immobile++; | ||
5335 | } | ||
5336 | |||
5337 | if (arg.pages_found == immobile) | ||
5338 | ret = 0; | 5570 | ret = 0; |
5339 | 5571 | ||
5572 | /* | ||
5573 | * "immobile" means not-on-LRU pages. If the immobile count is larger | ||
5574 | * than the removable-by-driver pages reported by the notifier, we'll fail. | ||
5575 | */ | ||
5576 | |||
5340 | out: | 5577 | out: |
5341 | if (!ret) { | 5578 | if (!ret) { |
5342 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | 5579 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); |
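
To make the new control flow easier to follow: with the per-page loop gone, set_migratetype_isolate() succeeds when __count_immobile_pages() finds no more referenced, non-LRU pages in the pageblock than the memory-isolate notifier said a driver can free (arg.pages_found), the ZONE_MOVABLE and MIGRATE_MOVABLE short-circuits aside. A simplified userspace model of that scan, assuming a toy page descriptor; the names toy_page, pageblock_removable and budget are illustrative only:

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for struct page: only the fields the scan looks at. */
struct toy_page {
	int count;	/* page_count(): 0 means the page is free/unused      */
	bool buddy;	/* PageBuddy(): head of a free buddy block            */
	int order;	/* page_order(): log2 of the pages in that free block */
	bool lru;	/* PageLRU(): movable through the LRU/migration path  */
};

/*
 * Model of __count_immobile_pages(): walk one pageblock, skip free buddy
 * blocks in a single step, count pages that are in use but not on the
 * LRU, and fail as soon as that count exceeds the driver-reported budget.
 */
static bool pageblock_removable(struct toy_page *blk, int nr, int budget)
{
	int found = 0, i;

	for (i = 0; i < nr; i++) {
		if (!blk[i].count) {
			if (blk[i].buddy)
				i += (1 << blk[i].order) - 1;
			continue;
		}
		if (!blk[i].lru)
			found++;
		if (found > budget)
			return false;
	}
	return true;
}

int main(void)
{
	struct toy_page blk[8] = {
		{ .count = 0, .buddy = true, .order = 2 },	/* 4 free pages   */
		{ 0 }, { 0 }, { 0 },
		{ .count = 1, .lru = true },			/* movable (LRU)  */
		{ .count = 2 },					/* pinned, no LRU */
		{ 0 }, { 0 },
	};

	/* One pinned page: isolatable only if a driver claims at least one. */
	printf("budget 0 -> %s\n", pageblock_removable(blk, 8, 0) ? "ok" : "busy");
	printf("budget 1 -> %s\n", pageblock_removable(blk, 8, 1) ? "ok" : "busy");
	return 0;
}
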
@@ -5455,7 +5692,6 @@ static struct trace_print_flags pageflag_names[] = { | |||
5455 | {1UL << PG_swapcache, "swapcache" }, | 5692 | {1UL << PG_swapcache, "swapcache" }, |
5456 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | 5693 | {1UL << PG_mappedtodisk, "mappedtodisk" }, |
5457 | {1UL << PG_reclaim, "reclaim" }, | 5694 | {1UL << PG_reclaim, "reclaim" }, |
5458 | {1UL << PG_buddy, "buddy" }, | ||
5459 | {1UL << PG_swapbacked, "swapbacked" }, | 5695 | {1UL << PG_swapbacked, "swapbacked" }, |
5460 | {1UL << PG_unevictable, "unevictable" }, | 5696 | {1UL << PG_unevictable, "unevictable" }, |
5461 | #ifdef CONFIG_MMU | 5697 | #ifdef CONFIG_MMU |
@@ -5503,7 +5739,8 @@ void dump_page(struct page *page) | |||
5503 | { | 5739 | { |
5504 | printk(KERN_ALERT | 5740 | printk(KERN_ALERT |
5505 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | 5741 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", |
5506 | page, page_count(page), page_mapcount(page), | 5742 | page, atomic_read(&page->_count), page_mapcount(page), |
5507 | page->mapping, page->index); | 5743 | page->mapping, page->index); |
5508 | dump_page_flags(page->flags); | 5744 | dump_page_flags(page->flags); |
5745 | mem_cgroup_print_bad_page(page); | ||
5509 | } | 5746 | } |