diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-31 22:25:39 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-31 22:25:39 -0400 |
| commit | ac694dbdbc403c00e2c14d10bc7b8412cc378259 (patch) | |
| tree | e37328cfbeaf43716dd5914cad9179e57e84df76 /mm | |
| parent | a40a1d3d0a2fd613fdec6d89d3c053268ced76ed (diff) | |
| parent | 437ea90cc3afdca5229b41c6b1d38c4842756cb9 (diff) | |
Merge branch 'akpm' (Andrew's patch-bomb)
Merge Andrew's second set of patches:
- MM
- a few random fixes
- a couple of RTC leftovers
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (120 commits)
rtc/rtc-88pm80x: remove unneed devm_kfree
rtc/rtc-88pm80x: assign ret only when rtc_register_driver fails
mm: hugetlbfs: close race during teardown of hugetlbfs shared page tables
tmpfs: distribute interleave better across nodes
mm: remove redundant initialization
mm: warn if pg_data_t isn't initialized with zero
mips: zero out pg_data_t when it's allocated
memcg: fix memory accounting scalability in shrink_page_list
mm/sparse: remove index_init_lock
mm/sparse: more checks on mem_section number
mm/sparse: optimize sparse_index_alloc
memcg: add mem_cgroup_from_css() helper
memcg: further prevent OOM with too many dirty pages
memcg: prevent OOM with too many dirty pages
mm: mmu_notifier: fix freed page still mapped in secondary MMU
mm: memcg: only check anon swapin page charges for swap cache
mm: memcg: only check swap cache pages for repeated charging
mm: memcg: split swapin charge function into private and public part
mm: memcg: remove needless !mm fixup to init_mm when charging
mm: memcg: remove unneeded shmem charge type
...
Diffstat (limited to 'mm')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | mm/Kconfig | 5 |
| -rw-r--r-- | mm/Makefile | 8 |
| -rw-r--r-- | mm/backing-dev.c | 20 |
| -rw-r--r-- | mm/compaction.c | 63 |
| -rw-r--r-- | mm/fadvise.c | 18 |
| -rw-r--r-- | mm/highmem.c | 12 |
| -rw-r--r-- | mm/hugetlb.c | 195 |
| -rw-r--r-- | mm/hugetlb_cgroup.c | 418 |
| -rw-r--r-- | mm/hwpoison-inject.c | 2 |
| -rw-r--r-- | mm/internal.h | 8 |
| -rw-r--r-- | mm/memblock.c | 35 |
| -rw-r--r-- | mm/memcontrol.c | 390 |
| -rw-r--r-- | mm/memory-failure.c | 17 |
| -rw-r--r-- | mm/memory.c | 9 |
| -rw-r--r-- | mm/memory_hotplug.c | 20 |
| -rw-r--r-- | mm/migrate.c | 81 |
| -rw-r--r-- | mm/mmap.c | 5 |
| -rw-r--r-- | mm/mmu_notifier.c | 45 |
| -rw-r--r-- | mm/mmzone.c | 2 |
| -rw-r--r-- | mm/mremap.c | 2 |
| -rw-r--r-- | mm/oom_kill.c | 223 |
| -rw-r--r-- | mm/page_alloc.c | 318 |
| -rw-r--r-- | mm/page_cgroup.c | 2 |
| -rw-r--r-- | mm/page_io.c | 145 |
| -rw-r--r-- | mm/page_isolation.c | 93 |
| -rw-r--r-- | mm/shmem.c | 6 |
| -rw-r--r-- | mm/slab.c | 216 |
| -rw-r--r-- | mm/slub.c | 30 |
| -rw-r--r-- | mm/sparse.c | 29 |
| -rw-r--r-- | mm/swap.c | 52 |
| -rw-r--r-- | mm/swap_state.c | 7 |
| -rw-r--r-- | mm/swapfile.c | 145 |
| -rw-r--r-- | mm/vmalloc.c | 16 |
| -rw-r--r-- | mm/vmscan.c | 175 |
| -rw-r--r-- | mm/vmstat.c | 1 |

35 files changed, 2043 insertions, 770 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 82fed4eb2b6f..d5c8019c6627 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK | |||
140 | config NO_BOOTMEM | 140 | config NO_BOOTMEM |
141 | boolean | 141 | boolean |
142 | 142 | ||
143 | config MEMORY_ISOLATION | ||
144 | boolean | ||
145 | |||
143 | # eventually, we can have this option just 'select SPARSEMEM' | 146 | # eventually, we can have this option just 'select SPARSEMEM' |
144 | config MEMORY_HOTPLUG | 147 | config MEMORY_HOTPLUG |
145 | bool "Allow for memory hot-add" | 148 | bool "Allow for memory hot-add" |
149 | select MEMORY_ISOLATION | ||
146 | depends on SPARSEMEM || X86_64_ACPI_NUMA | 150 | depends on SPARSEMEM || X86_64_ACPI_NUMA |
147 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG | 151 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG |
148 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) | 152 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) |
@@ -272,6 +276,7 @@ config MEMORY_FAILURE | |||
272 | depends on MMU | 276 | depends on MMU |
273 | depends on ARCH_SUPPORTS_MEMORY_FAILURE | 277 | depends on ARCH_SUPPORTS_MEMORY_FAILURE |
274 | bool "Enable recovery from hardware memory errors" | 278 | bool "Enable recovery from hardware memory errors" |
279 | select MEMORY_ISOLATION | ||
275 | help | 280 | help |
276 | Enables code to recover from some memory failures on systems | 281 | Enables code to recover from some memory failures on systems |
277 | with MCA recovery. This allows a system to continue running | 282 | with MCA recovery. This allows a system to continue running |
diff --git a/mm/Makefile b/mm/Makefile
index 8e81fe263c94..92753e2d82da 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,8 +15,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | |||
15 | maccess.o page_alloc.o page-writeback.o \ | 15 | maccess.o page_alloc.o page-writeback.o \ |
16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
17 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 17 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
18 | page_isolation.o mm_init.o mmu_context.o percpu.o \ | 18 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
19 | compaction.o slab_common.o $(mmu-y) | 19 | compaction.o $(mmu-y) |
20 | 20 | ||
21 | obj-y += init-mm.o | 21 | obj-y += init-mm.o |
22 | 22 | ||
@@ -49,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o | |||
49 | obj-$(CONFIG_MIGRATION) += migrate.o | 49 | obj-$(CONFIG_MIGRATION) += migrate.o |
50 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 50 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
51 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o | 51 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o |
52 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 52 | obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o |
53 | obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o | ||
53 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | 54 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o |
54 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | 55 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o |
55 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o | 56 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o |
56 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | 57 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o |
57 | obj-$(CONFIG_CLEANCACHE) += cleancache.o | 58 | obj-$(CONFIG_CLEANCACHE) += cleancache.o |
59 | obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o | ||
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3387aea11209..6b4718e2ee34 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -886,3 +886,23 @@ out: | |||
886 | return ret; | 886 | return ret; |
887 | } | 887 | } |
888 | EXPORT_SYMBOL(wait_iff_congested); | 888 | EXPORT_SYMBOL(wait_iff_congested); |
889 | |||
890 | int pdflush_proc_obsolete(struct ctl_table *table, int write, | ||
891 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
892 | { | ||
893 | char kbuf[] = "0\n"; | ||
894 | |||
895 | if (*ppos) { | ||
896 | *lenp = 0; | ||
897 | return 0; | ||
898 | } | ||
899 | |||
900 | if (copy_to_user(buffer, kbuf, sizeof(kbuf))) | ||
901 | return -EFAULT; | ||
902 | printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n", | ||
903 | table->procname); | ||
904 | |||
905 | *lenp = 2; | ||
906 | *ppos += *lenp; | ||
907 | return 2; | ||
908 | } | ||
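
A note on the hunk above: pdflush_proc_obsolete() is a stub sysctl handler. Whatever /proc knob is pointed at it always reads back "0\n", and a one-time warning names the entry as scheduled for removal. Below is a minimal userspace sketch of what a reader of such a knob observes; the path used (/proc/sys/vm/nr_pdflush_threads) is an assumption for illustration, since this hunk only adds the handler, not the ctl_table entry that uses it.

```c
#include <stdio.h>

int main(void)
{
	/* Hypothetical obsolete knob wired to pdflush_proc_obsolete(); the
	 * exact path is an assumption -- the handler always reports "0\n". */
	FILE *f = fopen("/proc/sys/vm/nr_pdflush_threads", "r");
	char buf[16];

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("obsolete knob reads: %s", buf);	/* expect "0\n" */
	fclose(f);
	return 0;
}
```
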
diff --git a/mm/compaction.c b/mm/compaction.c
index 2f42d9528539..e78cb9688421 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -422,6 +422,17 @@ static void isolate_freepages(struct zone *zone, | |||
422 | pfn -= pageblock_nr_pages) { | 422 | pfn -= pageblock_nr_pages) { |
423 | unsigned long isolated; | 423 | unsigned long isolated; |
424 | 424 | ||
425 | /* | ||
426 | * Skip ahead if another thread is compacting in the area | ||
427 | * simultaneously. If we wrapped around, we can only skip | ||
428 | * ahead if zone->compact_cached_free_pfn also wrapped to | ||
429 | * above our starting point. | ||
430 | */ | ||
431 | if (cc->order > 0 && (!cc->wrapped || | ||
432 | zone->compact_cached_free_pfn > | ||
433 | cc->start_free_pfn)) | ||
434 | pfn = min(pfn, zone->compact_cached_free_pfn); | ||
435 | |||
425 | if (!pfn_valid(pfn)) | 436 | if (!pfn_valid(pfn)) |
426 | continue; | 437 | continue; |
427 | 438 | ||
@@ -461,8 +472,11 @@ static void isolate_freepages(struct zone *zone, | |||
461 | * looking for free pages, the search will restart here as | 472 | * looking for free pages, the search will restart here as |
462 | * page migration may have returned some pages to the allocator | 473 | * page migration may have returned some pages to the allocator |
463 | */ | 474 | */ |
464 | if (isolated) | 475 | if (isolated) { |
465 | high_pfn = max(high_pfn, pfn); | 476 | high_pfn = max(high_pfn, pfn); |
477 | if (cc->order > 0) | ||
478 | zone->compact_cached_free_pfn = high_pfn; | ||
479 | } | ||
466 | } | 480 | } |
467 | 481 | ||
468 | /* split_free_page does not map the pages */ | 482 | /* split_free_page does not map the pages */ |
@@ -556,6 +570,20 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
556 | return ISOLATE_SUCCESS; | 570 | return ISOLATE_SUCCESS; |
557 | } | 571 | } |
558 | 572 | ||
573 | /* | ||
574 | * Returns the start pfn of the last page block in a zone. This is the starting | ||
575 | * point for full compaction of a zone. Compaction searches for free pages from | ||
576 | * the end of each zone, while isolate_freepages_block scans forward inside each | ||
577 | * page block. | ||
578 | */ | ||
579 | static unsigned long start_free_pfn(struct zone *zone) | ||
580 | { | ||
581 | unsigned long free_pfn; | ||
582 | free_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
583 | free_pfn &= ~(pageblock_nr_pages-1); | ||
584 | return free_pfn; | ||
585 | } | ||
586 | |||
559 | static int compact_finished(struct zone *zone, | 587 | static int compact_finished(struct zone *zone, |
560 | struct compact_control *cc) | 588 | struct compact_control *cc) |
561 | { | 589 | { |
@@ -565,8 +593,26 @@ static int compact_finished(struct zone *zone, | |||
565 | if (fatal_signal_pending(current)) | 593 | if (fatal_signal_pending(current)) |
566 | return COMPACT_PARTIAL; | 594 | return COMPACT_PARTIAL; |
567 | 595 | ||
568 | /* Compaction run completes if the migrate and free scanner meet */ | 596 | /* |
569 | if (cc->free_pfn <= cc->migrate_pfn) | 597 | * A full (order == -1) compaction run starts at the beginning and |
598 | * end of a zone; it completes when the migrate and free scanner meet. | ||
599 | * A partial (order > 0) compaction can start with the free scanner | ||
600 | * at a random point in the zone, and may have to restart. | ||
601 | */ | ||
602 | if (cc->free_pfn <= cc->migrate_pfn) { | ||
603 | if (cc->order > 0 && !cc->wrapped) { | ||
604 | /* We started partway through; restart at the end. */ | ||
605 | unsigned long free_pfn = start_free_pfn(zone); | ||
606 | zone->compact_cached_free_pfn = free_pfn; | ||
607 | cc->free_pfn = free_pfn; | ||
608 | cc->wrapped = 1; | ||
609 | return COMPACT_CONTINUE; | ||
610 | } | ||
611 | return COMPACT_COMPLETE; | ||
612 | } | ||
613 | |||
614 | /* We wrapped around and ended up where we started. */ | ||
615 | if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn) | ||
570 | return COMPACT_COMPLETE; | 616 | return COMPACT_COMPLETE; |
571 | 617 | ||
572 | /* | 618 | /* |
@@ -664,8 +710,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
664 | 710 | ||
665 | /* Setup to move all movable pages to the end of the zone */ | 711 | /* Setup to move all movable pages to the end of the zone */ |
666 | cc->migrate_pfn = zone->zone_start_pfn; | 712 | cc->migrate_pfn = zone->zone_start_pfn; |
667 | cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; | 713 | |
668 | cc->free_pfn &= ~(pageblock_nr_pages-1); | 714 | if (cc->order > 0) { |
715 | /* Incremental compaction. Start where the last one stopped. */ | ||
716 | cc->free_pfn = zone->compact_cached_free_pfn; | ||
717 | cc->start_free_pfn = cc->free_pfn; | ||
718 | } else { | ||
719 | /* Order == -1 starts at the end of the zone. */ | ||
720 | cc->free_pfn = start_free_pfn(zone); | ||
721 | } | ||
669 | 722 | ||
670 | migrate_prep_local(); | 723 | migrate_prep_local(); |
671 | 724 | ||
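
A note on start_free_pfn() in the hunk above: it rounds the zone's end pfn down to a pageblock boundary with a bit mask, which is only valid because pageblock_nr_pages is a power of two. A standalone sketch of that arithmetic, using illustrative constants rather than the kernel's:

```c
/*
 * Illustration of the pageblock rounding used by start_free_pfn().
 * The constants are illustrative; in the kernel, pageblock_nr_pages
 * is a power of two, so "& ~(pageblock_nr_pages - 1)" rounds down.
 */
#include <stdio.h>

int main(void)
{
	unsigned long pageblock_nr_pages = 1024;	/* illustrative */
	unsigned long zone_start_pfn = 0x1000;		/* illustrative */
	unsigned long spanned_pages = 123456;		/* illustrative */

	unsigned long end_pfn = zone_start_pfn + spanned_pages;
	unsigned long free_pfn = end_pfn & ~(pageblock_nr_pages - 1);

	printf("zone end pfn: %lu\n", end_pfn);
	printf("free scanner starts at pfn %lu (pageblock-aligned)\n", free_pfn);
	return 0;
}
```
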
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 469491e0af79..9b75a045dbf4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
93 | spin_unlock(&file->f_lock); | 93 | spin_unlock(&file->f_lock); |
94 | break; | 94 | break; |
95 | case POSIX_FADV_WILLNEED: | 95 | case POSIX_FADV_WILLNEED: |
96 | if (!mapping->a_ops->readpage) { | ||
97 | ret = -EINVAL; | ||
98 | break; | ||
99 | } | ||
100 | |||
101 | /* First and last PARTIAL page! */ | 96 | /* First and last PARTIAL page! */ |
102 | start_index = offset >> PAGE_CACHE_SHIFT; | 97 | start_index = offset >> PAGE_CACHE_SHIFT; |
103 | end_index = endbyte >> PAGE_CACHE_SHIFT; | 98 | end_index = endbyte >> PAGE_CACHE_SHIFT; |
@@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
106 | nrpages = end_index - start_index + 1; | 101 | nrpages = end_index - start_index + 1; |
107 | if (!nrpages) | 102 | if (!nrpages) |
108 | nrpages = ~0UL; | 103 | nrpages = ~0UL; |
109 | 104 | ||
110 | ret = force_page_cache_readahead(mapping, file, | 105 | /* |
111 | start_index, | 106 | * Ignore return value because fadvise() shall return |
112 | nrpages); | 107 | * success even if filesystem can't retrieve a hint, |
113 | if (ret > 0) | 108 | */ |
114 | ret = 0; | 109 | force_page_cache_readahead(mapping, file, start_index, |
110 | nrpages); | ||
115 | break; | 111 | break; |
116 | case POSIX_FADV_NOREUSE: | 112 | case POSIX_FADV_NOREUSE: |
117 | break; | 113 | break; |
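
With the fadvise.c change above, POSIX_FADV_WILLNEED becomes purely best-effort: the syscall no longer fails just because the underlying filesystem cannot start readahead. A minimal userspace illustration of the caller's view (the file path is arbitrary):

```c
#define _XOPEN_SOURCE 600	/* for posix_fadvise() */
#include <fcntl.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/etc/hostname"; /* arbitrary */
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return 1;

	/* Hint that the whole file (len == 0) will be needed soon. */
	int ret = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
	printf("posix_fadvise(WILLNEED) on %s -> %d\n", path, ret); /* expect 0 */
	return 0;
}
```
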
diff --git a/mm/highmem.c b/mm/highmem.c
index 57d82c6250c3..d517cd16a6eb 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); | |||
94 | do { spin_unlock(&kmap_lock); (void)(flags); } while (0) | 94 | do { spin_unlock(&kmap_lock); (void)(flags); } while (0) |
95 | #endif | 95 | #endif |
96 | 96 | ||
97 | struct page *kmap_to_page(void *vaddr) | ||
98 | { | ||
99 | unsigned long addr = (unsigned long)vaddr; | ||
100 | |||
101 | if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) { | ||
102 | int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; | ||
103 | return pte_page(pkmap_page_table[i]); | ||
104 | } | ||
105 | |||
106 | return virt_to_page(addr); | ||
107 | } | ||
108 | |||
97 | static void flush_all_zero_pkmaps(void) | 109 | static void flush_all_zero_pkmaps(void) |
98 | { | 110 | { |
99 | int i; | 111 | int i; |
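
The new kmap_to_page() above distinguishes pkmap addresses from directly mapped ones and recovers the pkmap slot by simple pointer arithmetic. A toy model of that index calculation, with made-up constants standing in for the kernel's PKMAP layout:

```c
/* Toy model of the kmap_to_page() index math: for an address inside the
 * pkmap window, the slot is its page offset from PKMAP_ADDR(0).
 * PKMAP_BASE, LAST_PKMAP and PAGE_SHIFT here are illustrative only. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PKMAP_BASE	0xff800000UL		/* illustrative */
#define LAST_PKMAP	1024			/* illustrative */
#define PKMAP_ADDR(nr)	(PKMAP_BASE + ((unsigned long)(nr) << PAGE_SHIFT))

int main(void)
{
	unsigned long vaddr = PKMAP_ADDR(7) + 0x123;	/* somewhere in slot 7 */

	if (vaddr >= PKMAP_ADDR(0) && vaddr <= PKMAP_ADDR(LAST_PKMAP)) {
		int i = (vaddr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
		printf("address %#lx maps to pkmap slot %d\n", vaddr, i);
	}
	return 0;
}
```
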
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e198831276a3..bc727122dd44 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,17 +24,20 @@ | |||
24 | 24 | ||
25 | #include <asm/page.h> | 25 | #include <asm/page.h> |
26 | #include <asm/pgtable.h> | 26 | #include <asm/pgtable.h> |
27 | #include <linux/io.h> | 27 | #include <asm/tlb.h> |
28 | 28 | ||
29 | #include <linux/io.h> | ||
29 | #include <linux/hugetlb.h> | 30 | #include <linux/hugetlb.h> |
31 | #include <linux/hugetlb_cgroup.h> | ||
30 | #include <linux/node.h> | 32 | #include <linux/node.h> |
33 | #include <linux/hugetlb_cgroup.h> | ||
31 | #include "internal.h" | 34 | #include "internal.h" |
32 | 35 | ||
33 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 36 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
34 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; | 37 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; |
35 | unsigned long hugepages_treat_as_movable; | 38 | unsigned long hugepages_treat_as_movable; |
36 | 39 | ||
37 | static int max_hstate; | 40 | int hugetlb_max_hstate __read_mostly; |
38 | unsigned int default_hstate_idx; | 41 | unsigned int default_hstate_idx; |
39 | struct hstate hstates[HUGE_MAX_HSTATE]; | 42 | struct hstate hstates[HUGE_MAX_HSTATE]; |
40 | 43 | ||
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate; | |||
45 | static unsigned long __initdata default_hstate_max_huge_pages; | 48 | static unsigned long __initdata default_hstate_max_huge_pages; |
46 | static unsigned long __initdata default_hstate_size; | 49 | static unsigned long __initdata default_hstate_size; |
47 | 50 | ||
48 | #define for_each_hstate(h) \ | ||
49 | for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) | ||
50 | |||
51 | /* | 51 | /* |
52 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 52 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
53 | */ | 53 | */ |
54 | static DEFINE_SPINLOCK(hugetlb_lock); | 54 | DEFINE_SPINLOCK(hugetlb_lock); |
55 | 55 | ||
56 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) | 56 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) |
57 | { | 57 | { |
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src) | |||
509 | static void enqueue_huge_page(struct hstate *h, struct page *page) | 509 | static void enqueue_huge_page(struct hstate *h, struct page *page) |
510 | { | 510 | { |
511 | int nid = page_to_nid(page); | 511 | int nid = page_to_nid(page); |
512 | list_add(&page->lru, &h->hugepage_freelists[nid]); | 512 | list_move(&page->lru, &h->hugepage_freelists[nid]); |
513 | h->free_huge_pages++; | 513 | h->free_huge_pages++; |
514 | h->free_huge_pages_node[nid]++; | 514 | h->free_huge_pages_node[nid]++; |
515 | } | 515 | } |
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) | |||
521 | if (list_empty(&h->hugepage_freelists[nid])) | 521 | if (list_empty(&h->hugepage_freelists[nid])) |
522 | return NULL; | 522 | return NULL; |
523 | page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); | 523 | page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); |
524 | list_del(&page->lru); | 524 | list_move(&page->lru, &h->hugepage_activelist); |
525 | set_page_refcounted(page); | 525 | set_page_refcounted(page); |
526 | h->free_huge_pages--; | 526 | h->free_huge_pages--; |
527 | h->free_huge_pages_node[nid]--; | 527 | h->free_huge_pages_node[nid]--; |
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) | |||
593 | 1 << PG_active | 1 << PG_reserved | | 593 | 1 << PG_active | 1 << PG_reserved | |
594 | 1 << PG_private | 1 << PG_writeback); | 594 | 1 << PG_private | 1 << PG_writeback); |
595 | } | 595 | } |
596 | VM_BUG_ON(hugetlb_cgroup_from_page(page)); | ||
596 | set_compound_page_dtor(page, NULL); | 597 | set_compound_page_dtor(page, NULL); |
597 | set_page_refcounted(page); | 598 | set_page_refcounted(page); |
598 | arch_release_hugepage(page); | 599 | arch_release_hugepage(page); |
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page) | |||
625 | page->mapping = NULL; | 626 | page->mapping = NULL; |
626 | BUG_ON(page_count(page)); | 627 | BUG_ON(page_count(page)); |
627 | BUG_ON(page_mapcount(page)); | 628 | BUG_ON(page_mapcount(page)); |
628 | INIT_LIST_HEAD(&page->lru); | ||
629 | 629 | ||
630 | spin_lock(&hugetlb_lock); | 630 | spin_lock(&hugetlb_lock); |
631 | hugetlb_cgroup_uncharge_page(hstate_index(h), | ||
632 | pages_per_huge_page(h), page); | ||
631 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { | 633 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { |
634 | /* remove the page from active list */ | ||
635 | list_del(&page->lru); | ||
632 | update_and_free_page(h, page); | 636 | update_and_free_page(h, page); |
633 | h->surplus_huge_pages--; | 637 | h->surplus_huge_pages--; |
634 | h->surplus_huge_pages_node[nid]--; | 638 | h->surplus_huge_pages_node[nid]--; |
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page) | |||
641 | 645 | ||
642 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | 646 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
643 | { | 647 | { |
648 | INIT_LIST_HEAD(&page->lru); | ||
644 | set_compound_page_dtor(page, free_huge_page); | 649 | set_compound_page_dtor(page, free_huge_page); |
645 | spin_lock(&hugetlb_lock); | 650 | spin_lock(&hugetlb_lock); |
651 | set_hugetlb_cgroup(page, NULL); | ||
646 | h->nr_huge_pages++; | 652 | h->nr_huge_pages++; |
647 | h->nr_huge_pages_node[nid]++; | 653 | h->nr_huge_pages_node[nid]++; |
648 | spin_unlock(&hugetlb_lock); | 654 | spin_unlock(&hugetlb_lock); |
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
889 | 895 | ||
890 | spin_lock(&hugetlb_lock); | 896 | spin_lock(&hugetlb_lock); |
891 | if (page) { | 897 | if (page) { |
898 | INIT_LIST_HEAD(&page->lru); | ||
892 | r_nid = page_to_nid(page); | 899 | r_nid = page_to_nid(page); |
893 | set_compound_page_dtor(page, free_huge_page); | 900 | set_compound_page_dtor(page, free_huge_page); |
901 | set_hugetlb_cgroup(page, NULL); | ||
894 | /* | 902 | /* |
895 | * We incremented the global counters already | 903 | * We incremented the global counters already |
896 | */ | 904 | */ |
@@ -993,7 +1001,6 @@ retry: | |||
993 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 1001 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
994 | if ((--needed) < 0) | 1002 | if ((--needed) < 0) |
995 | break; | 1003 | break; |
996 | list_del(&page->lru); | ||
997 | /* | 1004 | /* |
998 | * This page is now managed by the hugetlb allocator and has | 1005 | * This page is now managed by the hugetlb allocator and has |
999 | * no users -- drop the buddy allocator's reference. | 1006 | * no users -- drop the buddy allocator's reference. |
@@ -1008,7 +1015,6 @@ free: | |||
1008 | /* Free unnecessary surplus pages to the buddy allocator */ | 1015 | /* Free unnecessary surplus pages to the buddy allocator */ |
1009 | if (!list_empty(&surplus_list)) { | 1016 | if (!list_empty(&surplus_list)) { |
1010 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 1017 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
1011 | list_del(&page->lru); | ||
1012 | put_page(page); | 1018 | put_page(page); |
1013 | } | 1019 | } |
1014 | } | 1020 | } |
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1112 | struct hstate *h = hstate_vma(vma); | 1118 | struct hstate *h = hstate_vma(vma); |
1113 | struct page *page; | 1119 | struct page *page; |
1114 | long chg; | 1120 | long chg; |
1121 | int ret, idx; | ||
1122 | struct hugetlb_cgroup *h_cg; | ||
1115 | 1123 | ||
1124 | idx = hstate_index(h); | ||
1116 | /* | 1125 | /* |
1117 | * Processes that did not create the mapping will have no | 1126 | * Processes that did not create the mapping will have no |
1118 | * reserves and will not have accounted against subpool | 1127 | * reserves and will not have accounted against subpool |
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1123 | */ | 1132 | */ |
1124 | chg = vma_needs_reservation(h, vma, addr); | 1133 | chg = vma_needs_reservation(h, vma, addr); |
1125 | if (chg < 0) | 1134 | if (chg < 0) |
1126 | return ERR_PTR(-VM_FAULT_OOM); | 1135 | return ERR_PTR(-ENOMEM); |
1127 | if (chg) | 1136 | if (chg) |
1128 | if (hugepage_subpool_get_pages(spool, chg)) | 1137 | if (hugepage_subpool_get_pages(spool, chg)) |
1129 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1138 | return ERR_PTR(-ENOSPC); |
1130 | 1139 | ||
1140 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); | ||
1141 | if (ret) { | ||
1142 | hugepage_subpool_put_pages(spool, chg); | ||
1143 | return ERR_PTR(-ENOSPC); | ||
1144 | } | ||
1131 | spin_lock(&hugetlb_lock); | 1145 | spin_lock(&hugetlb_lock); |
1132 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); | 1146 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); |
1133 | spin_unlock(&hugetlb_lock); | 1147 | if (page) { |
1134 | 1148 | /* update page cgroup details */ | |
1135 | if (!page) { | 1149 | hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), |
1150 | h_cg, page); | ||
1151 | spin_unlock(&hugetlb_lock); | ||
1152 | } else { | ||
1153 | spin_unlock(&hugetlb_lock); | ||
1136 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1154 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
1137 | if (!page) { | 1155 | if (!page) { |
1156 | hugetlb_cgroup_uncharge_cgroup(idx, | ||
1157 | pages_per_huge_page(h), | ||
1158 | h_cg); | ||
1138 | hugepage_subpool_put_pages(spool, chg); | 1159 | hugepage_subpool_put_pages(spool, chg); |
1139 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1160 | return ERR_PTR(-ENOSPC); |
1140 | } | 1161 | } |
1162 | spin_lock(&hugetlb_lock); | ||
1163 | hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), | ||
1164 | h_cg, page); | ||
1165 | list_move(&page->lru, &h->hugepage_activelist); | ||
1166 | spin_unlock(&hugetlb_lock); | ||
1141 | } | 1167 | } |
1142 | 1168 | ||
1143 | set_page_private(page, (unsigned long)spool); | 1169 | set_page_private(page, (unsigned long)spool); |
1144 | 1170 | ||
1145 | vma_commit_reservation(h, vma, addr); | 1171 | vma_commit_reservation(h, vma, addr); |
1146 | |||
1147 | return page; | 1172 | return page; |
1148 | } | 1173 | } |
1149 | 1174 | ||
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, | |||
1646 | struct attribute_group *hstate_attr_group) | 1671 | struct attribute_group *hstate_attr_group) |
1647 | { | 1672 | { |
1648 | int retval; | 1673 | int retval; |
1649 | int hi = h - hstates; | 1674 | int hi = hstate_index(h); |
1650 | 1675 | ||
1651 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); | 1676 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); |
1652 | if (!hstate_kobjs[hi]) | 1677 | if (!hstate_kobjs[hi]) |
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node) | |||
1741 | if (!nhs->hugepages_kobj) | 1766 | if (!nhs->hugepages_kobj) |
1742 | return; /* no hstate attributes */ | 1767 | return; /* no hstate attributes */ |
1743 | 1768 | ||
1744 | for_each_hstate(h) | 1769 | for_each_hstate(h) { |
1745 | if (nhs->hstate_kobjs[h - hstates]) { | 1770 | int idx = hstate_index(h); |
1746 | kobject_put(nhs->hstate_kobjs[h - hstates]); | 1771 | if (nhs->hstate_kobjs[idx]) { |
1747 | nhs->hstate_kobjs[h - hstates] = NULL; | 1772 | kobject_put(nhs->hstate_kobjs[idx]); |
1773 | nhs->hstate_kobjs[idx] = NULL; | ||
1748 | } | 1774 | } |
1775 | } | ||
1749 | 1776 | ||
1750 | kobject_put(nhs->hugepages_kobj); | 1777 | kobject_put(nhs->hugepages_kobj); |
1751 | nhs->hugepages_kobj = NULL; | 1778 | nhs->hugepages_kobj = NULL; |
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void) | |||
1848 | hugetlb_unregister_all_nodes(); | 1875 | hugetlb_unregister_all_nodes(); |
1849 | 1876 | ||
1850 | for_each_hstate(h) { | 1877 | for_each_hstate(h) { |
1851 | kobject_put(hstate_kobjs[h - hstates]); | 1878 | kobject_put(hstate_kobjs[hstate_index(h)]); |
1852 | } | 1879 | } |
1853 | 1880 | ||
1854 | kobject_put(hugepages_kobj); | 1881 | kobject_put(hugepages_kobj); |
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void) | |||
1869 | if (!size_to_hstate(default_hstate_size)) | 1896 | if (!size_to_hstate(default_hstate_size)) |
1870 | hugetlb_add_hstate(HUGETLB_PAGE_ORDER); | 1897 | hugetlb_add_hstate(HUGETLB_PAGE_ORDER); |
1871 | } | 1898 | } |
1872 | default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; | 1899 | default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); |
1873 | if (default_hstate_max_huge_pages) | 1900 | if (default_hstate_max_huge_pages) |
1874 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; | 1901 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; |
1875 | 1902 | ||
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1897 | printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); | 1924 | printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); |
1898 | return; | 1925 | return; |
1899 | } | 1926 | } |
1900 | BUG_ON(max_hstate >= HUGE_MAX_HSTATE); | 1927 | BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); |
1901 | BUG_ON(order == 0); | 1928 | BUG_ON(order == 0); |
1902 | h = &hstates[max_hstate++]; | 1929 | h = &hstates[hugetlb_max_hstate++]; |
1903 | h->order = order; | 1930 | h->order = order; |
1904 | h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); | 1931 | h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); |
1905 | h->nr_huge_pages = 0; | 1932 | h->nr_huge_pages = 0; |
1906 | h->free_huge_pages = 0; | 1933 | h->free_huge_pages = 0; |
1907 | for (i = 0; i < MAX_NUMNODES; ++i) | 1934 | for (i = 0; i < MAX_NUMNODES; ++i) |
1908 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1935 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
1936 | INIT_LIST_HEAD(&h->hugepage_activelist); | ||
1909 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); | 1937 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); |
1910 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); | 1938 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); |
1911 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1939 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1912 | huge_page_size(h)/1024); | 1940 | huge_page_size(h)/1024); |
1941 | /* | ||
1942 | * Add cgroup control files only if the huge page consists | ||
1943 | * of more than two normal pages. This is because we use | ||
1944 | * page[2].lru.next for storing cgroup details. | ||
1945 | */ | ||
1946 | if (order >= HUGETLB_CGROUP_MIN_ORDER) | ||
1947 | hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); | ||
1913 | 1948 | ||
1914 | parsed_hstate = h; | 1949 | parsed_hstate = h; |
1915 | } | 1950 | } |
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s) | |||
1920 | static unsigned long *last_mhp; | 1955 | static unsigned long *last_mhp; |
1921 | 1956 | ||
1922 | /* | 1957 | /* |
1923 | * !max_hstate means we haven't parsed a hugepagesz= parameter yet, | 1958 | * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, |
1924 | * so this hugepages= parameter goes to the "default hstate". | 1959 | * so this hugepages= parameter goes to the "default hstate". |
1925 | */ | 1960 | */ |
1926 | if (!max_hstate) | 1961 | if (!hugetlb_max_hstate) |
1927 | mhp = &default_hstate_max_huge_pages; | 1962 | mhp = &default_hstate_max_huge_pages; |
1928 | else | 1963 | else |
1929 | mhp = &parsed_hstate->max_huge_pages; | 1964 | mhp = &parsed_hstate->max_huge_pages; |
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s) | |||
1942 | * But we need to allocate >= MAX_ORDER hstates here early to still | 1977 | * But we need to allocate >= MAX_ORDER hstates here early to still |
1943 | * use the bootmem allocator. | 1978 | * use the bootmem allocator. |
1944 | */ | 1979 | */ |
1945 | if (max_hstate && parsed_hstate->order >= MAX_ORDER) | 1980 | if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) |
1946 | hugetlb_hstate_alloc_pages(parsed_hstate); | 1981 | hugetlb_hstate_alloc_pages(parsed_hstate); |
1947 | 1982 | ||
1948 | last_mhp = mhp; | 1983 | last_mhp = mhp; |
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) | |||
2308 | return 0; | 2343 | return 0; |
2309 | } | 2344 | } |
2310 | 2345 | ||
2311 | void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 2346 | void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, |
2312 | unsigned long end, struct page *ref_page) | 2347 | unsigned long start, unsigned long end, |
2348 | struct page *ref_page) | ||
2313 | { | 2349 | { |
2350 | int force_flush = 0; | ||
2314 | struct mm_struct *mm = vma->vm_mm; | 2351 | struct mm_struct *mm = vma->vm_mm; |
2315 | unsigned long address; | 2352 | unsigned long address; |
2316 | pte_t *ptep; | 2353 | pte_t *ptep; |
2317 | pte_t pte; | 2354 | pte_t pte; |
2318 | struct page *page; | 2355 | struct page *page; |
2319 | struct page *tmp; | ||
2320 | struct hstate *h = hstate_vma(vma); | 2356 | struct hstate *h = hstate_vma(vma); |
2321 | unsigned long sz = huge_page_size(h); | 2357 | unsigned long sz = huge_page_size(h); |
2322 | 2358 | ||
2323 | /* | ||
2324 | * A page gathering list, protected by per file i_mmap_mutex. The | ||
2325 | * lock is used to avoid list corruption from multiple unmapping | ||
2326 | * of the same page since we are using page->lru. | ||
2327 | */ | ||
2328 | LIST_HEAD(page_list); | ||
2329 | |||
2330 | WARN_ON(!is_vm_hugetlb_page(vma)); | 2359 | WARN_ON(!is_vm_hugetlb_page(vma)); |
2331 | BUG_ON(start & ~huge_page_mask(h)); | 2360 | BUG_ON(start & ~huge_page_mask(h)); |
2332 | BUG_ON(end & ~huge_page_mask(h)); | 2361 | BUG_ON(end & ~huge_page_mask(h)); |
2333 | 2362 | ||
2363 | tlb_start_vma(tlb, vma); | ||
2334 | mmu_notifier_invalidate_range_start(mm, start, end); | 2364 | mmu_notifier_invalidate_range_start(mm, start, end); |
2365 | again: | ||
2335 | spin_lock(&mm->page_table_lock); | 2366 | spin_lock(&mm->page_table_lock); |
2336 | for (address = start; address < end; address += sz) { | 2367 | for (address = start; address < end; address += sz) { |
2337 | ptep = huge_pte_offset(mm, address); | 2368 | ptep = huge_pte_offset(mm, address); |
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
2370 | } | 2401 | } |
2371 | 2402 | ||
2372 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 2403 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
2404 | tlb_remove_tlb_entry(tlb, ptep, address); | ||
2373 | if (pte_dirty(pte)) | 2405 | if (pte_dirty(pte)) |
2374 | set_page_dirty(page); | 2406 | set_page_dirty(page); |
2375 | list_add(&page->lru, &page_list); | ||
2376 | 2407 | ||
2408 | page_remove_rmap(page); | ||
2409 | force_flush = !__tlb_remove_page(tlb, page); | ||
2410 | if (force_flush) | ||
2411 | break; | ||
2377 | /* Bail out after unmapping reference page if supplied */ | 2412 | /* Bail out after unmapping reference page if supplied */ |
2378 | if (ref_page) | 2413 | if (ref_page) |
2379 | break; | 2414 | break; |
2380 | } | 2415 | } |
2381 | flush_tlb_range(vma, start, end); | ||
2382 | spin_unlock(&mm->page_table_lock); | 2416 | spin_unlock(&mm->page_table_lock); |
2383 | mmu_notifier_invalidate_range_end(mm, start, end); | 2417 | /* |
2384 | list_for_each_entry_safe(page, tmp, &page_list, lru) { | 2418 | * mmu_gather ran out of room to batch pages, we break out of |
2385 | page_remove_rmap(page); | 2419 | * the PTE lock to avoid doing the potential expensive TLB invalidate |
2386 | list_del(&page->lru); | 2420 | * and page-free while holding it. |
2387 | put_page(page); | 2421 | */ |
2422 | if (force_flush) { | ||
2423 | force_flush = 0; | ||
2424 | tlb_flush_mmu(tlb); | ||
2425 | if (address < end && !ref_page) | ||
2426 | goto again; | ||
2388 | } | 2427 | } |
2428 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
2429 | tlb_end_vma(tlb, vma); | ||
2430 | } | ||
2431 | |||
2432 | void __unmap_hugepage_range_final(struct mmu_gather *tlb, | ||
2433 | struct vm_area_struct *vma, unsigned long start, | ||
2434 | unsigned long end, struct page *ref_page) | ||
2435 | { | ||
2436 | __unmap_hugepage_range(tlb, vma, start, end, ref_page); | ||
2437 | |||
2438 | /* | ||
2439 | * Clear this flag so that x86's huge_pmd_share page_table_shareable | ||
2440 | * test will fail on a vma being torn down, and not grab a page table | ||
2441 | * on its way out. We're lucky that the flag has such an appropriate | ||
2442 | * name, and can in fact be safely cleared here. We could clear it | ||
2443 | * before the __unmap_hugepage_range above, but all that's necessary | ||
2444 | * is to clear it before releasing the i_mmap_mutex. This works | ||
2445 | * because in the context this is called, the VMA is about to be | ||
2446 | * destroyed and the i_mmap_mutex is held. | ||
2447 | */ | ||
2448 | vma->vm_flags &= ~VM_MAYSHARE; | ||
2389 | } | 2449 | } |
2390 | 2450 | ||
2391 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 2451 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, |
2392 | unsigned long end, struct page *ref_page) | 2452 | unsigned long end, struct page *ref_page) |
2393 | { | 2453 | { |
2394 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); | 2454 | struct mm_struct *mm; |
2395 | __unmap_hugepage_range(vma, start, end, ref_page); | 2455 | struct mmu_gather tlb; |
2396 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 2456 | |
2457 | mm = vma->vm_mm; | ||
2458 | |||
2459 | tlb_gather_mmu(&tlb, mm, 0); | ||
2460 | __unmap_hugepage_range(&tlb, vma, start, end, ref_page); | ||
2461 | tlb_finish_mmu(&tlb, start, end); | ||
2397 | } | 2462 | } |
2398 | 2463 | ||
2399 | /* | 2464 | /* |
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2438 | * from the time of fork. This would look like data corruption | 2503 | * from the time of fork. This would look like data corruption |
2439 | */ | 2504 | */ |
2440 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) | 2505 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) |
2441 | __unmap_hugepage_range(iter_vma, | 2506 | unmap_hugepage_range(iter_vma, address, |
2442 | address, address + huge_page_size(h), | 2507 | address + huge_page_size(h), page); |
2443 | page); | ||
2444 | } | 2508 | } |
2445 | mutex_unlock(&mapping->i_mmap_mutex); | 2509 | mutex_unlock(&mapping->i_mmap_mutex); |
2446 | 2510 | ||
@@ -2496,6 +2560,7 @@ retry_avoidcopy: | |||
2496 | new_page = alloc_huge_page(vma, address, outside_reserve); | 2560 | new_page = alloc_huge_page(vma, address, outside_reserve); |
2497 | 2561 | ||
2498 | if (IS_ERR(new_page)) { | 2562 | if (IS_ERR(new_page)) { |
2563 | long err = PTR_ERR(new_page); | ||
2499 | page_cache_release(old_page); | 2564 | page_cache_release(old_page); |
2500 | 2565 | ||
2501 | /* | 2566 | /* |
@@ -2524,7 +2589,10 @@ retry_avoidcopy: | |||
2524 | 2589 | ||
2525 | /* Caller expects lock to be held */ | 2590 | /* Caller expects lock to be held */ |
2526 | spin_lock(&mm->page_table_lock); | 2591 | spin_lock(&mm->page_table_lock); |
2527 | return -PTR_ERR(new_page); | 2592 | if (err == -ENOMEM) |
2593 | return VM_FAULT_OOM; | ||
2594 | else | ||
2595 | return VM_FAULT_SIGBUS; | ||
2528 | } | 2596 | } |
2529 | 2597 | ||
2530 | /* | 2598 | /* |
@@ -2642,7 +2710,11 @@ retry: | |||
2642 | goto out; | 2710 | goto out; |
2643 | page = alloc_huge_page(vma, address, 0); | 2711 | page = alloc_huge_page(vma, address, 0); |
2644 | if (IS_ERR(page)) { | 2712 | if (IS_ERR(page)) { |
2645 | ret = -PTR_ERR(page); | 2713 | ret = PTR_ERR(page); |
2714 | if (ret == -ENOMEM) | ||
2715 | ret = VM_FAULT_OOM; | ||
2716 | else | ||
2717 | ret = VM_FAULT_SIGBUS; | ||
2646 | goto out; | 2718 | goto out; |
2647 | } | 2719 | } |
2648 | clear_huge_page(page, address, pages_per_huge_page(h)); | 2720 | clear_huge_page(page, address, pages_per_huge_page(h)); |
@@ -2679,7 +2751,7 @@ retry: | |||
2679 | */ | 2751 | */ |
2680 | if (unlikely(PageHWPoison(page))) { | 2752 | if (unlikely(PageHWPoison(page))) { |
2681 | ret = VM_FAULT_HWPOISON | | 2753 | ret = VM_FAULT_HWPOISON | |
2682 | VM_FAULT_SET_HINDEX(h - hstates); | 2754 | VM_FAULT_SET_HINDEX(hstate_index(h)); |
2683 | goto backout_unlocked; | 2755 | goto backout_unlocked; |
2684 | } | 2756 | } |
2685 | } | 2757 | } |
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2752 | return 0; | 2824 | return 0; |
2753 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | 2825 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) |
2754 | return VM_FAULT_HWPOISON_LARGE | | 2826 | return VM_FAULT_HWPOISON_LARGE | |
2755 | VM_FAULT_SET_HINDEX(h - hstates); | 2827 | VM_FAULT_SET_HINDEX(hstate_index(h)); |
2756 | } | 2828 | } |
2757 | 2829 | ||
2758 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); | 2830 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); |
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
2959 | } | 3031 | } |
2960 | } | 3032 | } |
2961 | spin_unlock(&mm->page_table_lock); | 3033 | spin_unlock(&mm->page_table_lock); |
2962 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3034 | /* |
2963 | 3035 | * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare | |
3036 | * may have cleared our pud entry and done put_page on the page table: | ||
3037 | * once we release i_mmap_mutex, another task can do the final put_page | ||
3038 | * and that page table be reused and filled with junk. | ||
3039 | */ | ||
2964 | flush_tlb_range(vma, start, end); | 3040 | flush_tlb_range(vma, start, end); |
3041 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | ||
2965 | } | 3042 | } |
2966 | 3043 | ||
2967 | int hugetlb_reserve_pages(struct inode *inode, | 3044 | int hugetlb_reserve_pages(struct inode *inode, |
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
new file mode 100644
index 000000000000..a3f358fb8a0c
--- /dev/null
+++ b/mm/hugetlb_cgroup.c
@@ -0,0 +1,418 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Copyright IBM Corporation, 2012 | ||
4 | * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms of version 2.1 of the GNU Lesser General Public License | ||
8 | * as published by the Free Software Foundation. | ||
9 | * | ||
10 | * This program is distributed in the hope that it would be useful, but | ||
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/cgroup.h> | ||
17 | #include <linux/slab.h> | ||
18 | #include <linux/hugetlb.h> | ||
19 | #include <linux/hugetlb_cgroup.h> | ||
20 | |||
21 | struct hugetlb_cgroup { | ||
22 | struct cgroup_subsys_state css; | ||
23 | /* | ||
24 | * the counter to account for hugepages from hugetlb. | ||
25 | */ | ||
26 | struct res_counter hugepage[HUGE_MAX_HSTATE]; | ||
27 | }; | ||
28 | |||
29 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | ||
30 | #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) | ||
31 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | ||
32 | |||
33 | struct cgroup_subsys hugetlb_subsys __read_mostly; | ||
34 | static struct hugetlb_cgroup *root_h_cgroup __read_mostly; | ||
35 | |||
36 | static inline | ||
37 | struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) | ||
38 | { | ||
39 | return container_of(s, struct hugetlb_cgroup, css); | ||
40 | } | ||
41 | |||
42 | static inline | ||
43 | struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup) | ||
44 | { | ||
45 | return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup, | ||
46 | hugetlb_subsys_id)); | ||
47 | } | ||
48 | |||
49 | static inline | ||
50 | struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) | ||
51 | { | ||
52 | return hugetlb_cgroup_from_css(task_subsys_state(task, | ||
53 | hugetlb_subsys_id)); | ||
54 | } | ||
55 | |||
56 | static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) | ||
57 | { | ||
58 | return (h_cg == root_h_cgroup); | ||
59 | } | ||
60 | |||
61 | static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg) | ||
62 | { | ||
63 | if (!cg->parent) | ||
64 | return NULL; | ||
65 | return hugetlb_cgroup_from_cgroup(cg->parent); | ||
66 | } | ||
67 | |||
68 | static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg) | ||
69 | { | ||
70 | int idx; | ||
71 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg); | ||
72 | |||
73 | for (idx = 0; idx < hugetlb_max_hstate; idx++) { | ||
74 | if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) | ||
75 | return true; | ||
76 | } | ||
77 | return false; | ||
78 | } | ||
79 | |||
80 | static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) | ||
81 | { | ||
82 | int idx; | ||
83 | struct cgroup *parent_cgroup; | ||
84 | struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup; | ||
85 | |||
86 | h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); | ||
87 | if (!h_cgroup) | ||
88 | return ERR_PTR(-ENOMEM); | ||
89 | |||
90 | parent_cgroup = cgroup->parent; | ||
91 | if (parent_cgroup) { | ||
92 | parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup); | ||
93 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) | ||
94 | res_counter_init(&h_cgroup->hugepage[idx], | ||
95 | &parent_h_cgroup->hugepage[idx]); | ||
96 | } else { | ||
97 | root_h_cgroup = h_cgroup; | ||
98 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) | ||
99 | res_counter_init(&h_cgroup->hugepage[idx], NULL); | ||
100 | } | ||
101 | return &h_cgroup->css; | ||
102 | } | ||
103 | |||
104 | static void hugetlb_cgroup_destroy(struct cgroup *cgroup) | ||
105 | { | ||
106 | struct hugetlb_cgroup *h_cgroup; | ||
107 | |||
108 | h_cgroup = hugetlb_cgroup_from_cgroup(cgroup); | ||
109 | kfree(h_cgroup); | ||
110 | } | ||
111 | |||
112 | |||
113 | /* | ||
114 | * Should be called with hugetlb_lock held. | ||
115 | * Since we are holding hugetlb_lock, pages cannot get moved from | ||
116 | * active list or uncharged from the cgroup, So no need to get | ||
117 | * page reference and test for page active here. This function | ||
118 | * cannot fail. | ||
119 | */ | ||
120 | static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup, | ||
121 | struct page *page) | ||
122 | { | ||
123 | int csize; | ||
124 | struct res_counter *counter; | ||
125 | struct res_counter *fail_res; | ||
126 | struct hugetlb_cgroup *page_hcg; | ||
127 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
128 | struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup); | ||
129 | |||
130 | page_hcg = hugetlb_cgroup_from_page(page); | ||
131 | /* | ||
132 | * We can have pages in active list without any cgroup | ||
133 | * ie, hugepage with less than 3 pages. We can safely | ||
134 | * ignore those pages. | ||
135 | */ | ||
136 | if (!page_hcg || page_hcg != h_cg) | ||
137 | goto out; | ||
138 | |||
139 | csize = PAGE_SIZE << compound_order(page); | ||
140 | if (!parent) { | ||
141 | parent = root_h_cgroup; | ||
142 | /* root has no limit */ | ||
143 | res_counter_charge_nofail(&parent->hugepage[idx], | ||
144 | csize, &fail_res); | ||
145 | } | ||
146 | counter = &h_cg->hugepage[idx]; | ||
147 | res_counter_uncharge_until(counter, counter->parent, csize); | ||
148 | |||
149 | set_hugetlb_cgroup(page, parent); | ||
150 | out: | ||
151 | return; | ||
152 | } | ||
153 | |||
154 | /* | ||
155 | * Force the hugetlb cgroup to empty the hugetlb resources by moving them to | ||
156 | * the parent cgroup. | ||
157 | */ | ||
158 | static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) | ||
159 | { | ||
160 | struct hstate *h; | ||
161 | struct page *page; | ||
162 | int ret = 0, idx = 0; | ||
163 | |||
164 | do { | ||
165 | if (cgroup_task_count(cgroup) || | ||
166 | !list_empty(&cgroup->children)) { | ||
167 | ret = -EBUSY; | ||
168 | goto out; | ||
169 | } | ||
170 | for_each_hstate(h) { | ||
171 | spin_lock(&hugetlb_lock); | ||
172 | list_for_each_entry(page, &h->hugepage_activelist, lru) | ||
173 | hugetlb_cgroup_move_parent(idx, cgroup, page); | ||
174 | |||
175 | spin_unlock(&hugetlb_lock); | ||
176 | idx++; | ||
177 | } | ||
178 | cond_resched(); | ||
179 | } while (hugetlb_cgroup_have_usage(cgroup)); | ||
180 | out: | ||
181 | return ret; | ||
182 | } | ||
183 | |||
184 | int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, | ||
185 | struct hugetlb_cgroup **ptr) | ||
186 | { | ||
187 | int ret = 0; | ||
188 | struct res_counter *fail_res; | ||
189 | struct hugetlb_cgroup *h_cg = NULL; | ||
190 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
191 | |||
192 | if (hugetlb_cgroup_disabled()) | ||
193 | goto done; | ||
194 | /* | ||
195 | * We don't charge any cgroup if the compound page have less | ||
196 | * than 3 pages. | ||
197 | */ | ||
198 | if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) | ||
199 | goto done; | ||
200 | again: | ||
201 | rcu_read_lock(); | ||
202 | h_cg = hugetlb_cgroup_from_task(current); | ||
203 | if (!css_tryget(&h_cg->css)) { | ||
204 | rcu_read_unlock(); | ||
205 | goto again; | ||
206 | } | ||
207 | rcu_read_unlock(); | ||
208 | |||
209 | ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res); | ||
210 | css_put(&h_cg->css); | ||
211 | done: | ||
212 | *ptr = h_cg; | ||
213 | return ret; | ||
214 | } | ||
215 | |||
216 | /* Should be called with hugetlb_lock held */ | ||
217 | void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, | ||
218 | struct hugetlb_cgroup *h_cg, | ||
219 | struct page *page) | ||
220 | { | ||
221 | if (hugetlb_cgroup_disabled() || !h_cg) | ||
222 | return; | ||
223 | |||
224 | set_hugetlb_cgroup(page, h_cg); | ||
225 | return; | ||
226 | } | ||
227 | |||
228 | /* | ||
229 | * Should be called with hugetlb_lock held | ||
230 | */ | ||
231 | void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, | ||
232 | struct page *page) | ||
233 | { | ||
234 | struct hugetlb_cgroup *h_cg; | ||
235 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
236 | |||
237 | if (hugetlb_cgroup_disabled()) | ||
238 | return; | ||
239 | VM_BUG_ON(!spin_is_locked(&hugetlb_lock)); | ||
240 | h_cg = hugetlb_cgroup_from_page(page); | ||
241 | if (unlikely(!h_cg)) | ||
242 | return; | ||
243 | set_hugetlb_cgroup(page, NULL); | ||
244 | res_counter_uncharge(&h_cg->hugepage[idx], csize); | ||
245 | return; | ||
246 | } | ||
247 | |||
248 | void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, | ||
249 | struct hugetlb_cgroup *h_cg) | ||
250 | { | ||
251 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
252 | |||
253 | if (hugetlb_cgroup_disabled() || !h_cg) | ||
254 | return; | ||
255 | |||
256 | if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) | ||
257 | return; | ||
258 | |||
259 | res_counter_uncharge(&h_cg->hugepage[idx], csize); | ||
260 | return; | ||
261 | } | ||
262 | |||
263 | static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft, | ||
264 | struct file *file, char __user *buf, | ||
265 | size_t nbytes, loff_t *ppos) | ||
266 | { | ||
267 | u64 val; | ||
268 | char str[64]; | ||
269 | int idx, name, len; | ||
270 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
271 | |||
272 | idx = MEMFILE_IDX(cft->private); | ||
273 | name = MEMFILE_ATTR(cft->private); | ||
274 | |||
275 | val = res_counter_read_u64(&h_cg->hugepage[idx], name); | ||
276 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); | ||
277 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); | ||
278 | } | ||
279 | |||
280 | static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft, | ||
281 | const char *buffer) | ||
282 | { | ||
283 | int idx, name, ret; | ||
284 | unsigned long long val; | ||
285 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
286 | |||
287 | idx = MEMFILE_IDX(cft->private); | ||
288 | name = MEMFILE_ATTR(cft->private); | ||
289 | |||
290 | switch (name) { | ||
291 | case RES_LIMIT: | ||
292 | if (hugetlb_cgroup_is_root(h_cg)) { | ||
293 | /* Can't set limit on root */ | ||
294 | ret = -EINVAL; | ||
295 | break; | ||
296 | } | ||
297 | /* This function does all necessary parse...reuse it */ | ||
298 | ret = res_counter_memparse_write_strategy(buffer, &val); | ||
299 | if (ret) | ||
300 | break; | ||
301 | ret = res_counter_set_limit(&h_cg->hugepage[idx], val); | ||
302 | break; | ||
303 | default: | ||
304 | ret = -EINVAL; | ||
305 | break; | ||
306 | } | ||
307 | return ret; | ||
308 | } | ||
309 | |||
310 | static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event) | ||
311 | { | ||
312 | int idx, name, ret = 0; | ||
313 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
314 | |||
315 | idx = MEMFILE_IDX(event); | ||
316 | name = MEMFILE_ATTR(event); | ||
317 | |||
318 | switch (name) { | ||
319 | case RES_MAX_USAGE: | ||
320 | res_counter_reset_max(&h_cg->hugepage[idx]); | ||
321 | break; | ||
322 | case RES_FAILCNT: | ||
323 | res_counter_reset_failcnt(&h_cg->hugepage[idx]); | ||
324 | break; | ||
325 | default: | ||
326 | ret = -EINVAL; | ||
327 | break; | ||
328 | } | ||
329 | return ret; | ||
330 | } | ||
331 | |||
332 | static char *mem_fmt(char *buf, int size, unsigned long hsize) | ||
333 | { | ||
334 | if (hsize >= (1UL << 30)) | ||
335 | snprintf(buf, size, "%luGB", hsize >> 30); | ||
336 | else if (hsize >= (1UL << 20)) | ||
337 | snprintf(buf, size, "%luMB", hsize >> 20); | ||
338 | else | ||
339 | snprintf(buf, size, "%luKB", hsize >> 10); | ||
340 | return buf; | ||
341 | } | ||
342 | |||
343 | int __init hugetlb_cgroup_file_init(int idx) | ||
344 | { | ||
345 | char buf[32]; | ||
346 | struct cftype *cft; | ||
347 | struct hstate *h = &hstates[idx]; | ||
348 | |||
349 | /* format the size */ | ||
350 | mem_fmt(buf, 32, huge_page_size(h)); | ||
351 | |||
352 | /* Add the limit file */ | ||
353 | cft = &h->cgroup_files[0]; | ||
354 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); | ||
355 | cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); | ||
356 | cft->read = hugetlb_cgroup_read; | ||
357 | cft->write_string = hugetlb_cgroup_write; | ||
358 | |||
359 | /* Add the usage file */ | ||
360 | cft = &h->cgroup_files[1]; | ||
361 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); | ||
362 | cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); | ||
363 | cft->read = hugetlb_cgroup_read; | ||
364 | |||
365 | /* Add the MAX usage file */ | ||
366 | cft = &h->cgroup_files[2]; | ||
367 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); | ||
368 | cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); | ||
369 | cft->trigger = hugetlb_cgroup_reset; | ||
370 | cft->read = hugetlb_cgroup_read; | ||
371 | |||
372 | /* Add the failcntfile */ | ||
373 | cft = &h->cgroup_files[3]; | ||
374 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); | ||
375 | cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); | ||
376 | cft->trigger = hugetlb_cgroup_reset; | ||
377 | cft->read = hugetlb_cgroup_read; | ||
378 | |||
379 | /* NULL terminate the last cft */ | ||
380 | cft = &h->cgroup_files[4]; | ||
381 | memset(cft, 0, sizeof(*cft)); | ||
382 | |||
383 | WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); | ||
384 | |||
385 | return 0; | ||
386 | } | ||
387 | |||
388 | /* | ||
389 | * hugetlb_lock will make sure a parallel cgroup rmdir won't happen | ||
390 | * when we migrate hugepages | ||
391 | */ | ||
392 | void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) | ||
393 | { | ||
394 | struct hugetlb_cgroup *h_cg; | ||
395 | struct hstate *h = page_hstate(oldhpage); | ||
396 | |||
397 | if (hugetlb_cgroup_disabled()) | ||
398 | return; | ||
399 | |||
400 | VM_BUG_ON(!PageHuge(oldhpage)); | ||
401 | spin_lock(&hugetlb_lock); | ||
402 | h_cg = hugetlb_cgroup_from_page(oldhpage); | ||
403 | set_hugetlb_cgroup(oldhpage, NULL); | ||
404 | |||
405 | /* move the h_cg details to new cgroup */ | ||
406 | set_hugetlb_cgroup(newhpage, h_cg); | ||
407 | list_move(&newhpage->lru, &h->hugepage_activelist); | ||
408 | spin_unlock(&hugetlb_lock); | ||
409 | return; | ||
410 | } | ||
411 | |||
412 | struct cgroup_subsys hugetlb_subsys = { | ||
413 | .name = "hugetlb", | ||
414 | .create = hugetlb_cgroup_create, | ||
415 | .pre_destroy = hugetlb_cgroup_pre_destroy, | ||
416 | .destroy = hugetlb_cgroup_destroy, | ||
417 | .subsys_id = hugetlb_subsys_id, | ||
418 | }; | ||
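
The cftype->private values in the new hugetlb_cgroup.c pack two things into one integer: the hstate index in the upper 16 bits and the res_counter attribute (limit, usage, ...) in the lower 16. A standalone round trip of that encoding, reusing the MEMFILE_* macros from the file above (the RES_* values here are illustrative stand-ins for the ones in res_counter.h):

```c
#include <stdio.h>

#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

/* Illustrative attribute values; the real ones come from res_counter.h. */
enum { RES_USAGE = 0, RES_LIMIT, RES_MAX_USAGE, RES_FAILCNT };

int main(void)
{
	unsigned int priv = MEMFILE_PRIVATE(2u, RES_LIMIT); /* hstate 2, limit file */

	printf("packed private value: %#x\n", priv);
	printf("hstate index: %u\n", MEMFILE_IDX(priv));
	printf("attribute:    %u (RES_LIMIT)\n", MEMFILE_ATTR(priv));
	return 0;
}
```
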
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index cc448bb983ba..3a61efc518d5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -123,7 +123,7 @@ static int pfn_inject_init(void) | |||
123 | if (!dentry) | 123 | if (!dentry) |
124 | goto fail; | 124 | goto fail; |
125 | 125 | ||
126 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 126 | #ifdef CONFIG_MEMCG_SWAP |
127 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, | 127 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, |
128 | hwpoison_dir, &hwpoison_filter_memcg); | 128 | hwpoison_dir, &hwpoison_filter_memcg); |
129 | if (!dentry) | 129 | if (!dentry) |
diff --git a/mm/internal.h b/mm/internal.h
index 2ba87fbfb75b..3314f79d775a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -118,8 +118,14 @@ struct compact_control { | |||
118 | unsigned long nr_freepages; /* Number of isolated free pages */ | 118 | unsigned long nr_freepages; /* Number of isolated free pages */ |
119 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 119 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
120 | unsigned long free_pfn; /* isolate_freepages search base */ | 120 | unsigned long free_pfn; /* isolate_freepages search base */ |
121 | unsigned long start_free_pfn; /* where we started the search */ | ||
121 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 122 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
122 | bool sync; /* Synchronous migration */ | 123 | bool sync; /* Synchronous migration */ |
124 | bool wrapped; /* Order > 0 compactions are | ||
125 | incremental, once free_pfn | ||
126 | and migrate_pfn meet, we restart | ||
127 | from the top of the zone; | ||
128 | remember we wrapped around. */ | ||
123 | 129 | ||
124 | int order; /* order a direct compactor needs */ | 130 | int order; /* order a direct compactor needs */ |
125 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 131 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
@@ -347,3 +353,5 @@ extern u32 hwpoison_filter_enable; | |||
347 | extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, | 353 | extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, |
348 | unsigned long, unsigned long, | 354 | unsigned long, unsigned long, |
349 | unsigned long, unsigned long); | 355 | unsigned long, unsigned long); |
356 | |||
357 | extern void set_pageblock_order(void); | ||
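
The new start_free_pfn and wrapped fields in compact_control let order > 0 compaction resume the free-page scan where a previous pass left off, restart from the top of the zone once the two scanners meet, and stop when the scan comes back around to its starting point. A simplified userspace model of that resume-and-wrap bookkeeping — the zone is just a page-frame range here, and all names are invented for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    #define ZONE_PAGES 16

    struct zone_model {
        unsigned long cached_free_pfn;   /* where the next pass should resume */
    };

    struct compact_model {
        unsigned long free_pfn;          /* free scanner, moves downward */
        unsigned long start_free_pfn;    /* where this pass started */
        unsigned long migrate_pfn;       /* migrate scanner, moves upward */
        bool wrapped;                    /* free scanner already wrapped once */
    };

    /* advance the free scanner one step, wrapping to the top of the zone */
    static bool scan_step(struct zone_model *z, struct compact_model *cc)
    {
        if (cc->free_pfn == 0 || cc->free_pfn <= cc->migrate_pfn) {
            /* scanners met: restart from the top and remember we wrapped */
            cc->free_pfn = ZONE_PAGES - 1;
            cc->migrate_pfn = 0;
            cc->wrapped = true;
        } else {
            cc->free_pfn--;
        }

        /* after wrapping, finishing means returning to where we started */
        if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
            return false;

        z->cached_free_pfn = cc->free_pfn;   /* remember for the next pass */
        return true;
    }

    int main(void)
    {
        struct zone_model zone = { .cached_free_pfn = 9 };
        struct compact_model cc = {
            .free_pfn = zone.cached_free_pfn,
            .start_free_pfn = zone.cached_free_pfn,
            .migrate_pfn = 6,
            .wrapped = false,
        };

        while (scan_step(&zone, &cc))
            printf("free_pfn=%lu wrapped=%d\n", cc.free_pfn, cc.wrapped);
        return 0;
    }
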
diff --git a/mm/memblock.c b/mm/memblock.c index 5cc6731b00cc..4d9393c7edc9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -222,13 +222,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, | |||
222 | /* Try to find some space for it. | 222 | /* Try to find some space for it. |
223 | * | 223 | * |
224 | * WARNING: We assume that either slab_is_available() and we use it or | 224 | * WARNING: We assume that either slab_is_available() and we use it or |
225 | * we use MEMBLOCK for allocations. That means that this is unsafe to use | 225 | * we use MEMBLOCK for allocations. That means that this is unsafe to |
226 | * when bootmem is currently active (unless bootmem itself is implemented | 226 | * use when bootmem is currently active (unless bootmem itself is |
227 | * on top of MEMBLOCK which isn't the case yet) | 227 | * implemented on top of MEMBLOCK which isn't the case yet) |
228 | * | 228 | * |
229 | * This should however not be an issue for now, as we currently only | 229 | * This should however not be an issue for now, as we currently only |
230 | * call into MEMBLOCK while it's still active, or much later when slab is | 230 | * call into MEMBLOCK while it's still active, or much later when slab |
231 | * active for memory hotplug operations | 231 | * is active for memory hotplug operations |
232 | */ | 232 | */ |
233 | if (use_slab) { | 233 | if (use_slab) { |
234 | new_array = kmalloc(new_size, GFP_KERNEL); | 234 | new_array = kmalloc(new_size, GFP_KERNEL); |
@@ -243,8 +243,8 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, | |||
243 | new_alloc_size, PAGE_SIZE); | 243 | new_alloc_size, PAGE_SIZE); |
244 | if (!addr && new_area_size) | 244 | if (!addr && new_area_size) |
245 | addr = memblock_find_in_range(0, | 245 | addr = memblock_find_in_range(0, |
246 | min(new_area_start, memblock.current_limit), | 246 | min(new_area_start, memblock.current_limit), |
247 | new_alloc_size, PAGE_SIZE); | 247 | new_alloc_size, PAGE_SIZE); |
248 | 248 | ||
249 | new_array = addr ? __va(addr) : 0; | 249 | new_array = addr ? __va(addr) : 0; |
250 | } | 250 | } |
@@ -254,12 +254,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, | |||
254 | return -1; | 254 | return -1; |
255 | } | 255 | } |
256 | 256 | ||
257 | memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", | 257 | memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", |
258 | memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); | 258 | memblock_type_name(type), type->max * 2, (u64)addr, |
259 | (u64)addr + new_size - 1); | ||
259 | 260 | ||
260 | /* Found space, we now need to move the array over before | 261 | /* |
261 | * we add the reserved region since it may be our reserved | 262 | * Found space, we now need to move the array over before we add the |
262 | * array itself that is full. | 263 | * reserved region since it may be our reserved array itself that is |
264 | * full. | ||
263 | */ | 265 | */ |
264 | memcpy(new_array, type->regions, old_size); | 266 | memcpy(new_array, type->regions, old_size); |
265 | memset(new_array + type->max, 0, old_size); | 267 | memset(new_array + type->max, 0, old_size); |
@@ -267,17 +269,16 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, | |||
267 | type->regions = new_array; | 269 | type->regions = new_array; |
268 | type->max <<= 1; | 270 | type->max <<= 1; |
269 | 271 | ||
270 | /* Free old array. We needn't free it if the array is the | 272 | /* Free old array. We needn't free it if the array is the static one */ |
271 | * static one | ||
272 | */ | ||
273 | if (*in_slab) | 273 | if (*in_slab) |
274 | kfree(old_array); | 274 | kfree(old_array); |
275 | else if (old_array != memblock_memory_init_regions && | 275 | else if (old_array != memblock_memory_init_regions && |
276 | old_array != memblock_reserved_init_regions) | 276 | old_array != memblock_reserved_init_regions) |
277 | memblock_free(__pa(old_array), old_alloc_size); | 277 | memblock_free(__pa(old_array), old_alloc_size); |
278 | 278 | ||
279 | /* Reserve the new array if that comes from the memblock. | 279 | /* |
280 | * Otherwise, we needn't do it | 280 | * Reserve the new array if that comes from the memblock. Otherwise, we |
281 | * needn't do it | ||
281 | */ | 282 | */ |
282 | if (!use_slab) | 283 | if (!use_slab) |
283 | BUG_ON(memblock_reserve(addr, new_alloc_size)); | 284 | BUG_ON(memblock_reserve(addr, new_alloc_size)); |
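
memblock_double_array() grows a region table that may initially live in a static array: it allocates room for twice as many entries, copies the old contents across, zeroes the new half, and frees the old buffer only if that buffer was itself dynamically allocated. A minimal userspace sketch of that grow-from-static pattern, with plain malloc standing in for the slab and memblock allocators:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct region {
        unsigned long base;
        unsigned long size;
    };

    #define INIT_REGIONS 4

    static struct region static_regions[INIT_REGIONS];   /* boot-time static table */

    struct region_table {
        struct region *regions;
        size_t cnt;      /* entries in use */
        size_t max;      /* capacity */
    };

    /* double the table's capacity; returns 0 on success, -1 on allocation failure */
    static int double_array(struct region_table *type)
    {
        size_t old_size = type->max * sizeof(struct region);
        struct region *new_array = malloc(old_size * 2);
        struct region *old_array = type->regions;

        if (!new_array)
            return -1;

        memcpy(new_array, old_array, old_size);       /* move existing entries */
        memset(new_array + type->max, 0, old_size);   /* clear the new half */

        type->regions = new_array;
        type->max <<= 1;

        /* free the old array, unless it is the static boot-time one */
        if (old_array != static_regions)
            free(old_array);
        return 0;
    }

    int main(void)
    {
        struct region_table table = {
            .regions = static_regions, .cnt = INIT_REGIONS, .max = INIT_REGIONS,
        };

        if (double_array(&table) == 0)
            printf("capacity grew from %d to %zu\n", INIT_REGIONS, table.max);
        free(table.regions);
        return 0;
    }
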
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f72b5e52451a..795e525afaba 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -61,12 +61,12 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly; | |||
61 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 61 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
62 | static struct mem_cgroup *root_mem_cgroup __read_mostly; | 62 | static struct mem_cgroup *root_mem_cgroup __read_mostly; |
63 | 63 | ||
64 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 64 | #ifdef CONFIG_MEMCG_SWAP |
65 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | 65 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
66 | int do_swap_account __read_mostly; | 66 | int do_swap_account __read_mostly; |
67 | 67 | ||
68 | /* for remember boot option*/ | 68 | /* for remember boot option*/ |
69 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED | 69 | #ifdef CONFIG_MEMCG_SWAP_ENABLED |
70 | static int really_do_swap_account __initdata = 1; | 70 | static int really_do_swap_account __initdata = 1; |
71 | #else | 71 | #else |
72 | static int really_do_swap_account __initdata = 0; | 72 | static int really_do_swap_account __initdata = 0; |
@@ -87,7 +87,7 @@ enum mem_cgroup_stat_index { | |||
87 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 87 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
88 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 88 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
90 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 90 | MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ |
91 | MEM_CGROUP_STAT_NSTATS, | 91 | MEM_CGROUP_STAT_NSTATS, |
92 | }; | 92 | }; |
93 | 93 | ||
@@ -378,9 +378,7 @@ static bool move_file(void) | |||
378 | 378 | ||
379 | enum charge_type { | 379 | enum charge_type { |
380 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 380 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
381 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 381 | MEM_CGROUP_CHARGE_TYPE_ANON, |
382 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ | ||
383 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | ||
384 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ | 382 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ |
385 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ | 383 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ |
386 | NR_CHARGE_TYPE, | 384 | NR_CHARGE_TYPE, |
@@ -407,8 +405,14 @@ enum charge_type { | |||
407 | static void mem_cgroup_get(struct mem_cgroup *memcg); | 405 | static void mem_cgroup_get(struct mem_cgroup *memcg); |
408 | static void mem_cgroup_put(struct mem_cgroup *memcg); | 406 | static void mem_cgroup_put(struct mem_cgroup *memcg); |
409 | 407 | ||
408 | static inline | ||
409 | struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) | ||
410 | { | ||
411 | return container_of(s, struct mem_cgroup, css); | ||
412 | } | ||
413 | |||
410 | /* Writing them here to avoid exposing memcg's inner layout */ | 414 | /* Writing them here to avoid exposing memcg's inner layout */ |
411 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 415 | #ifdef CONFIG_MEMCG_KMEM |
412 | #include <net/sock.h> | 416 | #include <net/sock.h> |
413 | #include <net/ip.h> | 417 | #include <net/ip.h> |
414 | 418 | ||
@@ -467,9 +471,9 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | |||
467 | } | 471 | } |
468 | EXPORT_SYMBOL(tcp_proto_cgroup); | 472 | EXPORT_SYMBOL(tcp_proto_cgroup); |
469 | #endif /* CONFIG_INET */ | 473 | #endif /* CONFIG_INET */ |
470 | #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ | 474 | #endif /* CONFIG_MEMCG_KMEM */ |
471 | 475 | ||
472 | #if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) | 476 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) |
473 | static void disarm_sock_keys(struct mem_cgroup *memcg) | 477 | static void disarm_sock_keys(struct mem_cgroup *memcg) |
474 | { | 478 | { |
475 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) | 479 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) |
@@ -703,7 +707,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, | |||
703 | bool charge) | 707 | bool charge) |
704 | { | 708 | { |
705 | int val = (charge) ? 1 : -1; | 709 | int val = (charge) ? 1 : -1; |
706 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); | 710 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); |
707 | } | 711 | } |
708 | 712 | ||
709 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | 713 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, |
@@ -864,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) | |||
864 | 868 | ||
865 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 869 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
866 | { | 870 | { |
867 | return container_of(cgroup_subsys_state(cont, | 871 | return mem_cgroup_from_css( |
868 | mem_cgroup_subsys_id), struct mem_cgroup, | 872 | cgroup_subsys_state(cont, mem_cgroup_subsys_id)); |
869 | css); | ||
870 | } | 873 | } |
871 | 874 | ||
872 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | 875 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) |
@@ -879,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
879 | if (unlikely(!p)) | 882 | if (unlikely(!p)) |
880 | return NULL; | 883 | return NULL; |
881 | 884 | ||
882 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), | 885 | return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id)); |
883 | struct mem_cgroup, css); | ||
884 | } | 886 | } |
885 | 887 | ||
886 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | 888 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) |
@@ -966,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
966 | css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); | 968 | css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); |
967 | if (css) { | 969 | if (css) { |
968 | if (css == &root->css || css_tryget(css)) | 970 | if (css == &root->css || css_tryget(css)) |
969 | memcg = container_of(css, | 971 | memcg = mem_cgroup_from_css(css); |
970 | struct mem_cgroup, css); | ||
971 | } else | 972 | } else |
972 | id = 0; | 973 | id = 0; |
973 | rcu_read_unlock(); | 974 | rcu_read_unlock(); |
@@ -1454,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) | |||
1454 | /* | 1455 | /* |
1455 | * Return the memory (and swap, if configured) limit for a memcg. | 1456 | * Return the memory (and swap, if configured) limit for a memcg. |
1456 | */ | 1457 | */ |
1457 | u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | 1458 | static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) |
1458 | { | 1459 | { |
1459 | u64 limit; | 1460 | u64 limit; |
1460 | u64 memsw; | 1461 | u64 memsw; |
@@ -1470,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
1470 | return min(limit, memsw); | 1471 | return min(limit, memsw); |
1471 | } | 1472 | } |
1472 | 1473 | ||
1474 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | ||
1475 | int order) | ||
1476 | { | ||
1477 | struct mem_cgroup *iter; | ||
1478 | unsigned long chosen_points = 0; | ||
1479 | unsigned long totalpages; | ||
1480 | unsigned int points = 0; | ||
1481 | struct task_struct *chosen = NULL; | ||
1482 | |||
1483 | /* | ||
1484 | * If current has a pending SIGKILL, then automatically select it. The | ||
1485 | * goal is to allow it to allocate so that it may quickly exit and free | ||
1486 | * its memory. | ||
1487 | */ | ||
1488 | if (fatal_signal_pending(current)) { | ||
1489 | set_thread_flag(TIF_MEMDIE); | ||
1490 | return; | ||
1491 | } | ||
1492 | |||
1493 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | ||
1494 | totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; | ||
1495 | for_each_mem_cgroup_tree(iter, memcg) { | ||
1496 | struct cgroup *cgroup = iter->css.cgroup; | ||
1497 | struct cgroup_iter it; | ||
1498 | struct task_struct *task; | ||
1499 | |||
1500 | cgroup_iter_start(cgroup, &it); | ||
1501 | while ((task = cgroup_iter_next(cgroup, &it))) { | ||
1502 | switch (oom_scan_process_thread(task, totalpages, NULL, | ||
1503 | false)) { | ||
1504 | case OOM_SCAN_SELECT: | ||
1505 | if (chosen) | ||
1506 | put_task_struct(chosen); | ||
1507 | chosen = task; | ||
1508 | chosen_points = ULONG_MAX; | ||
1509 | get_task_struct(chosen); | ||
1510 | /* fall through */ | ||
1511 | case OOM_SCAN_CONTINUE: | ||
1512 | continue; | ||
1513 | case OOM_SCAN_ABORT: | ||
1514 | cgroup_iter_end(cgroup, &it); | ||
1515 | mem_cgroup_iter_break(memcg, iter); | ||
1516 | if (chosen) | ||
1517 | put_task_struct(chosen); | ||
1518 | return; | ||
1519 | case OOM_SCAN_OK: | ||
1520 | break; | ||
1521 | }; | ||
1522 | points = oom_badness(task, memcg, NULL, totalpages); | ||
1523 | if (points > chosen_points) { | ||
1524 | if (chosen) | ||
1525 | put_task_struct(chosen); | ||
1526 | chosen = task; | ||
1527 | chosen_points = points; | ||
1528 | get_task_struct(chosen); | ||
1529 | } | ||
1530 | } | ||
1531 | cgroup_iter_end(cgroup, &it); | ||
1532 | } | ||
1533 | |||
1534 | if (!chosen) | ||
1535 | return; | ||
1536 | points = chosen_points * 1000 / totalpages; | ||
1537 | oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, | ||
1538 | NULL, "Memory cgroup out of memory"); | ||
1539 | } | ||
1540 | |||
1473 | static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | 1541 | static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, |
1474 | gfp_t gfp_mask, | 1542 | gfp_t gfp_mask, |
1475 | unsigned long flags) | 1543 | unsigned long flags) |
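
mem_cgroup_out_of_memory() above walks every task in the memcg hierarchy, lets the scan helper force-select, skip or abort, scores the remaining candidates against the group's page limit, and kills the highest scorer. The selection loop reduces to the following standalone model; the task list, verdicts and the badness formula are all mocked up here and do not reproduce the kernel's scoring:

    #include <stdio.h>
    #include <stddef.h>

    enum scan_verdict { SCAN_OK, SCAN_CONTINUE, SCAN_SELECT, SCAN_ABORT };

    struct task_model {
        const char *name;
        unsigned long pages_used;        /* stands in for the badness inputs */
        enum scan_verdict verdict;       /* what the scan helper would say */
    };

    /* crude badness score: fraction of the limit this task uses, in 1/1000ths */
    static unsigned long badness(const struct task_model *t, unsigned long totalpages)
    {
        return t->pages_used * 1000 / totalpages;
    }

    static const struct task_model *select_victim(const struct task_model *tasks,
                                                  size_t n, unsigned long totalpages)
    {
        const struct task_model *chosen = NULL;
        unsigned long chosen_points = 0;

        for (size_t i = 0; i < n; i++) {
            switch (tasks[i].verdict) {
            case SCAN_SELECT:            /* e.g. already dying: prefer it outright */
                chosen = &tasks[i];
                chosen_points = (unsigned long)-1;
                continue;
            case SCAN_CONTINUE:          /* unkillable, skip */
                continue;
            case SCAN_ABORT:             /* someone else is already exiting */
                return NULL;
            case SCAN_OK:
                break;
            }
            unsigned long points = badness(&tasks[i], totalpages);
            if (points > chosen_points) {
                chosen = &tasks[i];
                chosen_points = points;
            }
        }
        return chosen;
    }

    int main(void)
    {
        struct task_model tasks[] = {
            { "small",  100, SCAN_OK },
            { "kernel",  50, SCAN_CONTINUE },
            { "hog",    800, SCAN_OK },
        };
        const struct task_model *victim = select_victim(tasks, 3, 1000);

        if (victim)
            printf("would kill %s\n", victim->name);
        return 0;
    }

A forced selection is recorded with the maximum possible score rather than returned immediately, so that a later abort verdict can still cancel the whole kill, which mirrors the fall-through in the code above.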
@@ -1899,7 +1967,7 @@ again: | |||
1899 | return; | 1967 | return; |
1900 | /* | 1968 | /* |
1901 | * If this memory cgroup is not under account moving, we don't | 1969 | * If this memory cgroup is not under account moving, we don't |
1902 | * need to take move_lock_page_cgroup(). Because we already hold | 1970 | * need to take move_lock_mem_cgroup(). Because we already hold |
1903 | * rcu_read_lock(), any calls to move_account will be delayed until | 1971 | * rcu_read_lock(), any calls to move_account will be delayed until |
1904 | * rcu_read_unlock() if mem_cgroup_stolen() == true. | 1972 | * rcu_read_unlock() if mem_cgroup_stolen() == true. |
1905 | */ | 1973 | */ |
@@ -1921,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) | |||
1921 | /* | 1989 | /* |
1922 | * It's guaranteed that pc->mem_cgroup never changes while | 1990 | * It's guaranteed that pc->mem_cgroup never changes while |
1923 | * lock is held because a routine modifies pc->mem_cgroup | 1991 | * lock is held because a routine modifies pc->mem_cgroup |
1924 | * should take move_lock_page_cgroup(). | 1992 | * should take move_lock_mem_cgroup(). |
1925 | */ | 1993 | */ |
1926 | move_unlock_mem_cgroup(pc->mem_cgroup, flags); | 1994 | move_unlock_mem_cgroup(pc->mem_cgroup, flags); |
1927 | } | 1995 | } |
@@ -2268,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2268 | * We always charge the cgroup the mm_struct belongs to. | 2336 | * We always charge the cgroup the mm_struct belongs to. |
2269 | * The mm_struct's mem_cgroup changes on task migration if the | 2337 | * The mm_struct's mem_cgroup changes on task migration if the |
2270 | * thread group leader migrates. It's possible that mm is not | 2338 | * thread group leader migrates. It's possible that mm is not |
2271 | * set, if so charge the init_mm (happens for pagecache usage). | 2339 | * set, if so charge the root memcg (happens for pagecache usage). |
2272 | */ | 2340 | */ |
2273 | if (!*ptr && !mm) | 2341 | if (!*ptr && !mm) |
2274 | *ptr = root_mem_cgroup; | 2342 | *ptr = root_mem_cgroup; |
@@ -2429,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
2429 | css = css_lookup(&mem_cgroup_subsys, id); | 2497 | css = css_lookup(&mem_cgroup_subsys, id); |
2430 | if (!css) | 2498 | if (!css) |
2431 | return NULL; | 2499 | return NULL; |
2432 | return container_of(css, struct mem_cgroup, css); | 2500 | return mem_cgroup_from_css(css); |
2433 | } | 2501 | } |
2434 | 2502 | ||
2435 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | 2503 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
@@ -2473,11 +2541,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2473 | bool anon; | 2541 | bool anon; |
2474 | 2542 | ||
2475 | lock_page_cgroup(pc); | 2543 | lock_page_cgroup(pc); |
2476 | if (unlikely(PageCgroupUsed(pc))) { | 2544 | VM_BUG_ON(PageCgroupUsed(pc)); |
2477 | unlock_page_cgroup(pc); | ||
2478 | __mem_cgroup_cancel_charge(memcg, nr_pages); | ||
2479 | return; | ||
2480 | } | ||
2481 | /* | 2545 | /* |
2482 | * we don't need page_cgroup_lock about tail pages, becase they are not | 2546 | * we don't need page_cgroup_lock about tail pages, becase they are not |
2483 | * accessed by any other context at this point. | 2547 | * accessed by any other context at this point. |
@@ -2519,7 +2583,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2519 | spin_unlock_irq(&zone->lru_lock); | 2583 | spin_unlock_irq(&zone->lru_lock); |
2520 | } | 2584 | } |
2521 | 2585 | ||
2522 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 2586 | if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) |
2523 | anon = true; | 2587 | anon = true; |
2524 | else | 2588 | else |
2525 | anon = false; | 2589 | anon = false; |
@@ -2644,8 +2708,7 @@ out: | |||
2644 | 2708 | ||
2645 | static int mem_cgroup_move_parent(struct page *page, | 2709 | static int mem_cgroup_move_parent(struct page *page, |
2646 | struct page_cgroup *pc, | 2710 | struct page_cgroup *pc, |
2647 | struct mem_cgroup *child, | 2711 | struct mem_cgroup *child) |
2648 | gfp_t gfp_mask) | ||
2649 | { | 2712 | { |
2650 | struct mem_cgroup *parent; | 2713 | struct mem_cgroup *parent; |
2651 | unsigned int nr_pages; | 2714 | unsigned int nr_pages; |
@@ -2728,38 +2791,7 @@ int mem_cgroup_newpage_charge(struct page *page, | |||
2728 | VM_BUG_ON(page->mapping && !PageAnon(page)); | 2791 | VM_BUG_ON(page->mapping && !PageAnon(page)); |
2729 | VM_BUG_ON(!mm); | 2792 | VM_BUG_ON(!mm); |
2730 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 2793 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
2731 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2794 | MEM_CGROUP_CHARGE_TYPE_ANON); |
2732 | } | ||
2733 | |||
2734 | static void | ||
2735 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | ||
2736 | enum charge_type ctype); | ||
2737 | |||
2738 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | ||
2739 | gfp_t gfp_mask) | ||
2740 | { | ||
2741 | struct mem_cgroup *memcg = NULL; | ||
2742 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2743 | int ret; | ||
2744 | |||
2745 | if (mem_cgroup_disabled()) | ||
2746 | return 0; | ||
2747 | if (PageCompound(page)) | ||
2748 | return 0; | ||
2749 | |||
2750 | if (unlikely(!mm)) | ||
2751 | mm = &init_mm; | ||
2752 | if (!page_is_file_cache(page)) | ||
2753 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
2754 | |||
2755 | if (!PageSwapCache(page)) | ||
2756 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); | ||
2757 | else { /* page is swapcache/shmem */ | ||
2758 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); | ||
2759 | if (!ret) | ||
2760 | __mem_cgroup_commit_charge_swapin(page, memcg, type); | ||
2761 | } | ||
2762 | return ret; | ||
2763 | } | 2795 | } |
2764 | 2796 | ||
2765 | /* | 2797 | /* |
@@ -2768,27 +2800,26 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2768 | * struct page_cgroup is acquired. This refcnt will be consumed by | 2800 | * struct page_cgroup is acquired. This refcnt will be consumed by |
2769 | * "commit()" or removed by "cancel()" | 2801 | * "commit()" or removed by "cancel()" |
2770 | */ | 2802 | */ |
2771 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 2803 | static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
2772 | struct page *page, | 2804 | struct page *page, |
2773 | gfp_t mask, struct mem_cgroup **memcgp) | 2805 | gfp_t mask, |
2806 | struct mem_cgroup **memcgp) | ||
2774 | { | 2807 | { |
2775 | struct mem_cgroup *memcg; | 2808 | struct mem_cgroup *memcg; |
2809 | struct page_cgroup *pc; | ||
2776 | int ret; | 2810 | int ret; |
2777 | 2811 | ||
2778 | *memcgp = NULL; | 2812 | pc = lookup_page_cgroup(page); |
2779 | |||
2780 | if (mem_cgroup_disabled()) | ||
2781 | return 0; | ||
2782 | |||
2783 | if (!do_swap_account) | ||
2784 | goto charge_cur_mm; | ||
2785 | /* | 2813 | /* |
2786 | * A racing thread's fault, or swapoff, may have already updated | 2814 | * Every swap fault against a single page tries to charge the |
2787 | * the pte, and even removed page from swap cache: in those cases | 2815 | * page, bail as early as possible. shmem_unuse() encounters |
2788 | * do_swap_page()'s pte_same() test will fail; but there's also a | 2816 | * already charged pages, too. The USED bit is protected by |
2789 | * KSM case which does need to charge the page. | 2817 | * the page lock, which serializes swap cache removal, which |
2818 | * in turn serializes uncharging. | ||
2790 | */ | 2819 | */ |
2791 | if (!PageSwapCache(page)) | 2820 | if (PageCgroupUsed(pc)) |
2821 | return 0; | ||
2822 | if (!do_swap_account) | ||
2792 | goto charge_cur_mm; | 2823 | goto charge_cur_mm; |
2793 | memcg = try_get_mem_cgroup_from_page(page); | 2824 | memcg = try_get_mem_cgroup_from_page(page); |
2794 | if (!memcg) | 2825 | if (!memcg) |
@@ -2800,14 +2831,44 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2800 | ret = 0; | 2831 | ret = 0; |
2801 | return ret; | 2832 | return ret; |
2802 | charge_cur_mm: | 2833 | charge_cur_mm: |
2803 | if (unlikely(!mm)) | ||
2804 | mm = &init_mm; | ||
2805 | ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); | 2834 | ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); |
2806 | if (ret == -EINTR) | 2835 | if (ret == -EINTR) |
2807 | ret = 0; | 2836 | ret = 0; |
2808 | return ret; | 2837 | return ret; |
2809 | } | 2838 | } |
2810 | 2839 | ||
2840 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, | ||
2841 | gfp_t gfp_mask, struct mem_cgroup **memcgp) | ||
2842 | { | ||
2843 | *memcgp = NULL; | ||
2844 | if (mem_cgroup_disabled()) | ||
2845 | return 0; | ||
2846 | /* | ||
2847 | * A racing thread's fault, or swapoff, may have already | ||
2848 | * updated the pte, and even removed page from swap cache: in | ||
2849 | * those cases unuse_pte()'s pte_same() test will fail; but | ||
2850 | * there's also a KSM case which does need to charge the page. | ||
2851 | */ | ||
2852 | if (!PageSwapCache(page)) { | ||
2853 | int ret; | ||
2854 | |||
2855 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); | ||
2856 | if (ret == -EINTR) | ||
2857 | ret = 0; | ||
2858 | return ret; | ||
2859 | } | ||
2860 | return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); | ||
2861 | } | ||
2862 | |||
2863 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) | ||
2864 | { | ||
2865 | if (mem_cgroup_disabled()) | ||
2866 | return; | ||
2867 | if (!memcg) | ||
2868 | return; | ||
2869 | __mem_cgroup_cancel_charge(memcg, 1); | ||
2870 | } | ||
2871 | |||
2811 | static void | 2872 | static void |
2812 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, | 2873 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, |
2813 | enum charge_type ctype) | 2874 | enum charge_type ctype) |
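
The rework above splits the swap-in charge into a private helper that assumes a swap-cache page and a public wrapper that first handles the disabled and non-swap-cache cases; the helper now bails out early when the page_cgroup USED bit is already set, since every racing swap fault of the same page reaches this point. A toy model of that public/private split and early-bail check — the flags and helper names are invented for the sketch, not memcg API:

    #include <stdbool.h>
    #include <stdio.h>

    struct page_model {
        bool swapcache;     /* page sits in the swap cache */
        bool used;          /* "already charged" bit, set under the page lock */
    };

    static bool accounting_disabled;

    static int charge_current_mm(struct page_model *page)
    {
        page->used = true;          /* pretend the charge succeeded */
        return 0;
    }

    /* private part: caller guarantees the page is (or was) swap cache */
    static int try_charge_swapin(struct page_model *page)
    {
        /*
         * Every racing swap fault against this page ends up here; only the
         * first one should charge, the rest bail out on the USED bit.
         */
        if (page->used)
            return 0;
        return charge_current_mm(page);
    }

    /* public part: handles the trivial cases before calling the private helper */
    int charge_swapin(struct page_model *page)
    {
        if (accounting_disabled)
            return 0;
        if (!page->swapcache)       /* pte changed under us, or a KSM copy */
            return charge_current_mm(page);
        return try_charge_swapin(page);
    }

    int main(void)
    {
        struct page_model page = { .swapcache = true, .used = false };

        printf("first fault:  %d (used=%d)\n", charge_swapin(&page), page.used);
        printf("racing fault: %d (used=%d)\n", charge_swapin(&page), page.used);
        return 0;
    }
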
@@ -2842,16 +2903,30 @@ void mem_cgroup_commit_charge_swapin(struct page *page, | |||
2842 | struct mem_cgroup *memcg) | 2903 | struct mem_cgroup *memcg) |
2843 | { | 2904 | { |
2844 | __mem_cgroup_commit_charge_swapin(page, memcg, | 2905 | __mem_cgroup_commit_charge_swapin(page, memcg, |
2845 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2906 | MEM_CGROUP_CHARGE_TYPE_ANON); |
2846 | } | 2907 | } |
2847 | 2908 | ||
2848 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) | 2909 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
2910 | gfp_t gfp_mask) | ||
2849 | { | 2911 | { |
2912 | struct mem_cgroup *memcg = NULL; | ||
2913 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2914 | int ret; | ||
2915 | |||
2850 | if (mem_cgroup_disabled()) | 2916 | if (mem_cgroup_disabled()) |
2851 | return; | 2917 | return 0; |
2852 | if (!memcg) | 2918 | if (PageCompound(page)) |
2853 | return; | 2919 | return 0; |
2854 | __mem_cgroup_cancel_charge(memcg, 1); | 2920 | |
2921 | if (!PageSwapCache(page)) | ||
2922 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); | ||
2923 | else { /* page is swapcache/shmem */ | ||
2924 | ret = __mem_cgroup_try_charge_swapin(mm, page, | ||
2925 | gfp_mask, &memcg); | ||
2926 | if (!ret) | ||
2927 | __mem_cgroup_commit_charge_swapin(page, memcg, type); | ||
2928 | } | ||
2929 | return ret; | ||
2855 | } | 2930 | } |
2856 | 2931 | ||
2857 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, | 2932 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, |
@@ -2911,7 +2986,8 @@ direct_uncharge: | |||
2911 | * uncharge if !page_mapped(page) | 2986 | * uncharge if !page_mapped(page) |
2912 | */ | 2987 | */ |
2913 | static struct mem_cgroup * | 2988 | static struct mem_cgroup * |
2914 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 2989 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, |
2990 | bool end_migration) | ||
2915 | { | 2991 | { |
2916 | struct mem_cgroup *memcg = NULL; | 2992 | struct mem_cgroup *memcg = NULL; |
2917 | unsigned int nr_pages = 1; | 2993 | unsigned int nr_pages = 1; |
@@ -2921,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2921 | if (mem_cgroup_disabled()) | 2997 | if (mem_cgroup_disabled()) |
2922 | return NULL; | 2998 | return NULL; |
2923 | 2999 | ||
2924 | if (PageSwapCache(page)) | 3000 | VM_BUG_ON(PageSwapCache(page)); |
2925 | return NULL; | ||
2926 | 3001 | ||
2927 | if (PageTransHuge(page)) { | 3002 | if (PageTransHuge(page)) { |
2928 | nr_pages <<= compound_order(page); | 3003 | nr_pages <<= compound_order(page); |
@@ -2945,7 +3020,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2945 | anon = PageAnon(page); | 3020 | anon = PageAnon(page); |
2946 | 3021 | ||
2947 | switch (ctype) { | 3022 | switch (ctype) { |
2948 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 3023 | case MEM_CGROUP_CHARGE_TYPE_ANON: |
2949 | /* | 3024 | /* |
2950 | * Generally PageAnon tells if it's the anon statistics to be | 3025 | * Generally PageAnon tells if it's the anon statistics to be |
2951 | * updated; but sometimes e.g. mem_cgroup_uncharge_page() is | 3026 | * updated; but sometimes e.g. mem_cgroup_uncharge_page() is |
@@ -2955,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2955 | /* fallthrough */ | 3030 | /* fallthrough */ |
2956 | case MEM_CGROUP_CHARGE_TYPE_DROP: | 3031 | case MEM_CGROUP_CHARGE_TYPE_DROP: |
2957 | /* See mem_cgroup_prepare_migration() */ | 3032 | /* See mem_cgroup_prepare_migration() */ |
2958 | if (page_mapped(page) || PageCgroupMigration(pc)) | 3033 | if (page_mapped(page)) |
3034 | goto unlock_out; | ||
3035 | /* | ||
3036 | * Pages under migration may not be uncharged. But | ||
3037 | * end_migration() /must/ be the one uncharging the | ||
3038 | * unused post-migration page and so it has to call | ||
3039 | * here with the migration bit still set. See the | ||
3040 | * res_counter handling below. | ||
3041 | */ | ||
3042 | if (!end_migration && PageCgroupMigration(pc)) | ||
2959 | goto unlock_out; | 3043 | goto unlock_out; |
2960 | break; | 3044 | break; |
2961 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: | 3045 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: |
@@ -2989,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2989 | mem_cgroup_swap_statistics(memcg, true); | 3073 | mem_cgroup_swap_statistics(memcg, true); |
2990 | mem_cgroup_get(memcg); | 3074 | mem_cgroup_get(memcg); |
2991 | } | 3075 | } |
2992 | if (!mem_cgroup_is_root(memcg)) | 3076 | /* |
3077 | * Migration does not charge the res_counter for the | ||
3078 | * replacement page, so leave it alone when phasing out the | ||
3079 | * page that is unused after the migration. | ||
3080 | */ | ||
3081 | if (!end_migration && !mem_cgroup_is_root(memcg)) | ||
2993 | mem_cgroup_do_uncharge(memcg, nr_pages, ctype); | 3082 | mem_cgroup_do_uncharge(memcg, nr_pages, ctype); |
2994 | 3083 | ||
2995 | return memcg; | 3084 | return memcg; |
@@ -3005,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page) | |||
3005 | if (page_mapped(page)) | 3094 | if (page_mapped(page)) |
3006 | return; | 3095 | return; |
3007 | VM_BUG_ON(page->mapping && !PageAnon(page)); | 3096 | VM_BUG_ON(page->mapping && !PageAnon(page)); |
3008 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 3097 | if (PageSwapCache(page)) |
3098 | return; | ||
3099 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); | ||
3009 | } | 3100 | } |
3010 | 3101 | ||
3011 | void mem_cgroup_uncharge_cache_page(struct page *page) | 3102 | void mem_cgroup_uncharge_cache_page(struct page *page) |
3012 | { | 3103 | { |
3013 | VM_BUG_ON(page_mapped(page)); | 3104 | VM_BUG_ON(page_mapped(page)); |
3014 | VM_BUG_ON(page->mapping); | 3105 | VM_BUG_ON(page->mapping); |
3015 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 3106 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); |
3016 | } | 3107 | } |
3017 | 3108 | ||
3018 | /* | 3109 | /* |
@@ -3076,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
3076 | if (!swapout) /* this was a swap cache but the swap is unused ! */ | 3167 | if (!swapout) /* this was a swap cache but the swap is unused ! */ |
3077 | ctype = MEM_CGROUP_CHARGE_TYPE_DROP; | 3168 | ctype = MEM_CGROUP_CHARGE_TYPE_DROP; |
3078 | 3169 | ||
3079 | memcg = __mem_cgroup_uncharge_common(page, ctype); | 3170 | memcg = __mem_cgroup_uncharge_common(page, ctype, false); |
3080 | 3171 | ||
3081 | /* | 3172 | /* |
3082 | * record memcg information, if swapout && memcg != NULL, | 3173 | * record memcg information, if swapout && memcg != NULL, |
@@ -3087,7 +3178,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
3087 | } | 3178 | } |
3088 | #endif | 3179 | #endif |
3089 | 3180 | ||
3090 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3181 | #ifdef CONFIG_MEMCG_SWAP |
3091 | /* | 3182 | /* |
3092 | * called from swap_entry_free(). remove record in swap_cgroup and | 3183 | * called from swap_entry_free(). remove record in swap_cgroup and |
3093 | * uncharge "memsw" account. | 3184 | * uncharge "memsw" account. |
@@ -3166,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3166 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old | 3257 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old |
3167 | * page belongs to. | 3258 | * page belongs to. |
3168 | */ | 3259 | */ |
3169 | int mem_cgroup_prepare_migration(struct page *page, | 3260 | void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, |
3170 | struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) | 3261 | struct mem_cgroup **memcgp) |
3171 | { | 3262 | { |
3172 | struct mem_cgroup *memcg = NULL; | 3263 | struct mem_cgroup *memcg = NULL; |
3173 | struct page_cgroup *pc; | 3264 | struct page_cgroup *pc; |
3174 | enum charge_type ctype; | 3265 | enum charge_type ctype; |
3175 | int ret = 0; | ||
3176 | 3266 | ||
3177 | *memcgp = NULL; | 3267 | *memcgp = NULL; |
3178 | 3268 | ||
3179 | VM_BUG_ON(PageTransHuge(page)); | 3269 | VM_BUG_ON(PageTransHuge(page)); |
3180 | if (mem_cgroup_disabled()) | 3270 | if (mem_cgroup_disabled()) |
3181 | return 0; | 3271 | return; |
3182 | 3272 | ||
3183 | pc = lookup_page_cgroup(page); | 3273 | pc = lookup_page_cgroup(page); |
3184 | lock_page_cgroup(pc); | 3274 | lock_page_cgroup(pc); |
@@ -3223,24 +3313,9 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3223 | * we return here. | 3313 | * we return here. |
3224 | */ | 3314 | */ |
3225 | if (!memcg) | 3315 | if (!memcg) |
3226 | return 0; | 3316 | return; |
3227 | 3317 | ||
3228 | *memcgp = memcg; | 3318 | *memcgp = memcg; |
3229 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); | ||
3230 | css_put(&memcg->css);/* drop extra refcnt */ | ||
3231 | if (ret) { | ||
3232 | if (PageAnon(page)) { | ||
3233 | lock_page_cgroup(pc); | ||
3234 | ClearPageCgroupMigration(pc); | ||
3235 | unlock_page_cgroup(pc); | ||
3236 | /* | ||
3237 | * The old page may be fully unmapped while we kept it. | ||
3238 | */ | ||
3239 | mem_cgroup_uncharge_page(page); | ||
3240 | } | ||
3241 | /* we'll need to revisit this error code (we have -EINTR) */ | ||
3242 | return -ENOMEM; | ||
3243 | } | ||
3244 | /* | 3319 | /* |
3245 | * We charge new page before it's used/mapped. So, even if unlock_page() | 3320 | * We charge new page before it's used/mapped. So, even if unlock_page() |
3246 | * is called before end_migration, we can catch all events on this new | 3321 | * is called before end_migration, we can catch all events on this new |
@@ -3248,13 +3323,15 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3248 | * mapcount will be finally 0 and we call uncharge in end_migration(). | 3323 | * mapcount will be finally 0 and we call uncharge in end_migration(). |
3249 | */ | 3324 | */ |
3250 | if (PageAnon(page)) | 3325 | if (PageAnon(page)) |
3251 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | 3326 | ctype = MEM_CGROUP_CHARGE_TYPE_ANON; |
3252 | else if (page_is_file_cache(page)) | ||
3253 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
3254 | else | 3327 | else |
3255 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 3328 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
3329 | /* | ||
3330 | * The page is committed to the memcg, but it's not actually | ||
3331 | * charged to the res_counter since we plan on replacing the | ||
3332 | * old one and only one page is going to be left afterwards. | ||
3333 | */ | ||
3256 | __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); | 3334 | __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); |
3257 | return ret; | ||
3258 | } | 3335 | } |
3259 | 3336 | ||
3260 | /* remove redundant charge if migration failed*/ | 3337 | /* remove redundant charge if migration failed*/ |
@@ -3276,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3276 | used = newpage; | 3353 | used = newpage; |
3277 | unused = oldpage; | 3354 | unused = oldpage; |
3278 | } | 3355 | } |
3356 | anon = PageAnon(used); | ||
3357 | __mem_cgroup_uncharge_common(unused, | ||
3358 | anon ? MEM_CGROUP_CHARGE_TYPE_ANON | ||
3359 | : MEM_CGROUP_CHARGE_TYPE_CACHE, | ||
3360 | true); | ||
3361 | css_put(&memcg->css); | ||
3279 | /* | 3362 | /* |
3280 | * We disallowed uncharge of pages under migration because mapcount | 3363 | * We disallowed uncharge of pages under migration because mapcount |
3281 | * of the page goes down to zero, temporarly. | 3364 | * of the page goes down to zero, temporarly. |
@@ -3285,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3285 | lock_page_cgroup(pc); | 3368 | lock_page_cgroup(pc); |
3286 | ClearPageCgroupMigration(pc); | 3369 | ClearPageCgroupMigration(pc); |
3287 | unlock_page_cgroup(pc); | 3370 | unlock_page_cgroup(pc); |
3288 | anon = PageAnon(used); | ||
3289 | __mem_cgroup_uncharge_common(unused, | ||
3290 | anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED | ||
3291 | : MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
3292 | 3371 | ||
3293 | /* | 3372 | /* |
3294 | * If a page is a file cache, radix-tree replacement is very atomic | 3373 | * If a page is a file cache, radix-tree replacement is very atomic |
@@ -3340,10 +3419,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, | |||
3340 | */ | 3419 | */ |
3341 | if (!memcg) | 3420 | if (!memcg) |
3342 | return; | 3421 | return; |
3343 | |||
3344 | if (PageSwapBacked(oldpage)) | ||
3345 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
3346 | |||
3347 | /* | 3422 | /* |
3348 | * Even if newpage->mapping was NULL before starting replacement, | 3423 | * Even if newpage->mapping was NULL before starting replacement, |
3349 | * the newpage may be on LRU(or pagevec for LRU) already. We lock | 3424 | * the newpage may be on LRU(or pagevec for LRU) already. We lock |
@@ -3418,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3418 | /* | 3493 | /* |
3419 | * Rather than hide all in some function, I do this in | 3494 | * Rather than hide all in some function, I do this in |
3420 | * open coded manner. You see what this really does. | 3495 | * open coded manner. You see what this really does. |
3421 | * We have to guarantee memcg->res.limit < memcg->memsw.limit. | 3496 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. |
3422 | */ | 3497 | */ |
3423 | mutex_lock(&set_limit_mutex); | 3498 | mutex_lock(&set_limit_mutex); |
3424 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 3499 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); |
@@ -3479,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
3479 | /* | 3554 | /* |
3480 | * Rather than hide all in some function, I do this in | 3555 | * Rather than hide all in some function, I do this in |
3481 | * open coded manner. You see what this really does. | 3556 | * open coded manner. You see what this really does. |
3482 | * We have to guarantee memcg->res.limit < memcg->memsw.limit. | 3557 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. |
3483 | */ | 3558 | */ |
3484 | mutex_lock(&set_limit_mutex); | 3559 | mutex_lock(&set_limit_mutex); |
3485 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | 3560 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); |
@@ -3611,10 +3686,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3611 | } | 3686 | } |
3612 | 3687 | ||
3613 | /* | 3688 | /* |
3614 | * This routine traverse page_cgroup in given list and drop them all. | 3689 | * Traverse a specified page_cgroup list and try to drop them all. This doesn't |
3615 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 3690 | * reclaim the pages themselves - it just removes the page_cgroups. |
3691 | * Returns true if some page_cgroups were not freed, indicating that the caller | ||
3692 | * must retry this operation. | ||
3616 | */ | 3693 | */ |
3617 | static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | 3694 | static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, |
3618 | int node, int zid, enum lru_list lru) | 3695 | int node, int zid, enum lru_list lru) |
3619 | { | 3696 | { |
3620 | struct mem_cgroup_per_zone *mz; | 3697 | struct mem_cgroup_per_zone *mz; |
@@ -3622,7 +3699,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3622 | struct list_head *list; | 3699 | struct list_head *list; |
3623 | struct page *busy; | 3700 | struct page *busy; |
3624 | struct zone *zone; | 3701 | struct zone *zone; |
3625 | int ret = 0; | ||
3626 | 3702 | ||
3627 | zone = &NODE_DATA(node)->node_zones[zid]; | 3703 | zone = &NODE_DATA(node)->node_zones[zid]; |
3628 | mz = mem_cgroup_zoneinfo(memcg, node, zid); | 3704 | mz = mem_cgroup_zoneinfo(memcg, node, zid); |
@@ -3636,7 +3712,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3636 | struct page_cgroup *pc; | 3712 | struct page_cgroup *pc; |
3637 | struct page *page; | 3713 | struct page *page; |
3638 | 3714 | ||
3639 | ret = 0; | ||
3640 | spin_lock_irqsave(&zone->lru_lock, flags); | 3715 | spin_lock_irqsave(&zone->lru_lock, flags); |
3641 | if (list_empty(list)) { | 3716 | if (list_empty(list)) { |
3642 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3717 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
@@ -3653,21 +3728,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3653 | 3728 | ||
3654 | pc = lookup_page_cgroup(page); | 3729 | pc = lookup_page_cgroup(page); |
3655 | 3730 | ||
3656 | ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); | 3731 | if (mem_cgroup_move_parent(page, pc, memcg)) { |
3657 | if (ret == -ENOMEM || ret == -EINTR) | ||
3658 | break; | ||
3659 | |||
3660 | if (ret == -EBUSY || ret == -EINVAL) { | ||
3661 | /* found lock contention or "pc" is obsolete. */ | 3732 | /* found lock contention or "pc" is obsolete. */ |
3662 | busy = page; | 3733 | busy = page; |
3663 | cond_resched(); | 3734 | cond_resched(); |
3664 | } else | 3735 | } else |
3665 | busy = NULL; | 3736 | busy = NULL; |
3666 | } | 3737 | } |
3667 | 3738 | return !list_empty(list); | |
3668 | if (!ret && !list_empty(list)) | ||
3669 | return -EBUSY; | ||
3670 | return ret; | ||
3671 | } | 3739 | } |
3672 | 3740 | ||
3673 | /* | 3741 | /* |
@@ -3692,9 +3760,6 @@ move_account: | |||
3692 | ret = -EBUSY; | 3760 | ret = -EBUSY; |
3693 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | 3761 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) |
3694 | goto out; | 3762 | goto out; |
3695 | ret = -EINTR; | ||
3696 | if (signal_pending(current)) | ||
3697 | goto out; | ||
3698 | /* This is for making all *used* pages to be on LRU. */ | 3763 | /* This is for making all *used* pages to be on LRU. */ |
3699 | lru_add_drain_all(); | 3764 | lru_add_drain_all(); |
3700 | drain_all_stock_sync(memcg); | 3765 | drain_all_stock_sync(memcg); |
@@ -3715,9 +3780,6 @@ move_account: | |||
3715 | } | 3780 | } |
3716 | mem_cgroup_end_move(memcg); | 3781 | mem_cgroup_end_move(memcg); |
3717 | memcg_oom_recover(memcg); | 3782 | memcg_oom_recover(memcg); |
3718 | /* it seems parent cgroup doesn't have enough mem */ | ||
3719 | if (ret == -ENOMEM) | ||
3720 | goto try_to_free; | ||
3721 | cond_resched(); | 3783 | cond_resched(); |
3722 | /* "ret" should also be checked to ensure all lists are empty. */ | 3784 | /* "ret" should also be checked to ensure all lists are empty. */ |
3723 | } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); | 3785 | } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); |
@@ -3779,6 +3841,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3779 | parent_memcg = mem_cgroup_from_cont(parent); | 3841 | parent_memcg = mem_cgroup_from_cont(parent); |
3780 | 3842 | ||
3781 | cgroup_lock(); | 3843 | cgroup_lock(); |
3844 | |||
3845 | if (memcg->use_hierarchy == val) | ||
3846 | goto out; | ||
3847 | |||
3782 | /* | 3848 | /* |
3783 | * If parent's use_hierarchy is set, we can't make any modifications | 3849 | * If parent's use_hierarchy is set, we can't make any modifications |
3784 | * in the child subtrees. If it is unset, then the change can | 3850 | * in the child subtrees. If it is unset, then the change can |
@@ -3795,6 +3861,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3795 | retval = -EBUSY; | 3861 | retval = -EBUSY; |
3796 | } else | 3862 | } else |
3797 | retval = -EINVAL; | 3863 | retval = -EINVAL; |
3864 | |||
3865 | out: | ||
3798 | cgroup_unlock(); | 3866 | cgroup_unlock(); |
3799 | 3867 | ||
3800 | return retval; | 3868 | return retval; |
@@ -3831,7 +3899,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
3831 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); | 3899 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); |
3832 | 3900 | ||
3833 | if (swap) | 3901 | if (swap) |
3834 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); | 3902 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); |
3835 | 3903 | ||
3836 | return val << PAGE_SHIFT; | 3904 | return val << PAGE_SHIFT; |
3837 | } | 3905 | } |
@@ -4015,7 +4083,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | |||
4015 | #endif | 4083 | #endif |
4016 | 4084 | ||
4017 | #ifdef CONFIG_NUMA | 4085 | #ifdef CONFIG_NUMA |
4018 | static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft, | 4086 | static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, |
4019 | struct seq_file *m) | 4087 | struct seq_file *m) |
4020 | { | 4088 | { |
4021 | int nid; | 4089 | int nid; |
@@ -4074,7 +4142,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) | |||
4074 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 4142 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); |
4075 | } | 4143 | } |
4076 | 4144 | ||
4077 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 4145 | static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, |
4078 | struct seq_file *m) | 4146 | struct seq_file *m) |
4079 | { | 4147 | { |
4080 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 4148 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
@@ -4082,7 +4150,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4082 | unsigned int i; | 4150 | unsigned int i; |
4083 | 4151 | ||
4084 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 4152 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
4085 | if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) | 4153 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) |
4086 | continue; | 4154 | continue; |
4087 | seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], | 4155 | seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], |
4088 | mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); | 4156 | mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); |
@@ -4109,7 +4177,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4109 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 4177 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
4110 | long long val = 0; | 4178 | long long val = 0; |
4111 | 4179 | ||
4112 | if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) | 4180 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) |
4113 | continue; | 4181 | continue; |
4114 | for_each_mem_cgroup_tree(mi, memcg) | 4182 | for_each_mem_cgroup_tree(mi, memcg) |
4115 | val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; | 4183 | val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; |
@@ -4533,7 +4601,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | |||
4533 | return 0; | 4601 | return 0; |
4534 | } | 4602 | } |
4535 | 4603 | ||
4536 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 4604 | #ifdef CONFIG_MEMCG_KMEM |
4537 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 4605 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
4538 | { | 4606 | { |
4539 | return mem_cgroup_sockets_init(memcg, ss); | 4607 | return mem_cgroup_sockets_init(memcg, ss); |
@@ -4588,7 +4656,7 @@ static struct cftype mem_cgroup_files[] = { | |||
4588 | }, | 4656 | }, |
4589 | { | 4657 | { |
4590 | .name = "stat", | 4658 | .name = "stat", |
4591 | .read_seq_string = mem_control_stat_show, | 4659 | .read_seq_string = memcg_stat_show, |
4592 | }, | 4660 | }, |
4593 | { | 4661 | { |
4594 | .name = "force_empty", | 4662 | .name = "force_empty", |
@@ -4620,10 +4688,10 @@ static struct cftype mem_cgroup_files[] = { | |||
4620 | #ifdef CONFIG_NUMA | 4688 | #ifdef CONFIG_NUMA |
4621 | { | 4689 | { |
4622 | .name = "numa_stat", | 4690 | .name = "numa_stat", |
4623 | .read_seq_string = mem_control_numa_stat_show, | 4691 | .read_seq_string = memcg_numa_stat_show, |
4624 | }, | 4692 | }, |
4625 | #endif | 4693 | #endif |
4626 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4694 | #ifdef CONFIG_MEMCG_SWAP |
4627 | { | 4695 | { |
4628 | .name = "memsw.usage_in_bytes", | 4696 | .name = "memsw.usage_in_bytes", |
4629 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 4697 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
@@ -4810,7 +4878,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
4810 | } | 4878 | } |
4811 | EXPORT_SYMBOL(parent_mem_cgroup); | 4879 | EXPORT_SYMBOL(parent_mem_cgroup); |
4812 | 4880 | ||
4813 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4881 | #ifdef CONFIG_MEMCG_SWAP |
4814 | static void __init enable_swap_cgroup(void) | 4882 | static void __init enable_swap_cgroup(void) |
4815 | { | 4883 | { |
4816 | if (!mem_cgroup_disabled() && really_do_swap_account) | 4884 | if (!mem_cgroup_disabled() && really_do_swap_account) |
@@ -5541,7 +5609,7 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
5541 | .__DEPRECATED_clear_css_refs = true, | 5609 | .__DEPRECATED_clear_css_refs = true, |
5542 | }; | 5610 | }; |
5543 | 5611 | ||
5544 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 5612 | #ifdef CONFIG_MEMCG_SWAP |
5545 | static int __init enable_swap_account(char *s) | 5613 | static int __init enable_swap_account(char *s) |
5546 | { | 5614 | { |
5547 | /* consider enabled if no parameter or 1 is given */ | 5615 | /* consider enabled if no parameter or 1 is given */ |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6de0d613bbe6..a6e2141a6610 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p) | |||
128 | * can only guarantee that the page either belongs to the memcg tasks, or is | 128 | * can only guarantee that the page either belongs to the memcg tasks, or is |
129 | * a freed page. | 129 | * a freed page. |
130 | */ | 130 | */ |
131 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 131 | #ifdef CONFIG_MEMCG_SWAP |
132 | u64 hwpoison_filter_memcg; | 132 | u64 hwpoison_filter_memcg; |
133 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); | 133 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); |
134 | static int hwpoison_filter_task(struct page *p) | 134 | static int hwpoison_filter_task(struct page *p) |
@@ -1416,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1416 | int ret; | 1416 | int ret; |
1417 | unsigned long pfn = page_to_pfn(page); | 1417 | unsigned long pfn = page_to_pfn(page); |
1418 | struct page *hpage = compound_head(page); | 1418 | struct page *hpage = compound_head(page); |
1419 | LIST_HEAD(pagelist); | ||
1420 | 1419 | ||
1421 | ret = get_any_page(page, pfn, flags); | 1420 | ret = get_any_page(page, pfn, flags); |
1422 | if (ret < 0) | 1421 | if (ret < 0) |
@@ -1431,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1431 | } | 1430 | } |
1432 | 1431 | ||
1433 | /* Keep page count to indicate a given hugepage is isolated. */ | 1432 | /* Keep page count to indicate a given hugepage is isolated. */ |
1434 | 1433 | ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, | |
1435 | list_add(&hpage->lru, &pagelist); | ||
1436 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false, | ||
1437 | MIGRATE_SYNC); | 1434 | MIGRATE_SYNC); |
1435 | put_page(hpage); | ||
1438 | if (ret) { | 1436 | if (ret) { |
1439 | struct page *page1, *page2; | ||
1440 | list_for_each_entry_safe(page1, page2, &pagelist, lru) | ||
1441 | put_page(page1); | ||
1442 | |||
1443 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1437 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1444 | pfn, ret, page->flags); | 1438 | pfn, ret, page->flags); |
1445 | if (ret > 0) | ||
1446 | ret = -EIO; | ||
1447 | return ret; | 1439 | return ret; |
1448 | } | 1440 | } |
1449 | done: | 1441 | done: |
1450 | if (!PageHWPoison(hpage)) | 1442 | if (!PageHWPoison(hpage)) |
1451 | atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); | 1443 | atomic_long_add(1 << compound_trans_order(hpage), |
1444 | &mce_bad_pages); | ||
1452 | set_page_hwpoison_huge_page(hpage); | 1445 | set_page_hwpoison_huge_page(hpage); |
1453 | dequeue_hwpoisoned_huge_page(hpage); | 1446 | dequeue_hwpoisoned_huge_page(hpage); |
1454 | /* keep elevated page count for bad page */ | 1447 | /* keep elevated page count for bad page */ |
diff --git a/mm/memory.c b/mm/memory.c index 91f69459d3e8..482f089765ff 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1343,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1343 | * Since no pte has actually been setup, it is | 1343 | * Since no pte has actually been setup, it is |
1344 | * safe to do nothing in this case. | 1344 | * safe to do nothing in this case. |
1345 | */ | 1345 | */ |
1346 | if (vma->vm_file) | 1346 | if (vma->vm_file) { |
1347 | unmap_hugepage_range(vma, start, end, NULL); | 1347 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); |
1348 | __unmap_hugepage_range_final(tlb, vma, start, end, NULL); | ||
1349 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | ||
1350 | } | ||
1348 | } else | 1351 | } else |
1349 | unmap_page_range(tlb, vma, start, end, details); | 1352 | unmap_page_range(tlb, vma, start, end, details); |
1350 | } | 1353 | } |
@@ -3938,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip) | |||
3938 | free_page((unsigned long)buf); | 3941 | free_page((unsigned long)buf); |
3939 | } | 3942 | } |
3940 | } | 3943 | } |
3941 | up_read(¤t->mm->mmap_sem); | 3944 | up_read(&mm->mmap_sem); |
3942 | } | 3945 | } |
3943 | 3946 | ||
3944 | #ifdef CONFIG_PROVE_LOCKING | 3947 | #ifdef CONFIG_PROVE_LOCKING |
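
The memory.c hunk closes the race during teardown of hugetlbfs shared page tables by taking the file mapping's i_mmap_mutex around __unmap_hugepage_range_final(), so another process sharing the same page tables can never walk them mid-teardown. The locking discipline — serialize the final teardown and any concurrent walker on one per-mapping mutex — looks like this in a plain pthreads model; the mapping, walker and teardown below are stand-ins, not kernel code:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* one shared "mapping": a mutex plus state a walker must never see torn */
    struct mapping_model {
        pthread_mutex_t i_mmap_mutex;
        bool page_tables_present;
    };

    static struct mapping_model mapping = {
        .i_mmap_mutex = PTHREAD_MUTEX_INITIALIZER,
        .page_tables_present = true,
    };

    /* final teardown: only ever done with the mapping-wide mutex held */
    static void *teardown(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&mapping.i_mmap_mutex);
        mapping.page_tables_present = false;   /* the final-unmap stand-in */
        pthread_mutex_unlock(&mapping.i_mmap_mutex);
        return NULL;
    }

    /*
     * A sharer walking the same tables takes the same mutex, so it sees
     * either the fully present or the fully torn-down state, never half.
     */
    static void *walker(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&mapping.i_mmap_mutex);
        printf("walker sees page tables present: %d\n",
               mapping.page_tables_present);
        pthread_mutex_unlock(&mapping.i_mmap_mutex);
        return NULL;
    }

    int main(void)
    {
        pthread_t t1, t2;

        pthread_create(&t1, NULL, walker, NULL);
        pthread_create(&t2, NULL, teardown, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        return 0;
    }
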
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 427bb291dd0f..3ad25f9d1fc1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -512,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
512 | 512 | ||
513 | zone->present_pages += onlined_pages; | 513 | zone->present_pages += onlined_pages; |
514 | zone->zone_pgdat->node_present_pages += onlined_pages; | 514 | zone->zone_pgdat->node_present_pages += onlined_pages; |
515 | if (need_zonelists_rebuild) | 515 | if (onlined_pages) { |
516 | build_all_zonelists(zone); | 516 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); |
517 | else | 517 | if (need_zonelists_rebuild) |
518 | zone_pcp_update(zone); | 518 | build_all_zonelists(NULL, zone); |
519 | else | ||
520 | zone_pcp_update(zone); | ||
521 | } | ||
519 | 522 | ||
520 | mutex_unlock(&zonelists_mutex); | 523 | mutex_unlock(&zonelists_mutex); |
521 | 524 | ||
522 | init_per_zone_wmark_min(); | 525 | init_per_zone_wmark_min(); |
523 | 526 | ||
524 | if (onlined_pages) { | 527 | if (onlined_pages) |
525 | kswapd_run(zone_to_nid(zone)); | 528 | kswapd_run(zone_to_nid(zone)); |
526 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | ||
527 | } | ||
528 | 529 | ||
529 | vm_total_pages = nr_free_pagecache_pages(); | 530 | vm_total_pages = nr_free_pagecache_pages(); |
530 | 531 | ||
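The reordered online_pages() tail only touches node state and the zonelists when pages were actually onlined, and picks the cheaper per-cpu update when no full rebuild is required. A minimal standalone sketch of that decision; the helper names below are made up and only loosely mirror build_all_zonelists() and zone_pcp_update().

#include <stdbool.h>
#include <stdio.h>

static bool node_has_memory;

static void full_rebuild(void)  { puts("rebuilding zonelists"); }
static void cheap_refresh(void) { puts("updating per-cpu pagesets"); }

/* Only do work when pages actually came online; otherwise leave all
 * node-level state untouched. */
static void finish_online(unsigned long onlined_pages, bool need_rebuild)
{
    if (!onlined_pages)
        return;
    node_has_memory = true;         /* ~node_set_state(..., N_HIGH_MEMORY) */
    if (need_rebuild)
        full_rebuild();
    else
        cheap_refresh();
}

int main(void)
{
    finish_online(0, true);         /* nothing onlined: no side effects */
    finish_online(4096, false);     /* cheap path */
    printf("node_has_memory=%d\n", node_has_memory);
    return 0;
}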
@@ -562,7 +563,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
562 | * to access not-initialized zonelist, build here. | 563 | * to access not-initialized zonelist, build here. |
563 | */ | 564 | */ |
564 | mutex_lock(&zonelists_mutex); | 565 | mutex_lock(&zonelists_mutex); |
565 | build_all_zonelists(NULL); | 566 | build_all_zonelists(pgdat, NULL); |
566 | mutex_unlock(&zonelists_mutex); | 567 | mutex_unlock(&zonelists_mutex); |
567 | 568 | ||
568 | return pgdat; | 569 | return pgdat; |
@@ -965,6 +966,9 @@ repeat: | |||
965 | 966 | ||
966 | init_per_zone_wmark_min(); | 967 | init_per_zone_wmark_min(); |
967 | 968 | ||
969 | if (!populated_zone(zone)) | ||
970 | zone_pcp_reset(zone); | ||
971 | |||
968 | if (!node_present_pages(node)) { | 972 | if (!node_present_pages(node)) { |
969 | node_clear_state(node, N_HIGH_MEMORY); | 973 | node_clear_state(node, N_HIGH_MEMORY); |
970 | kswapd_stop(node); | 974 | kswapd_stop(node); |
diff --git a/mm/migrate.c b/mm/migrate.c index be26d5cbe56b..77ed2d773705 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/memcontrol.h> | 33 | #include <linux/memcontrol.h> |
34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
35 | #include <linux/hugetlb.h> | 35 | #include <linux/hugetlb.h> |
36 | #include <linux/hugetlb_cgroup.h> | ||
36 | #include <linux/gfp.h> | 37 | #include <linux/gfp.h> |
37 | 38 | ||
38 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
@@ -682,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
682 | { | 683 | { |
683 | int rc = -EAGAIN; | 684 | int rc = -EAGAIN; |
684 | int remap_swapcache = 1; | 685 | int remap_swapcache = 1; |
685 | int charge = 0; | ||
686 | struct mem_cgroup *mem; | 686 | struct mem_cgroup *mem; |
687 | struct anon_vma *anon_vma = NULL; | 687 | struct anon_vma *anon_vma = NULL; |
688 | 688 | ||
@@ -724,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
724 | } | 724 | } |
725 | 725 | ||
726 | /* charge against new page */ | 726 | /* charge against new page */ |
727 | charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); | 727 | mem_cgroup_prepare_migration(page, newpage, &mem); |
728 | if (charge == -ENOMEM) { | ||
729 | rc = -ENOMEM; | ||
730 | goto unlock; | ||
731 | } | ||
732 | BUG_ON(charge); | ||
733 | 728 | ||
734 | if (PageWriteback(page)) { | 729 | if (PageWriteback(page)) { |
735 | /* | 730 | /* |
@@ -819,8 +814,7 @@ skip_unmap: | |||
819 | put_anon_vma(anon_vma); | 814 | put_anon_vma(anon_vma); |
820 | 815 | ||
821 | uncharge: | 816 | uncharge: |
822 | if (!charge) | 817 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); |
823 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); | ||
824 | unlock: | 818 | unlock: |
825 | unlock_page(page); | 819 | unlock_page(page); |
826 | out: | 820 | out: |
@@ -931,16 +925,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
931 | 925 | ||
932 | if (anon_vma) | 926 | if (anon_vma) |
933 | put_anon_vma(anon_vma); | 927 | put_anon_vma(anon_vma); |
934 | unlock_page(hpage); | ||
935 | 928 | ||
936 | out: | 929 | if (!rc) |
937 | if (rc != -EAGAIN) { | 930 | hugetlb_cgroup_migrate(hpage, new_hpage); |
938 | list_del(&hpage->lru); | ||
939 | put_page(hpage); | ||
940 | } | ||
941 | 931 | ||
932 | unlock_page(hpage); | ||
933 | out: | ||
942 | put_page(new_hpage); | 934 | put_page(new_hpage); |
943 | |||
944 | if (result) { | 935 | if (result) { |
945 | if (rc) | 936 | if (rc) |
946 | *result = rc; | 937 | *result = rc; |
@@ -1016,48 +1007,32 @@ out: | |||
1016 | return nr_failed + retry; | 1007 | return nr_failed + retry; |
1017 | } | 1008 | } |
1018 | 1009 | ||
1019 | int migrate_huge_pages(struct list_head *from, | 1010 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, |
1020 | new_page_t get_new_page, unsigned long private, bool offlining, | 1011 | unsigned long private, bool offlining, |
1021 | enum migrate_mode mode) | 1012 | enum migrate_mode mode) |
1022 | { | 1013 | { |
1023 | int retry = 1; | 1014 | int pass, rc; |
1024 | int nr_failed = 0; | 1015 | |
1025 | int pass = 0; | 1016 | for (pass = 0; pass < 10; pass++) { |
1026 | struct page *page; | 1017 | rc = unmap_and_move_huge_page(get_new_page, |
1027 | struct page *page2; | 1018 | private, hpage, pass > 2, offlining, |
1028 | int rc; | 1019 | mode); |
1029 | 1020 | switch (rc) { | |
1030 | for (pass = 0; pass < 10 && retry; pass++) { | 1021 | case -ENOMEM: |
1031 | retry = 0; | 1022 | goto out; |
1032 | 1023 | case -EAGAIN: | |
1033 | list_for_each_entry_safe(page, page2, from, lru) { | 1024 | /* try again */ |
1034 | cond_resched(); | 1025 | cond_resched(); |
1035 | 1026 | break; | |
1036 | rc = unmap_and_move_huge_page(get_new_page, | 1027 | case 0: |
1037 | private, page, pass > 2, offlining, | 1028 | goto out; |
1038 | mode); | 1029 | default: |
1039 | 1030 | rc = -EIO; | |
1040 | switch(rc) { | 1031 | goto out; |
1041 | case -ENOMEM: | ||
1042 | goto out; | ||
1043 | case -EAGAIN: | ||
1044 | retry++; | ||
1045 | break; | ||
1046 | case 0: | ||
1047 | break; | ||
1048 | default: | ||
1049 | /* Permanent failure */ | ||
1050 | nr_failed++; | ||
1051 | break; | ||
1052 | } | ||
1053 | } | 1032 | } |
1054 | } | 1033 | } |
1055 | rc = 0; | ||
1056 | out: | 1034 | out: |
1057 | if (rc) | 1035 | return rc; |
1058 | return rc; | ||
1059 | |||
1060 | return nr_failed + retry; | ||
1061 | } | 1036 | } |
1062 | 1037 | ||
1063 | #ifdef CONFIG_NUMA | 1038 | #ifdef CONFIG_NUMA |
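The rewritten migrate_huge_page() replaces the old list walk with a bounded retry loop over a single page: -EAGAIN is retried for up to ten passes, success and -ENOMEM stop immediately, and any other error is collapsed to -EIO. A generic userspace sketch of that control flow follows; do_one_attempt() is a made-up stand-in for unmap_and_move_huge_page(), and sched_yield() loosely plays the role of cond_resched().

#include <errno.h>
#include <sched.h>
#include <stdio.h>

/* Illustrative stand-in for one migration attempt; returns 0, -EAGAIN,
 * -ENOMEM or some other negative error. */
static int do_one_attempt(int pass)
{
    return pass < 3 ? -EAGAIN : 0;   /* pretend it succeeds on the 4th pass */
}

/* Bounded retry: transient failures are retried a fixed number of times,
 * hard failures and success end the loop at once. */
static int migrate_one(void)
{
    int pass, rc = -EAGAIN;

    for (pass = 0; pass < 10; pass++) {
        rc = do_one_attempt(pass);
        switch (rc) {
        case -ENOMEM:
        case 0:
            return rc;          /* out of memory or done: stop now */
        case -EAGAIN:
            sched_yield();      /* give others a chance, then retry */
            break;
        default:
            return -EIO;        /* permanent failure */
        }
    }
    return rc;                  /* still -EAGAIN after all passes */
}

int main(void)
{
    printf("migrate_one() = %d\n", migrate_one());
    return 0;
}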
@@ -943,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
943 | const unsigned long stack_flags | 943 | const unsigned long stack_flags |
944 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); | 944 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); |
945 | 945 | ||
946 | mm->total_vm += pages; | ||
947 | |||
946 | if (file) { | 948 | if (file) { |
947 | mm->shared_vm += pages; | 949 | mm->shared_vm += pages; |
948 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) | 950 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) |
@@ -1347,7 +1349,6 @@ munmap_back: | |||
1347 | out: | 1349 | out: |
1348 | perf_event_mmap(vma); | 1350 | perf_event_mmap(vma); |
1349 | 1351 | ||
1350 | mm->total_vm += len >> PAGE_SHIFT; | ||
1351 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1352 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1352 | if (vm_flags & VM_LOCKED) { | 1353 | if (vm_flags & VM_LOCKED) { |
1353 | if (!mlock_vma_pages_range(vma, addr, addr + len)) | 1354 | if (!mlock_vma_pages_range(vma, addr, addr + len)) |
@@ -1707,7 +1708,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
1707 | return -ENOMEM; | 1708 | return -ENOMEM; |
1708 | 1709 | ||
1709 | /* Ok, everything looks good - let it rip */ | 1710 | /* Ok, everything looks good - let it rip */ |
1710 | mm->total_vm += grow; | ||
1711 | if (vma->vm_flags & VM_LOCKED) | 1711 | if (vma->vm_flags & VM_LOCKED) |
1712 | mm->locked_vm += grow; | 1712 | mm->locked_vm += grow; |
1713 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); | 1713 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); |
@@ -1889,7 +1889,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) | |||
1889 | 1889 | ||
1890 | if (vma->vm_flags & VM_ACCOUNT) | 1890 | if (vma->vm_flags & VM_ACCOUNT) |
1891 | nr_accounted += nrpages; | 1891 | nr_accounted += nrpages; |
1892 | mm->total_vm -= nrpages; | ||
1893 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); | 1892 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); |
1894 | vma = remove_vma(vma); | 1893 | vma = remove_vma(vma); |
1895 | } while (vma); | 1894 | } while (vma); |
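In the mmap.c hunks above (and the matching mremap.c hunks further down), the open-coded mm->total_vm updates disappear and the counter is maintained inside vm_stat_account() instead, so every caller that already reports per-type statistics keeps the total consistent as well. A toy version of that one-accounting-helper shape, with invented struct and field names:

#include <stdbool.h>
#include <stdio.h>

struct vm_counters {
    long total;     /* every accounted range lands here exactly once */
    long shared;    /* file-backed ranges */
    long stack;     /* stack-like ranges */
};

/* Single entry point for accounting: callers pass a signed page delta and
 * never touch the counters directly, so total cannot drift out of sync. */
static void vm_account(struct vm_counters *c, long pages, bool file, bool stack)
{
    c->total += pages;
    if (file)
        c->shared += pages;
    if (stack)
        c->stack += pages;
}

int main(void)
{
    struct vm_counters c = { 0, 0, 0 };

    vm_account(&c, 16, true, false);    /* mmap of a file */
    vm_account(&c, 4, false, true);     /* stack growth */
    vm_account(&c, -16, true, false);   /* munmap */
    printf("total=%ld shared=%ld stack=%ld\n", c.total, c.shared, c.stack);
    return 0;
}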
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 9a611d3a1848..862b60822d9f 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -33,6 +33,24 @@ | |||
33 | void __mmu_notifier_release(struct mm_struct *mm) | 33 | void __mmu_notifier_release(struct mm_struct *mm) |
34 | { | 34 | { |
35 | struct mmu_notifier *mn; | 35 | struct mmu_notifier *mn; |
36 | struct hlist_node *n; | ||
37 | |||
38 | /* | ||
39 | * RCU here will block mmu_notifier_unregister until | ||
40 | * ->release returns. | ||
41 | */ | ||
42 | rcu_read_lock(); | ||
43 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) | ||
44 | /* | ||
45 | * if ->release runs before mmu_notifier_unregister it | ||
46 | * must be handled as it's the only way for the driver | ||
47 | * to flush all existing sptes and stop the driver | ||
48 | * from establishing any more sptes before all the | ||
49 | * pages in the mm are freed. | ||
50 | */ | ||
51 | if (mn->ops->release) | ||
52 | mn->ops->release(mn, mm); | ||
53 | rcu_read_unlock(); | ||
36 | 54 | ||
37 | spin_lock(&mm->mmu_notifier_mm->lock); | 55 | spin_lock(&mm->mmu_notifier_mm->lock); |
38 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 56 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
46 | * mmu_notifier_unregister to return. | 64 | * mmu_notifier_unregister to return. |
47 | */ | 65 | */ |
48 | hlist_del_init_rcu(&mn->hlist); | 66 | hlist_del_init_rcu(&mn->hlist); |
49 | /* | ||
50 | * RCU here will block mmu_notifier_unregister until | ||
51 | * ->release returns. | ||
52 | */ | ||
53 | rcu_read_lock(); | ||
54 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
55 | /* | ||
56 | * if ->release runs before mmu_notifier_unregister it | ||
57 | * must be handled as it's the only way for the driver | ||
58 | * to flush all existing sptes and stop the driver | ||
59 | * from establishing any more sptes before all the | ||
60 | * pages in the mm are freed. | ||
61 | */ | ||
62 | if (mn->ops->release) | ||
63 | mn->ops->release(mn, mm); | ||
64 | rcu_read_unlock(); | ||
65 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
66 | } | 67 | } |
67 | spin_unlock(&mm->mmu_notifier_mm->lock); | 68 | spin_unlock(&mm->mmu_notifier_mm->lock); |
68 | 69 | ||
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
284 | { | 285 | { |
285 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 286 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
286 | 287 | ||
287 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
288 | if (!hlist_unhashed(&mn->hlist)) { | 288 | if (!hlist_unhashed(&mn->hlist)) { |
289 | hlist_del_rcu(&mn->hlist); | ||
290 | |||
291 | /* | 289 | /* |
292 | * RCU here will force exit_mmap to wait ->release to finish | 290 | * RCU here will force exit_mmap to wait ->release to finish |
293 | * before freeing the pages. | 291 | * before freeing the pages. |
294 | */ | 292 | */ |
295 | rcu_read_lock(); | 293 | rcu_read_lock(); |
296 | spin_unlock(&mm->mmu_notifier_mm->lock); | 294 | |
297 | /* | 295 | /* |
298 | * exit_mmap will block in mmu_notifier_release to | 296 | * exit_mmap will block in mmu_notifier_release to |
299 | * guarantee ->release is called before freeing the | 297 | * guarantee ->release is called before freeing the |
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
302 | if (mn->ops->release) | 300 | if (mn->ops->release) |
303 | mn->ops->release(mn, mm); | 301 | mn->ops->release(mn, mm); |
304 | rcu_read_unlock(); | 302 | rcu_read_unlock(); |
305 | } else | 303 | |
304 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
305 | hlist_del_rcu(&mn->hlist); | ||
306 | spin_unlock(&mm->mmu_notifier_mm->lock); | 306 | spin_unlock(&mm->mmu_notifier_mm->lock); |
307 | } | ||
307 | 308 | ||
308 | /* | 309 | /* |
309 | * Wait any running method to finish, of course including | 310 | * Wait any running method to finish, of course including |
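Both mmu_notifier.c hunks move the ->release() callbacks so they run under rcu_read_lock() before the entry is removed from the list, with hlist_del_rcu() now done afterwards under the spinlock. The userspace sketch below only illustrates that ordering (run the callbacks while readers may still see the entries, then unlink under the writer-side lock); the rwlock is a crude stand-in for RCU and every name is invented.

#include <pthread.h>
#include <stdio.h>

struct notifier {
    void (*release)(struct notifier *n);
    struct notifier *next;
};

static pthread_rwlock_t list_rwlock = PTHREAD_RWLOCK_INITIALIZER; /* ~RCU */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;     /* writer side */
static struct notifier *head;

/* Teardown path: run every ->release() while readers are still allowed to
 * see the list, and only then unlink the entries under the writer lock. */
static void release_all(void)
{
    struct notifier *n;

    pthread_rwlock_rdlock(&list_rwlock);
    for (n = head; n; n = n->next)
        if (n->release)
            n->release(n);      /* callback sees a still-linked entry */
    pthread_rwlock_unlock(&list_rwlock);

    pthread_mutex_lock(&list_lock);
    head = NULL;                /* unlink everything after callbacks ran */
    pthread_mutex_unlock(&list_lock);
}

static void say_bye(struct notifier *n) { (void)n; puts("release called"); }

int main(void)
{
    struct notifier a = { say_bye, NULL };

    head = &a;
    release_all();
    return 0;
}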
diff --git a/mm/mmzone.c b/mm/mmzone.c index 6830eab5bf09..3cef80f6ac79 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -96,7 +96,7 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone) | |||
96 | for_each_lru(lru) | 96 | for_each_lru(lru) |
97 | INIT_LIST_HEAD(&lruvec->lists[lru]); | 97 | INIT_LIST_HEAD(&lruvec->lists[lru]); |
98 | 98 | ||
99 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 99 | #ifdef CONFIG_MEMCG |
100 | lruvec->zone = zone; | 100 | lruvec->zone = zone; |
101 | #endif | 101 | #endif |
102 | } | 102 | } |
diff --git a/mm/mremap.c b/mm/mremap.c index 21fed202ddad..cc06d0e48d05 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
260 | * If this were a serious issue, we'd add a flag to do_munmap(). | 260 | * If this were a serious issue, we'd add a flag to do_munmap(). |
261 | */ | 261 | */ |
262 | hiwater_vm = mm->hiwater_vm; | 262 | hiwater_vm = mm->hiwater_vm; |
263 | mm->total_vm += new_len >> PAGE_SHIFT; | ||
264 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); | 263 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); |
265 | 264 | ||
266 | if (do_munmap(mm, old_addr, old_len) < 0) { | 265 | if (do_munmap(mm, old_addr, old_len) < 0) { |
@@ -497,7 +496,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
497 | goto out; | 496 | goto out; |
498 | } | 497 | } |
499 | 498 | ||
500 | mm->total_vm += pages; | ||
501 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); | 499 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
502 | if (vma->vm_flags & VM_LOCKED) { | 500 | if (vma->vm_flags & VM_LOCKED) { |
503 | mm->locked_vm += pages; | 501 | mm->locked_vm += pages; |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ac300c99baf6..198600861638 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -288,76 +288,93 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
288 | } | 288 | } |
289 | #endif | 289 | #endif |
290 | 290 | ||
291 | enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | ||
292 | unsigned long totalpages, const nodemask_t *nodemask, | ||
293 | bool force_kill) | ||
294 | { | ||
295 | if (task->exit_state) | ||
296 | return OOM_SCAN_CONTINUE; | ||
297 | if (oom_unkillable_task(task, NULL, nodemask)) | ||
298 | return OOM_SCAN_CONTINUE; | ||
299 | |||
300 | /* | ||
301 | * This task already has access to memory reserves and is being killed. | ||
302 | * Don't allow any other task to have access to the reserves. | ||
303 | */ | ||
304 | if (test_tsk_thread_flag(task, TIF_MEMDIE)) { | ||
305 | if (unlikely(frozen(task))) | ||
306 | __thaw_task(task); | ||
307 | if (!force_kill) | ||
308 | return OOM_SCAN_ABORT; | ||
309 | } | ||
310 | if (!task->mm) | ||
311 | return OOM_SCAN_CONTINUE; | ||
312 | |||
313 | if (task->flags & PF_EXITING) { | ||
314 | /* | ||
315 | * If task is current and is in the process of releasing memory, | ||
316 | * allow the "kill" to set TIF_MEMDIE, which will allow it to | ||
317 | * access memory reserves. Otherwise, it may stall forever. | ||
318 | * | ||
319 | * The iteration isn't broken here, however, in case other | ||
320 | * threads are found to have already been oom killed. | ||
321 | */ | ||
322 | if (task == current) | ||
323 | return OOM_SCAN_SELECT; | ||
324 | else if (!force_kill) { | ||
325 | /* | ||
326 | * If this task is not being ptraced on exit, then wait | ||
327 | * for it to finish before killing some other task | ||
328 | * unnecessarily. | ||
329 | */ | ||
330 | if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) | ||
331 | return OOM_SCAN_ABORT; | ||
332 | } | ||
333 | } | ||
334 | return OOM_SCAN_OK; | ||
335 | } | ||
336 | |||
291 | /* | 337 | /* |
292 | * Simple selection loop. We chose the process with the highest | 338 | * Simple selection loop. We chose the process with the highest |
293 | * number of 'points'. We expect the caller will lock the tasklist. | 339 | * number of 'points'. |
294 | * | 340 | * |
295 | * (not docbooked, we don't want this one cluttering up the manual) | 341 | * (not docbooked, we don't want this one cluttering up the manual) |
296 | */ | 342 | */ |
297 | static struct task_struct *select_bad_process(unsigned int *ppoints, | 343 | static struct task_struct *select_bad_process(unsigned int *ppoints, |
298 | unsigned long totalpages, struct mem_cgroup *memcg, | 344 | unsigned long totalpages, const nodemask_t *nodemask, |
299 | const nodemask_t *nodemask, bool force_kill) | 345 | bool force_kill) |
300 | { | 346 | { |
301 | struct task_struct *g, *p; | 347 | struct task_struct *g, *p; |
302 | struct task_struct *chosen = NULL; | 348 | struct task_struct *chosen = NULL; |
303 | unsigned long chosen_points = 0; | 349 | unsigned long chosen_points = 0; |
304 | 350 | ||
351 | rcu_read_lock(); | ||
305 | do_each_thread(g, p) { | 352 | do_each_thread(g, p) { |
306 | unsigned int points; | 353 | unsigned int points; |
307 | 354 | ||
308 | if (p->exit_state) | 355 | switch (oom_scan_process_thread(p, totalpages, nodemask, |
309 | continue; | 356 | force_kill)) { |
310 | if (oom_unkillable_task(p, memcg, nodemask)) | 357 | case OOM_SCAN_SELECT: |
311 | continue; | 358 | chosen = p; |
312 | 359 | chosen_points = ULONG_MAX; | |
313 | /* | 360 | /* fall through */ |
314 | * This task already has access to memory reserves and is | 361 | case OOM_SCAN_CONTINUE: |
315 | * being killed. Don't allow any other task access to the | ||
316 | * memory reserve. | ||
317 | * | ||
318 | * Note: this may have a chance of deadlock if it gets | ||
319 | * blocked waiting for another task which itself is waiting | ||
320 | * for memory. Is there a better alternative? | ||
321 | */ | ||
322 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) { | ||
323 | if (unlikely(frozen(p))) | ||
324 | __thaw_task(p); | ||
325 | if (!force_kill) | ||
326 | return ERR_PTR(-1UL); | ||
327 | } | ||
328 | if (!p->mm) | ||
329 | continue; | 362 | continue; |
330 | 363 | case OOM_SCAN_ABORT: | |
331 | if (p->flags & PF_EXITING) { | 364 | rcu_read_unlock(); |
332 | /* | 365 | return ERR_PTR(-1UL); |
333 | * If p is the current task and is in the process of | 366 | case OOM_SCAN_OK: |
334 | * releasing memory, we allow the "kill" to set | 367 | break; |
335 | * TIF_MEMDIE, which will allow it to gain access to | 368 | }; |
336 | * memory reserves. Otherwise, it may stall forever. | 369 | points = oom_badness(p, NULL, nodemask, totalpages); |
337 | * | ||
338 | * The loop isn't broken here, however, in case other | ||
339 | * threads are found to have already been oom killed. | ||
340 | */ | ||
341 | if (p == current) { | ||
342 | chosen = p; | ||
343 | chosen_points = ULONG_MAX; | ||
344 | } else if (!force_kill) { | ||
345 | /* | ||
346 | * If this task is not being ptraced on exit, | ||
347 | * then wait for it to finish before killing | ||
348 | * some other task unnecessarily. | ||
349 | */ | ||
350 | if (!(p->group_leader->ptrace & PT_TRACE_EXIT)) | ||
351 | return ERR_PTR(-1UL); | ||
352 | } | ||
353 | } | ||
354 | |||
355 | points = oom_badness(p, memcg, nodemask, totalpages); | ||
356 | if (points > chosen_points) { | 370 | if (points > chosen_points) { |
357 | chosen = p; | 371 | chosen = p; |
358 | chosen_points = points; | 372 | chosen_points = points; |
359 | } | 373 | } |
360 | } while_each_thread(g, p); | 374 | } while_each_thread(g, p); |
375 | if (chosen) | ||
376 | get_task_struct(chosen); | ||
377 | rcu_read_unlock(); | ||
361 | 378 | ||
362 | *ppoints = chosen_points * 1000 / totalpages; | 379 | *ppoints = chosen_points * 1000 / totalpages; |
363 | return chosen; | 380 | return chosen; |
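select_bad_process() now delegates all per-task policy to oom_scan_process_thread(), which returns a small enum that the loop simply switches on; OOM_SCAN_SELECT pins the candidate with ULONG_MAX points and falls through to the continue case. A stripped-down sketch of that classifier-plus-plain-loop structure, with invented task fields and scoring:

#include <limits.h>
#include <stdio.h>

enum scan_result { SCAN_SELECT, SCAN_CONTINUE, SCAN_ABORT, SCAN_OK };

struct task { const char *name; int exiting; int unkillable; unsigned long badness; };

/* All the per-task policy lives here; the caller never re-implements it. */
static enum scan_result classify(const struct task *t)
{
    if (t->unkillable)
        return SCAN_CONTINUE;
    if (t->exiting)
        return SCAN_SELECT;     /* already dying: prefer it outright */
    return SCAN_OK;
}

static const struct task *select_victim(const struct task *tasks, int n)
{
    const struct task *chosen = NULL;
    unsigned long chosen_points = 0;

    for (int i = 0; i < n; i++) {
        unsigned long points;

        switch (classify(&tasks[i])) {
        case SCAN_SELECT:
            chosen = &tasks[i];
            chosen_points = ULONG_MAX;
            /* fall through */
        case SCAN_CONTINUE:
            continue;
        case SCAN_ABORT:
            return NULL;        /* someone else is already being killed */
        case SCAN_OK:
            break;
        }
        points = tasks[i].badness;
        if (points > chosen_points) {
            chosen = &tasks[i];
            chosen_points = points;
        }
    }
    return chosen;
}

int main(void)
{
    struct task tasks[] = {
        { "init", 0, 1, 0 }, { "hog", 0, 0, 900 }, { "small", 0, 0, 10 },
    };
    const struct task *v = select_victim(tasks, 3);

    printf("victim: %s\n", v ? v->name : "(none)");
    return 0;
}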
@@ -371,17 +388,16 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
371 | * Dumps the current memory state of all eligible tasks. Tasks not in the same | 388 | * Dumps the current memory state of all eligible tasks. Tasks not in the same |
372 | * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes | 389 | * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes |
373 | * are not shown. | 390 | * are not shown. |
374 | * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj | 391 | * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, |
375 | * value, oom_score_adj value, and name. | 392 | * swapents, oom_score_adj value, and name. |
376 | * | ||
377 | * Call with tasklist_lock read-locked. | ||
378 | */ | 393 | */ |
379 | static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) | 394 | static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) |
380 | { | 395 | { |
381 | struct task_struct *p; | 396 | struct task_struct *p; |
382 | struct task_struct *task; | 397 | struct task_struct *task; |
383 | 398 | ||
384 | pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); | 399 | pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); |
400 | rcu_read_lock(); | ||
385 | for_each_process(p) { | 401 | for_each_process(p) { |
386 | if (oom_unkillable_task(p, memcg, nodemask)) | 402 | if (oom_unkillable_task(p, memcg, nodemask)) |
387 | continue; | 403 | continue; |
@@ -396,13 +412,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas | |||
396 | continue; | 412 | continue; |
397 | } | 413 | } |
398 | 414 | ||
399 | pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", | 415 | pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", |
400 | task->pid, from_kuid(&init_user_ns, task_uid(task)), | 416 | task->pid, from_kuid(&init_user_ns, task_uid(task)), |
401 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), | 417 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), |
402 | task_cpu(task), task->signal->oom_adj, | 418 | task->mm->nr_ptes, |
419 | get_mm_counter(task->mm, MM_SWAPENTS), | ||
403 | task->signal->oom_score_adj, task->comm); | 420 | task->signal->oom_score_adj, task->comm); |
404 | task_unlock(task); | 421 | task_unlock(task); |
405 | } | 422 | } |
423 | rcu_read_unlock(); | ||
406 | } | 424 | } |
407 | 425 | ||
408 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | 426 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, |
@@ -423,10 +441,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
423 | } | 441 | } |
424 | 442 | ||
425 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 443 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
426 | static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | 444 | /* |
427 | unsigned int points, unsigned long totalpages, | 445 | * Must be called while holding a reference to p, which will be released upon |
428 | struct mem_cgroup *memcg, nodemask_t *nodemask, | 446 | * returning. |
429 | const char *message) | 447 | */ |
448 | void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | ||
449 | unsigned int points, unsigned long totalpages, | ||
450 | struct mem_cgroup *memcg, nodemask_t *nodemask, | ||
451 | const char *message) | ||
430 | { | 452 | { |
431 | struct task_struct *victim = p; | 453 | struct task_struct *victim = p; |
432 | struct task_struct *child; | 454 | struct task_struct *child; |
@@ -442,6 +464,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
442 | */ | 464 | */ |
443 | if (p->flags & PF_EXITING) { | 465 | if (p->flags & PF_EXITING) { |
444 | set_tsk_thread_flag(p, TIF_MEMDIE); | 466 | set_tsk_thread_flag(p, TIF_MEMDIE); |
467 | put_task_struct(p); | ||
445 | return; | 468 | return; |
446 | } | 469 | } |
447 | 470 | ||
@@ -459,6 +482,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
459 | * parent. This attempts to lose the minimal amount of work done while | 482 | * parent. This attempts to lose the minimal amount of work done while |
460 | * still freeing memory. | 483 | * still freeing memory. |
461 | */ | 484 | */ |
485 | read_lock(&tasklist_lock); | ||
462 | do { | 486 | do { |
463 | list_for_each_entry(child, &t->children, sibling) { | 487 | list_for_each_entry(child, &t->children, sibling) { |
464 | unsigned int child_points; | 488 | unsigned int child_points; |
@@ -471,15 +495,26 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
471 | child_points = oom_badness(child, memcg, nodemask, | 495 | child_points = oom_badness(child, memcg, nodemask, |
472 | totalpages); | 496 | totalpages); |
473 | if (child_points > victim_points) { | 497 | if (child_points > victim_points) { |
498 | put_task_struct(victim); | ||
474 | victim = child; | 499 | victim = child; |
475 | victim_points = child_points; | 500 | victim_points = child_points; |
501 | get_task_struct(victim); | ||
476 | } | 502 | } |
477 | } | 503 | } |
478 | } while_each_thread(p, t); | 504 | } while_each_thread(p, t); |
505 | read_unlock(&tasklist_lock); | ||
479 | 506 | ||
480 | victim = find_lock_task_mm(victim); | 507 | rcu_read_lock(); |
481 | if (!victim) | 508 | p = find_lock_task_mm(victim); |
509 | if (!p) { | ||
510 | rcu_read_unlock(); | ||
511 | put_task_struct(victim); | ||
482 | return; | 512 | return; |
513 | } else if (victim != p) { | ||
514 | get_task_struct(p); | ||
515 | put_task_struct(victim); | ||
516 | victim = p; | ||
517 | } | ||
483 | 518 | ||
484 | /* mm cannot safely be dereferenced after task_unlock(victim) */ | 519 | /* mm cannot safely be dereferenced after task_unlock(victim) */ |
485 | mm = victim->mm; | 520 | mm = victim->mm; |
@@ -510,17 +545,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
510 | task_unlock(p); | 545 | task_unlock(p); |
511 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); | 546 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); |
512 | } | 547 | } |
548 | rcu_read_unlock(); | ||
513 | 549 | ||
514 | set_tsk_thread_flag(victim, TIF_MEMDIE); | 550 | set_tsk_thread_flag(victim, TIF_MEMDIE); |
515 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); | 551 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); |
552 | put_task_struct(victim); | ||
516 | } | 553 | } |
517 | #undef K | 554 | #undef K |
518 | 555 | ||
519 | /* | 556 | /* |
520 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. | 557 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. |
521 | */ | 558 | */ |
522 | static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | 559 | void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, |
523 | int order, const nodemask_t *nodemask) | 560 | int order, const nodemask_t *nodemask) |
524 | { | 561 | { |
525 | if (likely(!sysctl_panic_on_oom)) | 562 | if (likely(!sysctl_panic_on_oom)) |
526 | return; | 563 | return; |
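With tasklist_lock no longer held across the kill, the preceding oom_kill_process() hunks pin the chosen task with get_task_struct() and balance it on every path: the early PF_EXITING return, the swap to a better-scoring child, and the final kill. A tiny refcounting sketch of that discipline, using an invented object type rather than task_struct:

#include <stdio.h>

struct obj { const char *name; int refs; };

static void get_obj(struct obj *o) { o->refs++; }
static void put_obj(struct obj *o)
{
    if (--o->refs == 0)
        printf("%s freed\n", o->name);
}

/* Caller hands us a pinned victim; if we prefer another candidate we must
 * drop the old pin and take a new one, and always drop the pin on return. */
static void consume(struct obj *victim, struct obj *better)
{
    if (better) {
        put_obj(victim);        /* release the original candidate */
        victim = better;
        get_obj(victim);        /* pin the replacement */
    }
    printf("acting on %s (refs=%d)\n", victim->name, victim->refs);
    put_obj(victim);            /* balance the pin we were handed or took */
}

int main(void)
{
    struct obj a = { "task-a", 1 };   /* pinned by the selector */
    struct obj b = { "task-b", 1 };   /* pretend someone else holds a ref */

    consume(&a, &b);
    return 0;
}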
@@ -533,42 +570,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | |||
533 | if (constraint != CONSTRAINT_NONE) | 570 | if (constraint != CONSTRAINT_NONE) |
534 | return; | 571 | return; |
535 | } | 572 | } |
536 | read_lock(&tasklist_lock); | ||
537 | dump_header(NULL, gfp_mask, order, NULL, nodemask); | 573 | dump_header(NULL, gfp_mask, order, NULL, nodemask); |
538 | read_unlock(&tasklist_lock); | ||
539 | panic("Out of memory: %s panic_on_oom is enabled\n", | 574 | panic("Out of memory: %s panic_on_oom is enabled\n", |
540 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); | 575 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); |
541 | } | 576 | } |
542 | 577 | ||
543 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
544 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | ||
545 | int order) | ||
546 | { | ||
547 | unsigned long limit; | ||
548 | unsigned int points = 0; | ||
549 | struct task_struct *p; | ||
550 | |||
551 | /* | ||
552 | * If current has a pending SIGKILL, then automatically select it. The | ||
553 | * goal is to allow it to allocate so that it may quickly exit and free | ||
554 | * its memory. | ||
555 | */ | ||
556 | if (fatal_signal_pending(current)) { | ||
557 | set_thread_flag(TIF_MEMDIE); | ||
558 | return; | ||
559 | } | ||
560 | |||
561 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | ||
562 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; | ||
563 | read_lock(&tasklist_lock); | ||
564 | p = select_bad_process(&points, limit, memcg, NULL, false); | ||
565 | if (p && PTR_ERR(p) != -1UL) | ||
566 | oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL, | ||
567 | "Memory cgroup out of memory"); | ||
568 | read_unlock(&tasklist_lock); | ||
569 | } | ||
570 | #endif | ||
571 | |||
572 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); | 578 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); |
573 | 579 | ||
574 | int register_oom_notifier(struct notifier_block *nb) | 580 | int register_oom_notifier(struct notifier_block *nb) |
@@ -690,7 +696,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
690 | struct task_struct *p; | 696 | struct task_struct *p; |
691 | unsigned long totalpages; | 697 | unsigned long totalpages; |
692 | unsigned long freed = 0; | 698 | unsigned long freed = 0; |
693 | unsigned int points; | 699 | unsigned int uninitialized_var(points); |
694 | enum oom_constraint constraint = CONSTRAINT_NONE; | 700 | enum oom_constraint constraint = CONSTRAINT_NONE; |
695 | int killed = 0; | 701 | int killed = 0; |
696 | 702 | ||
@@ -718,22 +724,20 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
718 | mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; | 724 | mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; |
719 | check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); | 725 | check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); |
720 | 726 | ||
721 | read_lock(&tasklist_lock); | 727 | if (sysctl_oom_kill_allocating_task && current->mm && |
722 | if (sysctl_oom_kill_allocating_task && | ||
723 | !oom_unkillable_task(current, NULL, nodemask) && | 728 | !oom_unkillable_task(current, NULL, nodemask) && |
724 | current->mm) { | 729 | current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { |
730 | get_task_struct(current); | ||
725 | oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, | 731 | oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, |
726 | nodemask, | 732 | nodemask, |
727 | "Out of memory (oom_kill_allocating_task)"); | 733 | "Out of memory (oom_kill_allocating_task)"); |
728 | goto out; | 734 | goto out; |
729 | } | 735 | } |
730 | 736 | ||
731 | p = select_bad_process(&points, totalpages, NULL, mpol_mask, | 737 | p = select_bad_process(&points, totalpages, mpol_mask, force_kill); |
732 | force_kill); | ||
733 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 738 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
734 | if (!p) { | 739 | if (!p) { |
735 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); | 740 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); |
736 | read_unlock(&tasklist_lock); | ||
737 | panic("Out of memory and no killable processes...\n"); | 741 | panic("Out of memory and no killable processes...\n"); |
738 | } | 742 | } |
739 | if (PTR_ERR(p) != -1UL) { | 743 | if (PTR_ERR(p) != -1UL) { |
@@ -742,14 +746,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
742 | killed = 1; | 746 | killed = 1; |
743 | } | 747 | } |
744 | out: | 748 | out: |
745 | read_unlock(&tasklist_lock); | ||
746 | |||
747 | /* | 749 | /* |
748 | * Give "p" a good chance of killing itself before we | 750 | * Give the killed threads a good chance of exiting before trying to |
749 | * retry to allocate memory unless "p" is current | 751 | * allocate memory again. |
750 | */ | 752 | */ |
751 | if (killed && !test_thread_flag(TIF_MEMDIE)) | 753 | if (killed) |
752 | schedule_timeout_uninterruptible(1); | 754 | schedule_timeout_killable(1); |
753 | } | 755 | } |
754 | 756 | ||
755 | /* | 757 | /* |
@@ -764,6 +766,5 @@ void pagefault_out_of_memory(void) | |||
764 | out_of_memory(NULL, 0, 0, NULL, false); | 766 | out_of_memory(NULL, 0, 0, NULL, false); |
765 | clear_system_oom(); | 767 | clear_system_oom(); |
766 | } | 768 | } |
767 | if (!test_thread_flag(TIF_MEMDIE)) | 769 | schedule_timeout_killable(1); |
768 | schedule_timeout_uninterruptible(1); | ||
769 | } | 770 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4a4f9219683f..889532b8e6c1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -51,7 +51,6 @@ | |||
51 | #include <linux/page_cgroup.h> | 51 | #include <linux/page_cgroup.h> |
52 | #include <linux/debugobjects.h> | 52 | #include <linux/debugobjects.h> |
53 | #include <linux/kmemleak.h> | 53 | #include <linux/kmemleak.h> |
54 | #include <linux/memory.h> | ||
55 | #include <linux/compaction.h> | 54 | #include <linux/compaction.h> |
56 | #include <trace/events/kmem.h> | 55 | #include <trace/events/kmem.h> |
57 | #include <linux/ftrace_event.h> | 56 | #include <linux/ftrace_event.h> |
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes); | |||
219 | 218 | ||
220 | int page_group_by_mobility_disabled __read_mostly; | 219 | int page_group_by_mobility_disabled __read_mostly; |
221 | 220 | ||
222 | static void set_pageblock_migratetype(struct page *page, int migratetype) | 221 | /* |
222 | * NOTE: | ||
223 | * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. | ||
224 | * Instead, use {un}set_pageblock_isolate. | ||
225 | */ | ||
226 | void set_pageblock_migratetype(struct page *page, int migratetype) | ||
223 | { | 227 | { |
224 | 228 | ||
225 | if (unlikely(page_group_by_mobility_disabled)) | 229 | if (unlikely(page_group_by_mobility_disabled)) |
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone, | |||
954 | return pages_moved; | 958 | return pages_moved; |
955 | } | 959 | } |
956 | 960 | ||
957 | static int move_freepages_block(struct zone *zone, struct page *page, | 961 | int move_freepages_block(struct zone *zone, struct page *page, |
958 | int migratetype) | 962 | int migratetype) |
959 | { | 963 | { |
960 | unsigned long start_pfn, end_pfn; | 964 | unsigned long start_pfn, end_pfn; |
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
1158 | to_drain = pcp->batch; | 1162 | to_drain = pcp->batch; |
1159 | else | 1163 | else |
1160 | to_drain = pcp->count; | 1164 | to_drain = pcp->count; |
1161 | free_pcppages_bulk(zone, to_drain, pcp); | 1165 | if (to_drain > 0) { |
1162 | pcp->count -= to_drain; | 1166 | free_pcppages_bulk(zone, to_drain, pcp); |
1167 | pcp->count -= to_drain; | ||
1168 | } | ||
1163 | local_irq_restore(flags); | 1169 | local_irq_restore(flags); |
1164 | } | 1170 | } |
1165 | #endif | 1171 | #endif |
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str) | |||
1529 | } | 1535 | } |
1530 | __setup("fail_page_alloc=", setup_fail_page_alloc); | 1536 | __setup("fail_page_alloc=", setup_fail_page_alloc); |
1531 | 1537 | ||
1532 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1538 | static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1533 | { | 1539 | { |
1534 | if (order < fail_page_alloc.min_order) | 1540 | if (order < fail_page_alloc.min_order) |
1535 | return 0; | 1541 | return false; |
1536 | if (gfp_mask & __GFP_NOFAIL) | 1542 | if (gfp_mask & __GFP_NOFAIL) |
1537 | return 0; | 1543 | return false; |
1538 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) | 1544 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) |
1539 | return 0; | 1545 | return false; |
1540 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) | 1546 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) |
1541 | return 0; | 1547 | return false; |
1542 | 1548 | ||
1543 | return should_fail(&fail_page_alloc.attr, 1 << order); | 1549 | return should_fail(&fail_page_alloc.attr, 1 << order); |
1544 | } | 1550 | } |
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs); | |||
1578 | 1584 | ||
1579 | #else /* CONFIG_FAIL_PAGE_ALLOC */ | 1585 | #else /* CONFIG_FAIL_PAGE_ALLOC */ |
1580 | 1586 | ||
1581 | static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1587 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1582 | { | 1588 | { |
1583 | return 0; | 1589 | return false; |
1584 | } | 1590 | } |
1585 | 1591 | ||
1586 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 1592 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1594 | { | 1600 | { |
1595 | /* free_pages may go negative - that's OK */ | 1601 | /* free_pages may go negative - that's OK */ |
1596 | long min = mark; | 1602 | long min = mark; |
1603 | long lowmem_reserve = z->lowmem_reserve[classzone_idx]; | ||
1597 | int o; | 1604 | int o; |
1598 | 1605 | ||
1599 | free_pages -= (1 << order) - 1; | 1606 | free_pages -= (1 << order) - 1; |
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1602 | if (alloc_flags & ALLOC_HARDER) | 1609 | if (alloc_flags & ALLOC_HARDER) |
1603 | min -= min / 4; | 1610 | min -= min / 4; |
1604 | 1611 | ||
1605 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 1612 | if (free_pages <= min + lowmem_reserve) |
1606 | return false; | 1613 | return false; |
1607 | for (o = 0; o < order; o++) { | 1614 | for (o = 0; o < order; o++) { |
1608 | /* At the next order, this order's pages become unavailable */ | 1615 | /* At the next order, this order's pages become unavailable */ |
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1617 | return true; | 1624 | return true; |
1618 | } | 1625 | } |
1619 | 1626 | ||
1627 | #ifdef CONFIG_MEMORY_ISOLATION | ||
1628 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | ||
1629 | { | ||
1630 | if (unlikely(zone->nr_pageblock_isolate)) | ||
1631 | return zone->nr_pageblock_isolate * pageblock_nr_pages; | ||
1632 | return 0; | ||
1633 | } | ||
1634 | #else | ||
1635 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | ||
1636 | { | ||
1637 | return 0; | ||
1638 | } | ||
1639 | #endif | ||
1640 | |||
1620 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1641 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1621 | int classzone_idx, int alloc_flags) | 1642 | int classzone_idx, int alloc_flags) |
1622 | { | 1643 | { |
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | |||
1632 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) | 1653 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) |
1633 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); | 1654 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); |
1634 | 1655 | ||
1656 | /* | ||
1657 | * If the zone has MIGRATE_ISOLATE type free pages, we should consider | ||
1658 | * it. nr_zone_isolate_freepages is never accurate so kswapd might not | ||
1659 | * sleep although it could do so. But this is more desirable for memory | ||
1660 | * hotplug than sleeping which can cause a livelock in the direct | ||
1661 | * reclaim path. | ||
1662 | */ | ||
1663 | free_pages -= nr_zone_isolate_freepages(z); | ||
1635 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | 1664 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
1636 | free_pages); | 1665 | free_pages); |
1637 | } | 1666 | } |
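zone_watermark_ok_safe() now subtracts an estimate of MIGRATE_ISOLATE free pages before the usual comparison, accepting a possibly stale estimate rather than risking a reclaim livelock during hotplug. A simplified standalone version of the core check (no per-order accounting, all parameter names invented):

#include <stdbool.h>
#include <stdio.h>

/* Simplified watermark test: free pages minus isolated pages must stay
 * above the watermark plus the lowmem reserve for the requesting zone. */
static bool watermark_ok(long free_pages, long isolated_pages,
                         long watermark, long lowmem_reserve)
{
    free_pages -= isolated_pages;   /* isolated blocks can't satisfy allocs */
    return free_pages > watermark + lowmem_reserve;
}

int main(void)
{
    /* Plenty of "free" memory, but most of it is isolated for hotplug. */
    printf("%d\n", watermark_ok(10000, 9000, 1500, 128));  /* 0: not ok */
    printf("%d\n", watermark_ok(10000,    0, 1500, 128));  /* 1: ok */
    return 0;
}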
@@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2087 | 2116 | ||
2088 | page = get_page_from_freelist(gfp_mask, nodemask, | 2117 | page = get_page_from_freelist(gfp_mask, nodemask, |
2089 | order, zonelist, high_zoneidx, | 2118 | order, zonelist, high_zoneidx, |
2090 | alloc_flags, preferred_zone, | 2119 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2091 | migratetype); | 2120 | preferred_zone, migratetype); |
2092 | if (page) { | 2121 | if (page) { |
2093 | preferred_zone->compact_considered = 0; | 2122 | preferred_zone->compact_considered = 0; |
2094 | preferred_zone->compact_defer_shift = 0; | 2123 | preferred_zone->compact_defer_shift = 0; |
@@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
2180 | retry: | 2209 | retry: |
2181 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2210 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2182 | zonelist, high_zoneidx, | 2211 | zonelist, high_zoneidx, |
2183 | alloc_flags, preferred_zone, | 2212 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2184 | migratetype); | 2213 | preferred_zone, migratetype); |
2185 | 2214 | ||
2186 | /* | 2215 | /* |
2187 | * If an allocation failed after direct reclaim, it could be because | 2216 | * If an allocation failed after direct reclaim, it could be because |
@@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
2265 | alloc_flags |= ALLOC_HARDER; | 2294 | alloc_flags |= ALLOC_HARDER; |
2266 | 2295 | ||
2267 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | 2296 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { |
2268 | if (!in_interrupt() && | 2297 | if (gfp_mask & __GFP_MEMALLOC) |
2269 | ((current->flags & PF_MEMALLOC) || | 2298 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2270 | unlikely(test_thread_flag(TIF_MEMDIE)))) | 2299 | else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) |
2300 | alloc_flags |= ALLOC_NO_WATERMARKS; | ||
2301 | else if (!in_interrupt() && | ||
2302 | ((current->flags & PF_MEMALLOC) || | ||
2303 | unlikely(test_thread_flag(TIF_MEMDIE)))) | ||
2271 | alloc_flags |= ALLOC_NO_WATERMARKS; | 2304 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2272 | } | 2305 | } |
2273 | 2306 | ||
2274 | return alloc_flags; | 2307 | return alloc_flags; |
2275 | } | 2308 | } |
2276 | 2309 | ||
2310 | bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | ||
2311 | { | ||
2312 | return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); | ||
2313 | } | ||
2314 | |||
2277 | static inline struct page * | 2315 | static inline struct page * |
2278 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2316 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
2279 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2317 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
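The new gfp_to_alloc_flags() ladder grants ALLOC_NO_WATERMARKS in three cases, tried in order: the caller passed __GFP_MEMALLOC, the allocation runs in softirq context on behalf of a PF_MEMALLOC task, or the task itself is in reclaim or has been OOM-killed outside interrupt context; __GFP_NOMEMALLOC vetoes all of them. A plain decision-function sketch with boolean stand-ins for the real predicates:

#include <stdbool.h>
#include <stdio.h>

struct alloc_ctx {
    bool gfp_nomemalloc;    /* caller explicitly refused the reserves */
    bool gfp_memalloc;      /* caller explicitly asked for the reserves */
    bool in_softirq;        /* serving a softirq */
    bool in_interrupt;      /* any interrupt context */
    bool task_memalloc;     /* current task is in the reclaim path */
    bool task_memdie;       /* current task was OOM-killed */
};

/* Returns true when the allocation may dip below the watermarks. */
static bool no_watermarks_allowed(const struct alloc_ctx *c)
{
    if (c->gfp_nomemalloc)
        return false;                       /* explicit veto wins */
    if (c->gfp_memalloc)
        return true;                        /* explicit request */
    if (c->in_softirq && c->task_memalloc)
        return true;                        /* softirq on behalf of reclaim */
    if (!c->in_interrupt && (c->task_memalloc || c->task_memdie))
        return true;                        /* reclaim or dying task */
    return false;
}

int main(void)
{
    struct alloc_ctx softirq_reclaim = { false, false, true, true, true, false };
    struct alloc_ctx plain = { 0 };

    printf("%d %d\n", no_watermarks_allowed(&softirq_reclaim),
           no_watermarks_allowed(&plain));  /* prints: 1 0 */
    return 0;
}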
@@ -2340,11 +2378,27 @@ rebalance: | |||
2340 | 2378 | ||
2341 | /* Allocate without watermarks if the context allows */ | 2379 | /* Allocate without watermarks if the context allows */ |
2342 | if (alloc_flags & ALLOC_NO_WATERMARKS) { | 2380 | if (alloc_flags & ALLOC_NO_WATERMARKS) { |
2381 | /* | ||
2382 | * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds | ||
2383 | * the allocation is high priority and these type of | ||
2384 | * allocations are system rather than user orientated | ||
2385 | */ | ||
2386 | zonelist = node_zonelist(numa_node_id(), gfp_mask); | ||
2387 | |||
2343 | page = __alloc_pages_high_priority(gfp_mask, order, | 2388 | page = __alloc_pages_high_priority(gfp_mask, order, |
2344 | zonelist, high_zoneidx, nodemask, | 2389 | zonelist, high_zoneidx, nodemask, |
2345 | preferred_zone, migratetype); | 2390 | preferred_zone, migratetype); |
2346 | if (page) | 2391 | if (page) { |
2392 | /* | ||
2393 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was | ||
2394 | * necessary to allocate the page. The expectation is | ||
2395 | * that the caller is taking steps that will free more | ||
2396 | * memory. The caller should avoid the page being used | ||
2397 | * for !PFMEMALLOC purposes. | ||
2398 | */ | ||
2399 | page->pfmemalloc = true; | ||
2347 | goto got_pg; | 2400 | goto got_pg; |
2401 | } | ||
2348 | } | 2402 | } |
2349 | 2403 | ||
2350 | /* Atomic allocations - we can't balance anything */ | 2404 | /* Atomic allocations - we can't balance anything */ |
@@ -2463,8 +2517,8 @@ nopage: | |||
2463 | got_pg: | 2517 | got_pg: |
2464 | if (kmemcheck_enabled) | 2518 | if (kmemcheck_enabled) |
2465 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | 2519 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); |
2466 | return page; | ||
2467 | 2520 | ||
2521 | return page; | ||
2468 | } | 2522 | } |
2469 | 2523 | ||
2470 | /* | 2524 | /* |
@@ -2515,6 +2569,8 @@ retry_cpuset: | |||
2515 | page = __alloc_pages_slowpath(gfp_mask, order, | 2569 | page = __alloc_pages_slowpath(gfp_mask, order, |
2516 | zonelist, high_zoneidx, nodemask, | 2570 | zonelist, high_zoneidx, nodemask, |
2517 | preferred_zone, migratetype); | 2571 | preferred_zone, migratetype); |
2572 | else | ||
2573 | page->pfmemalloc = false; | ||
2518 | 2574 | ||
2519 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2575 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
2520 | 2576 | ||
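Pages that could only be allocated by ignoring the watermarks now get page->pfmemalloc set, and the fast path clears it, so later consumers can tell emergency-reserve pages apart and avoid reusing them for anything but work that frees memory. A small sketch of tagging an allocation with that provenance; the type and helper below are made up:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct page_like {
    bool pfmemalloc;    /* true: came from the emergency reserves */
    char data[64];
};

/* Allocate and record whether the reserves had to be used, so callers can
 * refuse to reuse such memory for non-critical purposes. */
static struct page_like *alloc_page_like(bool reserves_used)
{
    struct page_like *p = malloc(sizeof(*p));

    if (p)
        p->pfmemalloc = reserves_used;
    return p;
}

int main(void)
{
    struct page_like *normal = alloc_page_like(false);
    struct page_like *emergency = alloc_page_like(true);

    if (normal && emergency)
        printf("normal=%d emergency=%d\n",
               normal->pfmemalloc, emergency->pfmemalloc);
    free(normal);
    free(emergency);
    return 0;
}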
@@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
3030 | user_zonelist_order = oldval; | 3086 | user_zonelist_order = oldval; |
3031 | } else if (oldval != user_zonelist_order) { | 3087 | } else if (oldval != user_zonelist_order) { |
3032 | mutex_lock(&zonelists_mutex); | 3088 | mutex_lock(&zonelists_mutex); |
3033 | build_all_zonelists(NULL); | 3089 | build_all_zonelists(NULL, NULL); |
3034 | mutex_unlock(&zonelists_mutex); | 3090 | mutex_unlock(&zonelists_mutex); |
3035 | } | 3091 | } |
3036 | } | 3092 | } |
@@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone); | |||
3409 | DEFINE_MUTEX(zonelists_mutex); | 3465 | DEFINE_MUTEX(zonelists_mutex); |
3410 | 3466 | ||
3411 | /* return values int ....just for stop_machine() */ | 3467 | /* return values int ....just for stop_machine() */ |
3412 | static __init_refok int __build_all_zonelists(void *data) | 3468 | static int __build_all_zonelists(void *data) |
3413 | { | 3469 | { |
3414 | int nid; | 3470 | int nid; |
3415 | int cpu; | 3471 | int cpu; |
3472 | pg_data_t *self = data; | ||
3416 | 3473 | ||
3417 | #ifdef CONFIG_NUMA | 3474 | #ifdef CONFIG_NUMA |
3418 | memset(node_load, 0, sizeof(node_load)); | 3475 | memset(node_load, 0, sizeof(node_load)); |
3419 | #endif | 3476 | #endif |
3477 | |||
3478 | if (self && !node_online(self->node_id)) { | ||
3479 | build_zonelists(self); | ||
3480 | build_zonelist_cache(self); | ||
3481 | } | ||
3482 | |||
3420 | for_each_online_node(nid) { | 3483 | for_each_online_node(nid) { |
3421 | pg_data_t *pgdat = NODE_DATA(nid); | 3484 | pg_data_t *pgdat = NODE_DATA(nid); |
3422 | 3485 | ||
@@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data) | |||
3461 | * Called with zonelists_mutex held always | 3524 | * Called with zonelists_mutex held always |
3462 | * unless system_state == SYSTEM_BOOTING. | 3525 | * unless system_state == SYSTEM_BOOTING. |
3463 | */ | 3526 | */ |
3464 | void __ref build_all_zonelists(void *data) | 3527 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) |
3465 | { | 3528 | { |
3466 | set_zonelist_order(); | 3529 | set_zonelist_order(); |
3467 | 3530 | ||
@@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(void *data) | |||
3473 | /* we have to stop all cpus to guarantee there is no user | 3536 | /* we have to stop all cpus to guarantee there is no user |
3474 | of zonelist */ | 3537 | of zonelist */ |
3475 | #ifdef CONFIG_MEMORY_HOTPLUG | 3538 | #ifdef CONFIG_MEMORY_HOTPLUG |
3476 | if (data) | 3539 | if (zone) |
3477 | setup_zone_pageset((struct zone *)data); | 3540 | setup_zone_pageset(zone); |
3478 | #endif | 3541 | #endif |
3479 | stop_machine(__build_all_zonelists, NULL, NULL); | 3542 | stop_machine(__build_all_zonelists, pgdat, NULL); |
3480 | /* cpuset refresh routine should be here */ | 3543 | /* cpuset refresh routine should be here */ |
3481 | } | 3544 | } |
3482 | vm_total_pages = nr_free_pagecache_pages(); | 3545 | vm_total_pages = nr_free_pagecache_pages(); |
@@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) | |||
3746 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) | 3809 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) |
3747 | #endif | 3810 | #endif |
3748 | 3811 | ||
3749 | static int zone_batchsize(struct zone *zone) | 3812 | static int __meminit zone_batchsize(struct zone *zone) |
3750 | { | 3813 | { |
3751 | #ifdef CONFIG_MMU | 3814 | #ifdef CONFIG_MMU |
3752 | int batch; | 3815 | int batch; |
@@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
3828 | pcp->batch = PAGE_SHIFT * 8; | 3891 | pcp->batch = PAGE_SHIFT * 8; |
3829 | } | 3892 | } |
3830 | 3893 | ||
3831 | static void setup_zone_pageset(struct zone *zone) | 3894 | static void __meminit setup_zone_pageset(struct zone *zone) |
3832 | { | 3895 | { |
3833 | int cpu; | 3896 | int cpu; |
3834 | 3897 | ||
@@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
3901 | return 0; | 3964 | return 0; |
3902 | } | 3965 | } |
3903 | 3966 | ||
3904 | static int __zone_pcp_update(void *data) | ||
3905 | { | ||
3906 | struct zone *zone = data; | ||
3907 | int cpu; | ||
3908 | unsigned long batch = zone_batchsize(zone), flags; | ||
3909 | |||
3910 | for_each_possible_cpu(cpu) { | ||
3911 | struct per_cpu_pageset *pset; | ||
3912 | struct per_cpu_pages *pcp; | ||
3913 | |||
3914 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
3915 | pcp = &pset->pcp; | ||
3916 | |||
3917 | local_irq_save(flags); | ||
3918 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
3919 | setup_pageset(pset, batch); | ||
3920 | local_irq_restore(flags); | ||
3921 | } | ||
3922 | return 0; | ||
3923 | } | ||
3924 | |||
3925 | void zone_pcp_update(struct zone *zone) | ||
3926 | { | ||
3927 | stop_machine(__zone_pcp_update, zone, NULL); | ||
3928 | } | ||
3929 | |||
3930 | static __meminit void zone_pcp_init(struct zone *zone) | 3967 | static __meminit void zone_pcp_init(struct zone *zone) |
3931 | { | 3968 | { |
3932 | /* | 3969 | /* |
@@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone) | |||
3942 | zone_batchsize(zone)); | 3979 | zone_batchsize(zone)); |
3943 | } | 3980 | } |
3944 | 3981 | ||
3945 | __meminit int init_currently_empty_zone(struct zone *zone, | 3982 | int __meminit init_currently_empty_zone(struct zone *zone, |
3946 | unsigned long zone_start_pfn, | 3983 | unsigned long zone_start_pfn, |
3947 | unsigned long size, | 3984 | unsigned long size, |
3948 | enum memmap_context context) | 3985 | enum memmap_context context) |
@@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, | |||
4301 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 4338 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
4302 | 4339 | ||
4303 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | 4340 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ |
4304 | static inline void __init set_pageblock_order(void) | 4341 | void __init set_pageblock_order(void) |
4305 | { | 4342 | { |
4306 | unsigned int order; | 4343 | unsigned int order; |
4307 | 4344 | ||
@@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void) | |||
4329 | * include/linux/pageblock-flags.h for the values of pageblock_order based on | 4366 | * include/linux/pageblock-flags.h for the values of pageblock_order based on |
4330 | * the kernel config | 4367 | * the kernel config |
4331 | */ | 4368 | */ |
4332 | static inline void set_pageblock_order(void) | 4369 | void __init set_pageblock_order(void) |
4333 | { | 4370 | { |
4334 | } | 4371 | } |
4335 | 4372 | ||
@@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void) | |||
4340 | * - mark all pages reserved | 4377 | * - mark all pages reserved |
4341 | * - mark all memory queues empty | 4378 | * - mark all memory queues empty |
4342 | * - clear the memory bitmaps | 4379 | * - clear the memory bitmaps |
4380 | * | ||
4381 | * NOTE: pgdat should get zeroed by caller. | ||
4343 | */ | 4382 | */ |
4344 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, | 4383 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, |
4345 | unsigned long *zones_size, unsigned long *zholes_size) | 4384 | unsigned long *zones_size, unsigned long *zholes_size) |
@@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4350 | int ret; | 4389 | int ret; |
4351 | 4390 | ||
4352 | pgdat_resize_init(pgdat); | 4391 | pgdat_resize_init(pgdat); |
4353 | pgdat->nr_zones = 0; | ||
4354 | init_waitqueue_head(&pgdat->kswapd_wait); | 4392 | init_waitqueue_head(&pgdat->kswapd_wait); |
4355 | pgdat->kswapd_max_order = 0; | 4393 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4356 | pgdat_page_cgroup_init(pgdat); | 4394 | pgdat_page_cgroup_init(pgdat); |
4357 | 4395 | ||
4358 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4396 | for (j = 0; j < MAX_NR_ZONES; j++) { |
@@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4394 | 4432 | ||
4395 | zone->spanned_pages = size; | 4433 | zone->spanned_pages = size; |
4396 | zone->present_pages = realsize; | 4434 | zone->present_pages = realsize; |
4435 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
4436 | zone->compact_cached_free_pfn = zone->zone_start_pfn + | ||
4437 | zone->spanned_pages; | ||
4438 | zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); | ||
4439 | #endif | ||
4397 | #ifdef CONFIG_NUMA | 4440 | #ifdef CONFIG_NUMA |
4398 | zone->node = nid; | 4441 | zone->node = nid; |
4399 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4442 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
@@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4408 | 4451 | ||
4409 | zone_pcp_init(zone); | 4452 | zone_pcp_init(zone); |
4410 | lruvec_init(&zone->lruvec, zone); | 4453 | lruvec_init(&zone->lruvec, zone); |
4411 | zap_zone_vm_stats(zone); | ||
4412 | zone->flags = 0; | ||
4413 | if (!size) | 4454 | if (!size) |
4414 | continue; | 4455 | continue; |
4415 | 4456 | ||
@@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4469 | { | 4510 | { |
4470 | pg_data_t *pgdat = NODE_DATA(nid); | 4511 | pg_data_t *pgdat = NODE_DATA(nid); |
4471 | 4512 | ||
4513 | /* pg_data_t should be reset to zero when it's allocated */ | ||
4514 | WARN_ON(pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx); | ||
4515 | |||
4472 | pgdat->node_id = nid; | 4516 | pgdat->node_id = nid; |
4473 | pgdat->node_start_pfn = node_start_pfn; | 4517 | pgdat->node_start_pfn = node_start_pfn; |
4474 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 4518 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
@@ -4750,7 +4794,7 @@ out: | |||
4750 | } | 4794 | } |
4751 | 4795 | ||
4752 | /* Any regular memory on that node ? */ | 4796 | /* Any regular memory on that node ? */ |
4753 | static void check_for_regular_memory(pg_data_t *pgdat) | 4797 | static void __init check_for_regular_memory(pg_data_t *pgdat) |
4754 | { | 4798 | { |
4755 | #ifdef CONFIG_HIGHMEM | 4799 | #ifdef CONFIG_HIGHMEM |
4756 | enum zone_type zone_type; | 4800 | enum zone_type zone_type; |
@@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5468 | } | 5512 | } |
5469 | 5513 | ||
5470 | /* | 5514 | /* |
5471 | * This is designed as sub function...plz see page_isolation.c also. | 5515 | * This function checks whether pageblock includes unmovable pages or not. |
5472 | * set/clear page block's type to be ISOLATE. | 5516 | * If @count is not zero, it is okay to include less @count unmovable pages |
5473 | * page allocater never alloc memory from ISOLATE block. | 5517 | * |
5518 | PageLRU check without isolation or lru_lock could race so that | ||
5519 | * MIGRATE_MOVABLE block might include unmovable pages. It means you can't | ||
5520 | * expect this function should be exact. | ||
5474 | */ | 5521 | */ |
5475 | 5522 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count) | |
5476 | static int | ||
5477 | __count_immobile_pages(struct zone *zone, struct page *page, int count) | ||
5478 | { | 5523 | { |
5479 | unsigned long pfn, iter, found; | 5524 | unsigned long pfn, iter, found; |
5480 | int mt; | 5525 | int mt; |
5481 | 5526 | ||
5482 | /* | 5527 | /* |
5483 | * For avoiding noise data, lru_add_drain_all() should be called | 5528 | * For avoiding noise data, lru_add_drain_all() should be called |
5484 | * If ZONE_MOVABLE, the zone never contains immobile pages | 5529 | * If ZONE_MOVABLE, the zone never contains unmovable pages |
5485 | */ | 5530 | */ |
5486 | if (zone_idx(zone) == ZONE_MOVABLE) | 5531 | if (zone_idx(zone) == ZONE_MOVABLE) |
5487 | return true; | 5532 | return false; |
5488 | mt = get_pageblock_migratetype(page); | 5533 | mt = get_pageblock_migratetype(page); |
5489 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) | 5534 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) |
5490 | return true; | 5535 | return false; |
5491 | 5536 | ||
5492 | pfn = page_to_pfn(page); | 5537 | pfn = page_to_pfn(page); |
5493 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { | 5538 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { |
@@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) | |||
5497 | continue; | 5542 | continue; |
5498 | 5543 | ||
5499 | page = pfn_to_page(check); | 5544 | page = pfn_to_page(check); |
5500 | if (!page_count(page)) { | 5545 | /* |
5546 | * We can't use page_count() without pinning the page | ||
5547 | * because another CPU can free the compound page. | ||
5548 | * This check already skips compound tails of THP | ||
5549 | * because their page->_count is zero at all times. | ||
5550 | */ | ||
5551 | if (!atomic_read(&page->_count)) { | ||
5501 | if (PageBuddy(page)) | 5552 | if (PageBuddy(page)) |
5502 | iter += (1 << page_order(page)) - 1; | 5553 | iter += (1 << page_order(page)) - 1; |
5503 | continue; | 5554 | continue; |
5504 | } | 5555 | } |
5556 | |||
5505 | if (!PageLRU(page)) | 5557 | if (!PageLRU(page)) |
5506 | found++; | 5558 | found++; |
5507 | /* | 5559 | /* |
@@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) | |||
5518 | * page at boot. | 5570 | * page at boot. |
5519 | */ | 5571 | */ |
5520 | if (found > count) | 5572 | if (found > count) |
5521 | return false; | 5573 | return true; |
5522 | } | 5574 | } |
5523 | return true; | 5575 | return false; |
5524 | } | 5576 | } |
5525 | 5577 | ||
5526 | bool is_pageblock_removable_nolock(struct page *page) | 5578 | bool is_pageblock_removable_nolock(struct page *page) |
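A caller-side sketch of the renamed helper. The return sense is inverted relative to the old __count_immobile_pages(), so checks now read positively; this mirrors how is_pageblock_removable_nolock() and set_migratetype_isolate() consume it. The wrapper name below is hypothetical.

/* hypothetical wrapper, sketch only */
static bool pageblock_is_isolatable(struct zone *zone, struct page *page)
{
	/* pass 0: tolerate no unaccounted unmovable pages in this sketch */
	return !has_unmovable_pages(zone, page, 0);
}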
@@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page) | |||
5544 | zone->zone_start_pfn + zone->spanned_pages <= pfn) | 5596 | zone->zone_start_pfn + zone->spanned_pages <= pfn) |
5545 | return false; | 5597 | return false; |
5546 | 5598 | ||
5547 | return __count_immobile_pages(zone, page, 0); | 5599 | return !has_unmovable_pages(zone, page, 0); |
5548 | } | ||
5549 | |||
5550 | int set_migratetype_isolate(struct page *page) | ||
5551 | { | ||
5552 | struct zone *zone; | ||
5553 | unsigned long flags, pfn; | ||
5554 | struct memory_isolate_notify arg; | ||
5555 | int notifier_ret; | ||
5556 | int ret = -EBUSY; | ||
5557 | |||
5558 | zone = page_zone(page); | ||
5559 | |||
5560 | spin_lock_irqsave(&zone->lock, flags); | ||
5561 | |||
5562 | pfn = page_to_pfn(page); | ||
5563 | arg.start_pfn = pfn; | ||
5564 | arg.nr_pages = pageblock_nr_pages; | ||
5565 | arg.pages_found = 0; | ||
5566 | |||
5567 | /* | ||
5568 | * It may be possible to isolate a pageblock even if the | ||
5569 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | ||
5570 | * notifier chain is used by balloon drivers to return the | ||
5571 | * number of pages in a range that are held by the balloon | ||
5572 | * driver to shrink memory. If all the pages are accounted for | ||
5573 | * by balloons, are free, or on the LRU, isolation can continue. | ||
5574 | * Later, for example, when memory hotplug notifier runs, these | ||
5575 | * pages reported as "can be isolated" should be isolated(freed) | ||
5576 | * by the balloon driver through the memory notifier chain. | ||
5577 | */ | ||
5578 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); | ||
5579 | notifier_ret = notifier_to_errno(notifier_ret); | ||
5580 | if (notifier_ret) | ||
5581 | goto out; | ||
5582 | /* | ||
5583 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | ||
5584 | * We just check MOVABLE pages. | ||
5585 | */ | ||
5586 | if (__count_immobile_pages(zone, page, arg.pages_found)) | ||
5587 | ret = 0; | ||
5588 | |||
5589 | /* | ||
5590 | * immobile means "not-on-lru" paes. If immobile is larger than | ||
5591 | * removable-by-driver pages reported by notifier, we'll fail. | ||
5592 | */ | ||
5593 | |||
5594 | out: | ||
5595 | if (!ret) { | ||
5596 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
5597 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
5598 | } | ||
5599 | |||
5600 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5601 | if (!ret) | ||
5602 | drain_all_pages(); | ||
5603 | return ret; | ||
5604 | } | ||
5605 | |||
5606 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) | ||
5607 | { | ||
5608 | struct zone *zone; | ||
5609 | unsigned long flags; | ||
5610 | zone = page_zone(page); | ||
5611 | spin_lock_irqsave(&zone->lock, flags); | ||
5612 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
5613 | goto out; | ||
5614 | set_pageblock_migratetype(page, migratetype); | ||
5615 | move_freepages_block(zone, page, migratetype); | ||
5616 | out: | ||
5617 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5618 | } | 5600 | } |
5619 | 5601 | ||
5620 | #ifdef CONFIG_CMA | 5602 | #ifdef CONFIG_CMA |
@@ -5869,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) | |||
5869 | } | 5851 | } |
5870 | #endif | 5852 | #endif |
5871 | 5853 | ||
5854 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
5855 | static int __meminit __zone_pcp_update(void *data) | ||
5856 | { | ||
5857 | struct zone *zone = data; | ||
5858 | int cpu; | ||
5859 | unsigned long batch = zone_batchsize(zone), flags; | ||
5860 | |||
5861 | for_each_possible_cpu(cpu) { | ||
5862 | struct per_cpu_pageset *pset; | ||
5863 | struct per_cpu_pages *pcp; | ||
5864 | |||
5865 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
5866 | pcp = &pset->pcp; | ||
5867 | |||
5868 | local_irq_save(flags); | ||
5869 | if (pcp->count > 0) | ||
5870 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
5871 | setup_pageset(pset, batch); | ||
5872 | local_irq_restore(flags); | ||
5873 | } | ||
5874 | return 0; | ||
5875 | } | ||
5876 | |||
5877 | void __meminit zone_pcp_update(struct zone *zone) | ||
5878 | { | ||
5879 | stop_machine(__zone_pcp_update, zone, NULL); | ||
5880 | } | ||
5881 | #endif | ||
5882 | |||
5872 | #ifdef CONFIG_MEMORY_HOTREMOVE | 5883 | #ifdef CONFIG_MEMORY_HOTREMOVE |
5884 | void zone_pcp_reset(struct zone *zone) | ||
5885 | { | ||
5886 | unsigned long flags; | ||
5887 | |||
5888 | /* avoid races with drain_pages() */ | ||
5889 | local_irq_save(flags); | ||
5890 | if (zone->pageset != &boot_pageset) { | ||
5891 | free_percpu(zone->pageset); | ||
5892 | zone->pageset = &boot_pageset; | ||
5893 | } | ||
5894 | local_irq_restore(flags); | ||
5895 | } | ||
5896 | |||
5873 | /* | 5897 | /* |
5874 | * All pages in the range must be isolated before calling this. | 5898 | * All pages in the range must be isolated before calling this. |
5875 | */ | 5899 | */ |
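A sketch, not the real hotplug code, of how the two per-cpu pageset helpers added above pair up over a zone's lifetime; the my_* callers are hypothetical.

/* hypothetical callers, for illustration only */
static void my_zone_grew(struct zone *zone)
{
	/* present_pages changed, so recompute the per-cpu batch sizes */
	zone_pcp_update(zone);
}

static void my_zone_emptied(struct zone *zone)
{
	/* drop the per-cpu pagesets back to boot_pageset before teardown */
	zone_pcp_reset(zone);
}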
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index eb750f851395..5ddad0c6daa6 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -317,7 +317,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | |||
317 | #endif | 317 | #endif |
318 | 318 | ||
319 | 319 | ||
320 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 320 | #ifdef CONFIG_MEMCG_SWAP |
321 | 321 | ||
322 | static DEFINE_MUTEX(swap_cgroup_mutex); | 322 | static DEFINE_MUTEX(swap_cgroup_mutex); |
323 | struct swap_cgroup_ctrl { | 323 | struct swap_cgroup_ctrl { |
diff --git a/mm/page_io.c b/mm/page_io.c index 34f02923744c..78eee32ee486 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
18 | #include <linux/bio.h> | 18 | #include <linux/bio.h> |
19 | #include <linux/swapops.h> | 19 | #include <linux/swapops.h> |
20 | #include <linux/buffer_head.h> | ||
20 | #include <linux/writeback.h> | 21 | #include <linux/writeback.h> |
21 | #include <linux/frontswap.h> | 22 | #include <linux/frontswap.h> |
22 | #include <asm/pgtable.h> | 23 | #include <asm/pgtable.h> |
@@ -86,6 +87,98 @@ void end_swap_bio_read(struct bio *bio, int err) | |||
86 | bio_put(bio); | 87 | bio_put(bio); |
87 | } | 88 | } |
88 | 89 | ||
90 | int generic_swapfile_activate(struct swap_info_struct *sis, | ||
91 | struct file *swap_file, | ||
92 | sector_t *span) | ||
93 | { | ||
94 | struct address_space *mapping = swap_file->f_mapping; | ||
95 | struct inode *inode = mapping->host; | ||
96 | unsigned blocks_per_page; | ||
97 | unsigned long page_no; | ||
98 | unsigned blkbits; | ||
99 | sector_t probe_block; | ||
100 | sector_t last_block; | ||
101 | sector_t lowest_block = -1; | ||
102 | sector_t highest_block = 0; | ||
103 | int nr_extents = 0; | ||
104 | int ret; | ||
105 | |||
106 | blkbits = inode->i_blkbits; | ||
107 | blocks_per_page = PAGE_SIZE >> blkbits; | ||
108 | |||
109 | /* | ||
110 | * Map all the blocks into the extent list. This code doesn't try | ||
111 | * to be very smart. | ||
112 | */ | ||
113 | probe_block = 0; | ||
114 | page_no = 0; | ||
115 | last_block = i_size_read(inode) >> blkbits; | ||
116 | while ((probe_block + blocks_per_page) <= last_block && | ||
117 | page_no < sis->max) { | ||
118 | unsigned block_in_page; | ||
119 | sector_t first_block; | ||
120 | |||
121 | first_block = bmap(inode, probe_block); | ||
122 | if (first_block == 0) | ||
123 | goto bad_bmap; | ||
124 | |||
125 | /* | ||
126 | * It must be PAGE_SIZE aligned on-disk | ||
127 | */ | ||
128 | if (first_block & (blocks_per_page - 1)) { | ||
129 | probe_block++; | ||
130 | goto reprobe; | ||
131 | } | ||
132 | |||
133 | for (block_in_page = 1; block_in_page < blocks_per_page; | ||
134 | block_in_page++) { | ||
135 | sector_t block; | ||
136 | |||
137 | block = bmap(inode, probe_block + block_in_page); | ||
138 | if (block == 0) | ||
139 | goto bad_bmap; | ||
140 | if (block != first_block + block_in_page) { | ||
141 | /* Discontiguity */ | ||
142 | probe_block++; | ||
143 | goto reprobe; | ||
144 | } | ||
145 | } | ||
146 | |||
147 | first_block >>= (PAGE_SHIFT - blkbits); | ||
148 | if (page_no) { /* exclude the header page */ | ||
149 | if (first_block < lowest_block) | ||
150 | lowest_block = first_block; | ||
151 | if (first_block > highest_block) | ||
152 | highest_block = first_block; | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks | ||
157 | */ | ||
158 | ret = add_swap_extent(sis, page_no, 1, first_block); | ||
159 | if (ret < 0) | ||
160 | goto out; | ||
161 | nr_extents += ret; | ||
162 | page_no++; | ||
163 | probe_block += blocks_per_page; | ||
164 | reprobe: | ||
165 | continue; | ||
166 | } | ||
167 | ret = nr_extents; | ||
168 | *span = 1 + highest_block - lowest_block; | ||
169 | if (page_no == 0) | ||
170 | page_no = 1; /* force Empty message */ | ||
171 | sis->max = page_no; | ||
172 | sis->pages = page_no - 1; | ||
173 | sis->highest_bit = page_no - 1; | ||
174 | out: | ||
175 | return ret; | ||
176 | bad_bmap: | ||
177 | printk(KERN_ERR "swapon: swapfile has holes\n"); | ||
178 | ret = -EINVAL; | ||
179 | goto out; | ||
180 | } | ||
181 | |||
89 | /* | 182 | /* |
90 | * We may have stale swap cache pages in memory: notice | 183 | * We may have stale swap cache pages in memory: notice |
91 | * them here and get rid of the unnecessary final write. | 184 | * them here and get rid of the unnecessary final write. |
@@ -94,6 +187,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
94 | { | 187 | { |
95 | struct bio *bio; | 188 | struct bio *bio; |
96 | int ret = 0, rw = WRITE; | 189 | int ret = 0, rw = WRITE; |
190 | struct swap_info_struct *sis = page_swap_info(page); | ||
97 | 191 | ||
98 | if (try_to_free_swap(page)) { | 192 | if (try_to_free_swap(page)) { |
99 | unlock_page(page); | 193 | unlock_page(page); |
@@ -105,6 +199,33 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
105 | end_page_writeback(page); | 199 | end_page_writeback(page); |
106 | goto out; | 200 | goto out; |
107 | } | 201 | } |
202 | |||
203 | if (sis->flags & SWP_FILE) { | ||
204 | struct kiocb kiocb; | ||
205 | struct file *swap_file = sis->swap_file; | ||
206 | struct address_space *mapping = swap_file->f_mapping; | ||
207 | struct iovec iov = { | ||
208 | .iov_base = kmap(page), | ||
209 | .iov_len = PAGE_SIZE, | ||
210 | }; | ||
211 | |||
212 | init_sync_kiocb(&kiocb, swap_file); | ||
213 | kiocb.ki_pos = page_file_offset(page); | ||
214 | kiocb.ki_left = PAGE_SIZE; | ||
215 | kiocb.ki_nbytes = PAGE_SIZE; | ||
216 | |||
217 | unlock_page(page); | ||
218 | ret = mapping->a_ops->direct_IO(KERNEL_WRITE, | ||
219 | &kiocb, &iov, | ||
220 | kiocb.ki_pos, 1); | ||
221 | kunmap(page); | ||
222 | if (ret == PAGE_SIZE) { | ||
223 | count_vm_event(PSWPOUT); | ||
224 | ret = 0; | ||
225 | } | ||
226 | return ret; | ||
227 | } | ||
228 | |||
108 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); | 229 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); |
109 | if (bio == NULL) { | 230 | if (bio == NULL) { |
110 | set_page_dirty(page); | 231 | set_page_dirty(page); |
@@ -126,6 +247,7 @@ int swap_readpage(struct page *page) | |||
126 | { | 247 | { |
127 | struct bio *bio; | 248 | struct bio *bio; |
128 | int ret = 0; | 249 | int ret = 0; |
250 | struct swap_info_struct *sis = page_swap_info(page); | ||
129 | 251 | ||
130 | VM_BUG_ON(!PageLocked(page)); | 252 | VM_BUG_ON(!PageLocked(page)); |
131 | VM_BUG_ON(PageUptodate(page)); | 253 | VM_BUG_ON(PageUptodate(page)); |
@@ -134,6 +256,17 @@ int swap_readpage(struct page *page) | |||
134 | unlock_page(page); | 256 | unlock_page(page); |
135 | goto out; | 257 | goto out; |
136 | } | 258 | } |
259 | |||
260 | if (sis->flags & SWP_FILE) { | ||
261 | struct file *swap_file = sis->swap_file; | ||
262 | struct address_space *mapping = swap_file->f_mapping; | ||
263 | |||
264 | ret = mapping->a_ops->readpage(swap_file, page); | ||
265 | if (!ret) | ||
266 | count_vm_event(PSWPIN); | ||
267 | return ret; | ||
268 | } | ||
269 | |||
137 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); | 270 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); |
138 | if (bio == NULL) { | 271 | if (bio == NULL) { |
139 | unlock_page(page); | 272 | unlock_page(page); |
@@ -145,3 +278,15 @@ int swap_readpage(struct page *page) | |||
145 | out: | 278 | out: |
146 | return ret; | 279 | return ret; |
147 | } | 280 | } |
281 | |||
282 | int swap_set_page_dirty(struct page *page) | ||
283 | { | ||
284 | struct swap_info_struct *sis = page_swap_info(page); | ||
285 | |||
286 | if (sis->flags & SWP_FILE) { | ||
287 | struct address_space *mapping = sis->swap_file->f_mapping; | ||
288 | return mapping->a_ops->set_page_dirty(page); | ||
289 | } else { | ||
290 | return __set_page_dirty_no_writeback(page); | ||
291 | } | ||
292 | } | ||
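A hedged sketch of how a filesystem could opt into the new SWP_FILE path: setup_swap_extents() (changed later in this series, in mm/swapfile.c) calls a_ops->swap_activate() when it exists, after which swap_writepage() and swap_readpage() above go through ->direct_IO and ->readpage instead of bmap-built bios. Everything prefixed myfs_ is hypothetical; only generic_swapfile_activate() and the hook itself come from this patch set.

/* hypothetical filesystem glue, not part of this patch */
static int myfs_swap_activate(struct swap_info_struct *sis,
			      struct file *swap_file, sector_t *span)
{
	/*
	 * Simplest possible policy: fall back to the generic bmap-based
	 * extent walk added above.  A real filesystem would pin its block
	 * mapping here and supply a matching ->swap_deactivate, plus
	 * ->readpage and ->direct_IO for the actual swap I/O.
	 */
	return generic_swapfile_activate(sis, swap_file, span);
}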
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c9f04774f2b8..247d1f175739 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -5,8 +5,101 @@ | |||
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/page-isolation.h> | 6 | #include <linux/page-isolation.h> |
7 | #include <linux/pageblock-flags.h> | 7 | #include <linux/pageblock-flags.h> |
8 | #include <linux/memory.h> | ||
8 | #include "internal.h" | 9 | #include "internal.h" |
9 | 10 | ||
11 | /* called while holding zone->lock */ | ||
12 | static void set_pageblock_isolate(struct page *page) | ||
13 | { | ||
14 | if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE) | ||
15 | return; | ||
16 | |||
17 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
18 | page_zone(page)->nr_pageblock_isolate++; | ||
19 | } | ||
20 | |||
21 | /* called while holding zone->lock */ | ||
22 | static void restore_pageblock_isolate(struct page *page, int migratetype) | ||
23 | { | ||
24 | struct zone *zone = page_zone(page); | ||
25 | if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) | ||
26 | return; | ||
27 | |||
28 | BUG_ON(zone->nr_pageblock_isolate <= 0); | ||
29 | set_pageblock_migratetype(page, migratetype); | ||
30 | zone->nr_pageblock_isolate--; | ||
31 | } | ||
32 | |||
33 | int set_migratetype_isolate(struct page *page) | ||
34 | { | ||
35 | struct zone *zone; | ||
36 | unsigned long flags, pfn; | ||
37 | struct memory_isolate_notify arg; | ||
38 | int notifier_ret; | ||
39 | int ret = -EBUSY; | ||
40 | |||
41 | zone = page_zone(page); | ||
42 | |||
43 | spin_lock_irqsave(&zone->lock, flags); | ||
44 | |||
45 | pfn = page_to_pfn(page); | ||
46 | arg.start_pfn = pfn; | ||
47 | arg.nr_pages = pageblock_nr_pages; | ||
48 | arg.pages_found = 0; | ||
49 | |||
50 | /* | ||
51 | * It may be possible to isolate a pageblock even if the | ||
52 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | ||
53 | * notifier chain is used by balloon drivers to return the | ||
54 | * number of pages in a range that are held by the balloon | ||
55 | * driver to shrink memory. If all the pages are accounted for | ||
56 | * by balloons, are free, or on the LRU, isolation can continue. | ||
57 | * Later, for example, when memory hotplug notifier runs, these | ||
58 | * pages reported as "can be isolated" should be isolated(freed) | ||
59 | * by the balloon driver through the memory notifier chain. | ||
60 | */ | ||
61 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); | ||
62 | notifier_ret = notifier_to_errno(notifier_ret); | ||
63 | if (notifier_ret) | ||
64 | goto out; | ||
65 | /* | ||
66 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | ||
67 | * We just check MOVABLE pages. | ||
68 | */ | ||
69 | if (!has_unmovable_pages(zone, page, arg.pages_found)) | ||
70 | ret = 0; | ||
71 | |||
72 | /* | ||
73 | * immobile means "not-on-lru" pages. If the immobile count is larger than |||
74 | * the removable-by-driver pages reported by the notifier, we'll fail. |||
75 | */ | ||
76 | |||
77 | out: | ||
78 | if (!ret) { | ||
79 | set_pageblock_isolate(page); | ||
80 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
81 | } | ||
82 | |||
83 | spin_unlock_irqrestore(&zone->lock, flags); | ||
84 | if (!ret) | ||
85 | drain_all_pages(); | ||
86 | return ret; | ||
87 | } | ||
88 | |||
89 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) | ||
90 | { | ||
91 | struct zone *zone; | ||
92 | unsigned long flags; | ||
93 | zone = page_zone(page); | ||
94 | spin_lock_irqsave(&zone->lock, flags); | ||
95 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
96 | goto out; | ||
97 | move_freepages_block(zone, page, migratetype); | ||
98 | restore_pageblock_isolate(page, migratetype); | ||
99 | out: | ||
100 | spin_unlock_irqrestore(&zone->lock, flags); | ||
101 | } | ||
102 | |||
10 | static inline struct page * | 103 | static inline struct page * |
11 | __first_valid_page(unsigned long pfn, unsigned long nr_pages) | 104 | __first_valid_page(unsigned long pfn, unsigned long nr_pages) |
12 | { | 105 | { |
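A simplified sketch of how the two helpers above are normally paired: isolate a pfn range pageblock by pageblock and roll everything back on failure. The range API that already lives in this file follows the same pattern; my_isolate_range() is a stand-in and assumes start_pfn is pageblock-aligned.

/* simplified sketch, not the in-tree range API */
static int my_isolate_range(unsigned long start_pfn, unsigned long end_pfn,
			    unsigned migratetype)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		if (!pfn_valid(pfn))
			continue;
		if (set_migratetype_isolate(pfn_to_page(pfn)))
			goto undo;
	}
	return 0;

undo:
	/* restore every pageblock isolated so far */
	for (; start_pfn < pfn; start_pfn += pageblock_nr_pages) {
		if (!pfn_valid(start_pfn))
			continue;
		unset_migratetype_isolate(pfn_to_page(start_pfn), migratetype);
	}
	return -EBUSY;
}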
diff --git a/mm/shmem.c b/mm/shmem.c index c15b998e5a86..d4e184e2a38e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -929,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, | |||
929 | 929 | ||
930 | /* Create a pseudo vma that just contains the policy */ | 930 | /* Create a pseudo vma that just contains the policy */ |
931 | pvma.vm_start = 0; | 931 | pvma.vm_start = 0; |
932 | pvma.vm_pgoff = index; | 932 | /* Bias interleave by inode number to distribute better across nodes */ |
933 | pvma.vm_pgoff = index + info->vfs_inode.i_ino; | ||
933 | pvma.vm_ops = NULL; | 934 | pvma.vm_ops = NULL; |
934 | pvma.vm_policy = spol; | 935 | pvma.vm_policy = spol; |
935 | return swapin_readahead(swap, gfp, &pvma, 0); | 936 | return swapin_readahead(swap, gfp, &pvma, 0); |
@@ -942,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp, | |||
942 | 943 | ||
943 | /* Create a pseudo vma that just contains the policy */ | 944 | /* Create a pseudo vma that just contains the policy */ |
944 | pvma.vm_start = 0; | 945 | pvma.vm_start = 0; |
945 | pvma.vm_pgoff = index; | 946 | /* Bias interleave by inode number to distribute better across nodes */ |
947 | pvma.vm_pgoff = index + info->vfs_inode.i_ino; | ||
946 | pvma.vm_ops = NULL; | 948 | pvma.vm_ops = NULL; |
947 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); | 949 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); |
948 | 950 | ||
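A small, runnable model of why biasing vm_pgoff by the inode number spreads tmpfs files across nodes: interleave picks the node roughly as pgoff modulo the number of allowed nodes, so without the bias page 0 of every file lands on the same node. The inode numbers below are made up, and the plain modulo is a simplification of the real policy code.

#include <stdio.h>

int main(void)
{
	unsigned long nr_nodes = 4;
	unsigned long inodes[] = { 1001, 1002, 1003 };  /* made-up inode numbers */
	unsigned long index = 0;                        /* first page of each file */
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long old_node = index % nr_nodes;                /* pgoff = index       */
		unsigned long new_node = (index + inodes[i]) % nr_nodes;  /* pgoff = index + ino */

		printf("inode %lu: node %lu -> node %lu\n",
		       inodes[i], old_node, new_node);
	}
	return 0;
}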
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -118,12 +118,16 @@ | |||
118 | #include <linux/memory.h> | 118 | #include <linux/memory.h> |
119 | #include <linux/prefetch.h> | 119 | #include <linux/prefetch.h> |
120 | 120 | ||
121 | #include <net/sock.h> | ||
122 | |||
121 | #include <asm/cacheflush.h> | 123 | #include <asm/cacheflush.h> |
122 | #include <asm/tlbflush.h> | 124 | #include <asm/tlbflush.h> |
123 | #include <asm/page.h> | 125 | #include <asm/page.h> |
124 | 126 | ||
125 | #include <trace/events/kmem.h> | 127 | #include <trace/events/kmem.h> |
126 | 128 | ||
129 | #include "internal.h" | ||
130 | |||
127 | /* | 131 | /* |
128 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. | 132 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. |
129 | * 0 for faster, smaller code (especially in the critical paths). | 133 | * 0 for faster, smaller code (especially in the critical paths). |
@@ -152,6 +156,12 @@ | |||
152 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN | 156 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN |
153 | #endif | 157 | #endif |
154 | 158 | ||
159 | /* | ||
160 | * true if a page was allocated from pfmemalloc reserves for network-based | ||
161 | * swap | ||
162 | */ | ||
163 | static bool pfmemalloc_active __read_mostly; | ||
164 | |||
155 | /* Legal flag mask for kmem_cache_create(). */ | 165 | /* Legal flag mask for kmem_cache_create(). */ |
156 | #if DEBUG | 166 | #if DEBUG |
157 | # define CREATE_MASK (SLAB_RED_ZONE | \ | 167 | # define CREATE_MASK (SLAB_RED_ZONE | \ |
@@ -257,9 +267,30 @@ struct array_cache { | |||
257 | * Must have this definition in here for the proper | 267 | * Must have this definition in here for the proper |
258 | * alignment of array_cache. Also simplifies accessing | 268 | * alignment of array_cache. Also simplifies accessing |
259 | * the entries. | 269 | * the entries. |
270 | * | ||
271 | * Entries should not be directly dereferenced as | ||
272 | * entries belonging to slabs marked pfmemalloc will | ||
273 | * have the low bit SLAB_OBJ_PFMEMALLOC set | ||
260 | */ | 274 | */ |
261 | }; | 275 | }; |
262 | 276 | ||
277 | #define SLAB_OBJ_PFMEMALLOC 1 | ||
278 | static inline bool is_obj_pfmemalloc(void *objp) | ||
279 | { | ||
280 | return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; | ||
281 | } | ||
282 | |||
283 | static inline void set_obj_pfmemalloc(void **objp) | ||
284 | { | ||
285 | *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); | ||
286 | return; | ||
287 | } | ||
288 | |||
289 | static inline void clear_obj_pfmemalloc(void **objp) | ||
290 | { | ||
291 | *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); | ||
292 | } | ||
293 | |||
263 | /* | 294 | /* |
264 | * bootstrap: The caches do not work without cpuarrays anymore, but the | 295 | * bootstrap: The caches do not work without cpuarrays anymore, but the |
265 | * cpuarrays are allocated from the generic caches... | 296 | * cpuarrays are allocated from the generic caches... |
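The SLAB_OBJ_PFMEMALLOC helpers above hide a flag in bit 0 of the cached object pointer, which is safe because slab objects are always at least word-aligned. A self-contained userspace demonstration of the same tag/untag arithmetic (the names are shortened; this is not the kernel code):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define OBJ_PFMEMALLOC 1UL   /* plays the role of SLAB_OBJ_PFMEMALLOC */

static int has_tag(void *objp)
{
	return (unsigned long)objp & OBJ_PFMEMALLOC;
}

static void set_tag(void **objp)
{
	*objp = (void *)((unsigned long)*objp | OBJ_PFMEMALLOC);
}

static void clear_tag(void **objp)
{
	*objp = (void *)((unsigned long)*objp & ~OBJ_PFMEMALLOC);
}

int main(void)
{
	void *obj = malloc(64);   /* malloc results are word-aligned too */
	void *saved = obj;

	set_tag(&obj);
	assert(has_tag(obj));

	clear_tag(&obj);
	assert(obj == saved);     /* original pointer fully recovered */

	printf("tagged and recovered %p\n", obj);
	free(obj);
	return 0;
}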
@@ -900,6 +931,124 @@ static struct array_cache *alloc_arraycache(int node, int entries, | |||
900 | return nc; | 931 | return nc; |
901 | } | 932 | } |
902 | 933 | ||
934 | static inline bool is_slab_pfmemalloc(struct slab *slabp) | ||
935 | { | ||
936 | struct page *page = virt_to_page(slabp->s_mem); | ||
937 | |||
938 | return PageSlabPfmemalloc(page); | ||
939 | } | ||
940 | |||
941 | /* Clears pfmemalloc_active if no slabs have pfmemalloc set */ | ||
942 | static void recheck_pfmemalloc_active(struct kmem_cache *cachep, | ||
943 | struct array_cache *ac) | ||
944 | { | ||
945 | struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()]; | ||
946 | struct slab *slabp; | ||
947 | unsigned long flags; | ||
948 | |||
949 | if (!pfmemalloc_active) | ||
950 | return; | ||
951 | |||
952 | spin_lock_irqsave(&l3->list_lock, flags); | ||
953 | list_for_each_entry(slabp, &l3->slabs_full, list) | ||
954 | if (is_slab_pfmemalloc(slabp)) | ||
955 | goto out; | ||
956 | |||
957 | list_for_each_entry(slabp, &l3->slabs_partial, list) | ||
958 | if (is_slab_pfmemalloc(slabp)) | ||
959 | goto out; | ||
960 | |||
961 | list_for_each_entry(slabp, &l3->slabs_free, list) | ||
962 | if (is_slab_pfmemalloc(slabp)) | ||
963 | goto out; | ||
964 | |||
965 | pfmemalloc_active = false; | ||
966 | out: | ||
967 | spin_unlock_irqrestore(&l3->list_lock, flags); | ||
968 | } | ||
969 | |||
970 | static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
971 | gfp_t flags, bool force_refill) | ||
972 | { | ||
973 | int i; | ||
974 | void *objp = ac->entry[--ac->avail]; | ||
975 | |||
976 | /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ | ||
977 | if (unlikely(is_obj_pfmemalloc(objp))) { | ||
978 | struct kmem_list3 *l3; | ||
979 | |||
980 | if (gfp_pfmemalloc_allowed(flags)) { | ||
981 | clear_obj_pfmemalloc(&objp); | ||
982 | return objp; | ||
983 | } | ||
984 | |||
985 | /* The caller cannot use PFMEMALLOC objects, find another one */ | ||
986 | for (i = 1; i < ac->avail; i++) { | ||
987 | /* If a !PFMEMALLOC object is found, swap them */ | ||
988 | if (!is_obj_pfmemalloc(ac->entry[i])) { | ||
989 | objp = ac->entry[i]; | ||
990 | ac->entry[i] = ac->entry[ac->avail]; | ||
991 | ac->entry[ac->avail] = objp; | ||
992 | return objp; | ||
993 | } | ||
994 | } | ||
995 | |||
996 | /* | ||
997 | * If there are empty slabs on the slabs_free list and we are | ||
998 | * being forced to refill the cache, mark this one !pfmemalloc. | ||
999 | */ | ||
1000 | l3 = cachep->nodelists[numa_mem_id()]; | ||
1001 | if (!list_empty(&l3->slabs_free) && force_refill) { | ||
1002 | struct slab *slabp = virt_to_slab(objp); | ||
1003 | ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem)); | ||
1004 | clear_obj_pfmemalloc(&objp); | ||
1005 | recheck_pfmemalloc_active(cachep, ac); | ||
1006 | return objp; | ||
1007 | } | ||
1008 | |||
1009 | /* No !PFMEMALLOC objects available */ | ||
1010 | ac->avail++; | ||
1011 | objp = NULL; | ||
1012 | } | ||
1013 | |||
1014 | return objp; | ||
1015 | } | ||
1016 | |||
1017 | static inline void *ac_get_obj(struct kmem_cache *cachep, | ||
1018 | struct array_cache *ac, gfp_t flags, bool force_refill) | ||
1019 | { | ||
1020 | void *objp; | ||
1021 | |||
1022 | if (unlikely(sk_memalloc_socks())) | ||
1023 | objp = __ac_get_obj(cachep, ac, flags, force_refill); | ||
1024 | else | ||
1025 | objp = ac->entry[--ac->avail]; | ||
1026 | |||
1027 | return objp; | ||
1028 | } | ||
1029 | |||
1030 | static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
1031 | void *objp) | ||
1032 | { | ||
1033 | if (unlikely(pfmemalloc_active)) { | ||
1034 | /* Some pfmemalloc slabs exist, check if this is one */ | ||
1035 | struct page *page = virt_to_page(objp); | ||
1036 | if (PageSlabPfmemalloc(page)) | ||
1037 | set_obj_pfmemalloc(&objp); | ||
1038 | } | ||
1039 | |||
1040 | return objp; | ||
1041 | } | ||
1042 | |||
1043 | static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
1044 | void *objp) | ||
1045 | { | ||
1046 | if (unlikely(sk_memalloc_socks())) | ||
1047 | objp = __ac_put_obj(cachep, ac, objp); | ||
1048 | |||
1049 | ac->entry[ac->avail++] = objp; | ||
1050 | } | ||
1051 | |||
903 | /* | 1052 | /* |
904 | * Transfer objects in one arraycache to another. | 1053 | * Transfer objects in one arraycache to another. |
905 | * Locking must be handled by the caller. | 1054 | * Locking must be handled by the caller. |
@@ -1076,7 +1225,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1076 | STATS_INC_ACOVERFLOW(cachep); | 1225 | STATS_INC_ACOVERFLOW(cachep); |
1077 | __drain_alien_cache(cachep, alien, nodeid); | 1226 | __drain_alien_cache(cachep, alien, nodeid); |
1078 | } | 1227 | } |
1079 | alien->entry[alien->avail++] = objp; | 1228 | ac_put_obj(cachep, alien, objp); |
1080 | spin_unlock(&alien->lock); | 1229 | spin_unlock(&alien->lock); |
1081 | } else { | 1230 | } else { |
1082 | spin_lock(&(cachep->nodelists[nodeid])->list_lock); | 1231 | spin_lock(&(cachep->nodelists[nodeid])->list_lock); |
@@ -1759,6 +1908,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1759 | return NULL; | 1908 | return NULL; |
1760 | } | 1909 | } |
1761 | 1910 | ||
1911 | /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ | ||
1912 | if (unlikely(page->pfmemalloc)) | ||
1913 | pfmemalloc_active = true; | ||
1914 | |||
1762 | nr_pages = (1 << cachep->gfporder); | 1915 | nr_pages = (1 << cachep->gfporder); |
1763 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1916 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1764 | add_zone_page_state(page_zone(page), | 1917 | add_zone_page_state(page_zone(page), |
@@ -1766,9 +1919,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1766 | else | 1919 | else |
1767 | add_zone_page_state(page_zone(page), | 1920 | add_zone_page_state(page_zone(page), |
1768 | NR_SLAB_UNRECLAIMABLE, nr_pages); | 1921 | NR_SLAB_UNRECLAIMABLE, nr_pages); |
1769 | for (i = 0; i < nr_pages; i++) | 1922 | for (i = 0; i < nr_pages; i++) { |
1770 | __SetPageSlab(page + i); | 1923 | __SetPageSlab(page + i); |
1771 | 1924 | ||
1925 | if (page->pfmemalloc) | ||
1926 | SetPageSlabPfmemalloc(page + i); | ||
1927 | } | ||
1928 | |||
1772 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | 1929 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { |
1773 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | 1930 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); |
1774 | 1931 | ||
@@ -1800,6 +1957,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1800 | NR_SLAB_UNRECLAIMABLE, nr_freed); | 1957 | NR_SLAB_UNRECLAIMABLE, nr_freed); |
1801 | while (i--) { | 1958 | while (i--) { |
1802 | BUG_ON(!PageSlab(page)); | 1959 | BUG_ON(!PageSlab(page)); |
1960 | __ClearPageSlabPfmemalloc(page); | ||
1803 | __ClearPageSlab(page); | 1961 | __ClearPageSlab(page); |
1804 | page++; | 1962 | page++; |
1805 | } | 1963 | } |
@@ -3015,16 +3173,19 @@ bad: | |||
3015 | #define check_slabp(x,y) do { } while(0) | 3173 | #define check_slabp(x,y) do { } while(0) |
3016 | #endif | 3174 | #endif |
3017 | 3175 | ||
3018 | static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) | 3176 | static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, |
3177 | bool force_refill) | ||
3019 | { | 3178 | { |
3020 | int batchcount; | 3179 | int batchcount; |
3021 | struct kmem_list3 *l3; | 3180 | struct kmem_list3 *l3; |
3022 | struct array_cache *ac; | 3181 | struct array_cache *ac; |
3023 | int node; | 3182 | int node; |
3024 | 3183 | ||
3025 | retry: | ||
3026 | check_irq_off(); | 3184 | check_irq_off(); |
3027 | node = numa_mem_id(); | 3185 | node = numa_mem_id(); |
3186 | if (unlikely(force_refill)) | ||
3187 | goto force_grow; | ||
3188 | retry: | ||
3028 | ac = cpu_cache_get(cachep); | 3189 | ac = cpu_cache_get(cachep); |
3029 | batchcount = ac->batchcount; | 3190 | batchcount = ac->batchcount; |
3030 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { | 3191 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { |
@@ -3074,8 +3235,8 @@ retry: | |||
3074 | STATS_INC_ACTIVE(cachep); | 3235 | STATS_INC_ACTIVE(cachep); |
3075 | STATS_SET_HIGH(cachep); | 3236 | STATS_SET_HIGH(cachep); |
3076 | 3237 | ||
3077 | ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, | 3238 | ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp, |
3078 | node); | 3239 | node)); |
3079 | } | 3240 | } |
3080 | check_slabp(cachep, slabp); | 3241 | check_slabp(cachep, slabp); |
3081 | 3242 | ||
@@ -3094,18 +3255,22 @@ alloc_done: | |||
3094 | 3255 | ||
3095 | if (unlikely(!ac->avail)) { | 3256 | if (unlikely(!ac->avail)) { |
3096 | int x; | 3257 | int x; |
3258 | force_grow: | ||
3097 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); | 3259 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); |
3098 | 3260 | ||
3099 | /* cache_grow can reenable interrupts, then ac could change. */ | 3261 | /* cache_grow can reenable interrupts, then ac could change. */ |
3100 | ac = cpu_cache_get(cachep); | 3262 | ac = cpu_cache_get(cachep); |
3101 | if (!x && ac->avail == 0) /* no objects in sight? abort */ | 3263 | |
3264 | /* no objects in sight? abort */ | ||
3265 | if (!x && (ac->avail == 0 || force_refill)) | ||
3102 | return NULL; | 3266 | return NULL; |
3103 | 3267 | ||
3104 | if (!ac->avail) /* objects refilled by interrupt? */ | 3268 | if (!ac->avail) /* objects refilled by interrupt? */ |
3105 | goto retry; | 3269 | goto retry; |
3106 | } | 3270 | } |
3107 | ac->touched = 1; | 3271 | ac->touched = 1; |
3108 | return ac->entry[--ac->avail]; | 3272 | |
3273 | return ac_get_obj(cachep, ac, flags, force_refill); | ||
3109 | } | 3274 | } |
3110 | 3275 | ||
3111 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, | 3276 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, |
@@ -3187,23 +3352,35 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3187 | { | 3352 | { |
3188 | void *objp; | 3353 | void *objp; |
3189 | struct array_cache *ac; | 3354 | struct array_cache *ac; |
3355 | bool force_refill = false; | ||
3190 | 3356 | ||
3191 | check_irq_off(); | 3357 | check_irq_off(); |
3192 | 3358 | ||
3193 | ac = cpu_cache_get(cachep); | 3359 | ac = cpu_cache_get(cachep); |
3194 | if (likely(ac->avail)) { | 3360 | if (likely(ac->avail)) { |
3195 | STATS_INC_ALLOCHIT(cachep); | ||
3196 | ac->touched = 1; | 3361 | ac->touched = 1; |
3197 | objp = ac->entry[--ac->avail]; | 3362 | objp = ac_get_obj(cachep, ac, flags, false); |
3198 | } else { | 3363 | |
3199 | STATS_INC_ALLOCMISS(cachep); | ||
3200 | objp = cache_alloc_refill(cachep, flags); | ||
3201 | /* | 3364 | /* |
3202 | * the 'ac' may be updated by cache_alloc_refill(), | 3365 | * Allow for the possibility that all available objects are disallowed |
3203 | * and kmemleak_erase() requires its correct value. | 3366 | * by the current gfp flags |
3204 | */ | 3367 | */ |
3205 | ac = cpu_cache_get(cachep); | 3368 | if (objp) { |
3369 | STATS_INC_ALLOCHIT(cachep); | ||
3370 | goto out; | ||
3371 | } | ||
3372 | force_refill = true; | ||
3206 | } | 3373 | } |
3374 | |||
3375 | STATS_INC_ALLOCMISS(cachep); | ||
3376 | objp = cache_alloc_refill(cachep, flags, force_refill); | ||
3377 | /* | ||
3378 | * the 'ac' may be updated by cache_alloc_refill(), | ||
3379 | * and kmemleak_erase() requires its correct value. | ||
3380 | */ | ||
3381 | ac = cpu_cache_get(cachep); | ||
3382 | |||
3383 | out: | ||
3207 | /* | 3384 | /* |
3208 | * To avoid a false negative, if an object that is in one of the | 3385 | * To avoid a false negative, if an object that is in one of the |
3209 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't | 3386 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't |
@@ -3525,9 +3702,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, | |||
3525 | struct kmem_list3 *l3; | 3702 | struct kmem_list3 *l3; |
3526 | 3703 | ||
3527 | for (i = 0; i < nr_objects; i++) { | 3704 | for (i = 0; i < nr_objects; i++) { |
3528 | void *objp = objpp[i]; | 3705 | void *objp; |
3529 | struct slab *slabp; | 3706 | struct slab *slabp; |
3530 | 3707 | ||
3708 | clear_obj_pfmemalloc(&objpp[i]); | ||
3709 | objp = objpp[i]; | ||
3710 | |||
3531 | slabp = virt_to_slab(objp); | 3711 | slabp = virt_to_slab(objp); |
3532 | l3 = cachep->nodelists[node]; | 3712 | l3 = cachep->nodelists[node]; |
3533 | list_del(&slabp->list); | 3713 | list_del(&slabp->list); |
@@ -3645,7 +3825,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, | |||
3645 | cache_flusharray(cachep, ac); | 3825 | cache_flusharray(cachep, ac); |
3646 | } | 3826 | } |
3647 | 3827 | ||
3648 | ac->entry[ac->avail++] = objp; | 3828 | ac_put_obj(cachep, ac, objp); |
3649 | } | 3829 | } |
3650 | 3830 | ||
3651 | /** | 3831 | /** |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -34,6 +34,8 @@ | |||
34 | 34 | ||
35 | #include <trace/events/kmem.h> | 35 | #include <trace/events/kmem.h> |
36 | 36 | ||
37 | #include "internal.h" | ||
38 | |||
37 | /* | 39 | /* |
38 | * Lock order: | 40 | * Lock order: |
39 | * 1. slab_mutex (Global Mutex) | 41 | * 1. slab_mutex (Global Mutex) |
@@ -1354,6 +1356,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1354 | inc_slabs_node(s, page_to_nid(page), page->objects); | 1356 | inc_slabs_node(s, page_to_nid(page), page->objects); |
1355 | page->slab = s; | 1357 | page->slab = s; |
1356 | __SetPageSlab(page); | 1358 | __SetPageSlab(page); |
1359 | if (page->pfmemalloc) | ||
1360 | SetPageSlabPfmemalloc(page); | ||
1357 | 1361 | ||
1358 | start = page_address(page); | 1362 | start = page_address(page); |
1359 | 1363 | ||
@@ -1397,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1397 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1401 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
1398 | -pages); | 1402 | -pages); |
1399 | 1403 | ||
1404 | __ClearPageSlabPfmemalloc(page); | ||
1400 | __ClearPageSlab(page); | 1405 | __ClearPageSlab(page); |
1401 | reset_page_mapcount(page); | 1406 | reset_page_mapcount(page); |
1402 | if (current->reclaim_state) | 1407 | if (current->reclaim_state) |
@@ -2126,6 +2131,14 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | |||
2126 | return freelist; | 2131 | return freelist; |
2127 | } | 2132 | } |
2128 | 2133 | ||
2134 | static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) | ||
2135 | { | ||
2136 | if (unlikely(PageSlabPfmemalloc(page))) | ||
2137 | return gfp_pfmemalloc_allowed(gfpflags); | ||
2138 | |||
2139 | return true; | ||
2140 | } | ||
2141 | |||
2129 | /* | 2142 | /* |
2130 | * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist | 2143 | * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist |
2131 | * or deactivate the page. | 2144 | * or deactivate the page. |
@@ -2206,6 +2219,18 @@ redo: | |||
2206 | goto new_slab; | 2219 | goto new_slab; |
2207 | } | 2220 | } |
2208 | 2221 | ||
2222 | /* | ||
2223 | * By rights, we should be searching for a slab page that was | ||
2224 | * PFMEMALLOC but right now, we are losing the pfmemalloc | ||
2225 | * information when the page leaves the per-cpu allocator | ||
2226 | */ | ||
2227 | if (unlikely(!pfmemalloc_match(page, gfpflags))) { | ||
2228 | deactivate_slab(s, page, c->freelist); | ||
2229 | c->page = NULL; | ||
2230 | c->freelist = NULL; | ||
2231 | goto new_slab; | ||
2232 | } | ||
2233 | |||
2209 | /* must check again c->freelist in case of cpu migration or IRQ */ | 2234 | /* must check again c->freelist in case of cpu migration or IRQ */ |
2210 | freelist = c->freelist; | 2235 | freelist = c->freelist; |
2211 | if (freelist) | 2236 | if (freelist) |
@@ -2256,11 +2281,11 @@ new_slab: | |||
2256 | } | 2281 | } |
2257 | 2282 | ||
2258 | page = c->page; | 2283 | page = c->page; |
2259 | if (likely(!kmem_cache_debug(s))) | 2284 | if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) |
2260 | goto load_freelist; | 2285 | goto load_freelist; |
2261 | 2286 | ||
2262 | /* Only entered in the debug case */ | 2287 | /* Only entered in the debug case */ |
2263 | if (!alloc_debug_processing(s, page, freelist, addr)) | 2288 | if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr)) |
2264 | goto new_slab; /* Slab failed checks. Next slab needed */ | 2289 | goto new_slab; /* Slab failed checks. Next slab needed */ |
2265 | 2290 | ||
2266 | deactivate_slab(s, page, get_freepointer(s, freelist)); | 2291 | deactivate_slab(s, page, get_freepointer(s, freelist)); |
@@ -2313,7 +2338,6 @@ redo: | |||
2313 | object = c->freelist; | 2338 | object = c->freelist; |
2314 | page = c->page; | 2339 | page = c->page; |
2315 | if (unlikely(!object || !node_match(page, node))) | 2340 | if (unlikely(!object || !node_match(page, node))) |
2316 | |||
2317 | object = __slab_alloc(s, gfpflags, node, addr, c); | 2341 | object = __slab_alloc(s, gfpflags, node, addr, c); |
2318 | 2342 | ||
2319 | else { | 2343 | else { |
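pfmemalloc_match() above enforces the policy that only callers entitled to the emergency reserves may consume objects from a reserve-backed slab page. A caller-level sketch of that split, assuming the __GFP_MEMALLOC flag introduced elsewhere in this series; my_rx_alloc() is hypothetical.

/* hypothetical helper on a swap-over-network receive path, sketch only */
static void *my_rx_alloc(size_t size, bool from_reserves)
{
	gfp_t gfp = GFP_ATOMIC;

	if (from_reserves)
		gfp |= __GFP_MEMALLOC;  /* may consume pfmemalloc slab pages */

	/* without the flag, pfmemalloc slabs are skipped or deactivated */
	return kmalloc(size, gfp);
}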
diff --git a/mm/sparse.c b/mm/sparse.c index c7bb952400c8..fac95f2888f2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -65,21 +65,18 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) | |||
65 | 65 | ||
66 | if (slab_is_available()) { | 66 | if (slab_is_available()) { |
67 | if (node_state(nid, N_HIGH_MEMORY)) | 67 | if (node_state(nid, N_HIGH_MEMORY)) |
68 | section = kmalloc_node(array_size, GFP_KERNEL, nid); | 68 | section = kzalloc_node(array_size, GFP_KERNEL, nid); |
69 | else | 69 | else |
70 | section = kmalloc(array_size, GFP_KERNEL); | 70 | section = kzalloc(array_size, GFP_KERNEL); |
71 | } else | 71 | } else { |
72 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); | 72 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); |
73 | 73 | } | |
74 | if (section) | ||
75 | memset(section, 0, array_size); | ||
76 | 74 | ||
77 | return section; | 75 | return section; |
78 | } | 76 | } |
79 | 77 | ||
80 | static int __meminit sparse_index_init(unsigned long section_nr, int nid) | 78 | static int __meminit sparse_index_init(unsigned long section_nr, int nid) |
81 | { | 79 | { |
82 | static DEFINE_SPINLOCK(index_init_lock); | ||
83 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); | 80 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); |
84 | struct mem_section *section; | 81 | struct mem_section *section; |
85 | int ret = 0; | 82 | int ret = 0; |
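The sparse_index_alloc() change above folds the explicit zeroing into the allocation; a minimal sketch of the equivalence being relied on (alloc_index_array() is a hypothetical stand-in, and the old memset() also ran needlessly for bootmem allocations, which come back zeroed anyway):

/* hypothetical stand-in for the slab-available branch above */
static struct mem_section *alloc_index_array(unsigned long array_size, int nid)
{
	/*
	 * Equivalent to:
	 *	s = kmalloc_node(array_size, GFP_KERNEL, nid);
	 *	if (s)
	 *		memset(s, 0, array_size);
	 */
	return kzalloc_node(array_size, GFP_KERNEL, nid);
}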
@@ -90,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) | |||
90 | section = sparse_index_alloc(nid); | 87 | section = sparse_index_alloc(nid); |
91 | if (!section) | 88 | if (!section) |
92 | return -ENOMEM; | 89 | return -ENOMEM; |
93 | /* | ||
94 | * This lock keeps two different sections from | ||
95 | * reallocating for the same index | ||
96 | */ | ||
97 | spin_lock(&index_init_lock); | ||
98 | |||
99 | if (mem_section[root]) { | ||
100 | ret = -EEXIST; | ||
101 | goto out; | ||
102 | } | ||
103 | 90 | ||
104 | mem_section[root] = section; | 91 | mem_section[root] = section; |
105 | out: | 92 | |
106 | spin_unlock(&index_init_lock); | ||
107 | return ret; | 93 | return ret; |
108 | } | 94 | } |
109 | #else /* !SPARSEMEM_EXTREME */ | 95 | #else /* !SPARSEMEM_EXTREME */ |
@@ -132,6 +118,8 @@ int __section_nr(struct mem_section* ms) | |||
132 | break; | 118 | break; |
133 | } | 119 | } |
134 | 120 | ||
121 | VM_BUG_ON(root_nr == NR_SECTION_ROOTS); | ||
122 | |||
135 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); | 123 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); |
136 | } | 124 | } |
137 | 125 | ||
@@ -493,6 +481,9 @@ void __init sparse_init(void) | |||
493 | struct page **map_map; | 481 | struct page **map_map; |
494 | #endif | 482 | #endif |
495 | 483 | ||
484 | /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ | ||
485 | set_pageblock_order(); | ||
486 | |||
496 | /* | 487 | /* |
497 | * map is using big page (aka 2M in x86 64 bit) | 488 | * map is using big page (aka 2M in x86 64 bit) |
498 | * usemap is less one page (aka 24 bytes) | 489 | * usemap is less one page (aka 24 bytes) |
@@ -236,6 +236,58 @@ void put_pages_list(struct list_head *pages) | |||
236 | } | 236 | } |
237 | EXPORT_SYMBOL(put_pages_list); | 237 | EXPORT_SYMBOL(put_pages_list); |
238 | 238 | ||
239 | /* | ||
240 | * get_kernel_pages() - pin kernel pages in memory | ||
241 | * @kiov: An array of struct kvec structures | ||
242 | * @nr_segs: number of segments to pin | ||
243 | * @write: pinning for read/write, currently ignored | ||
244 | * @pages: array that receives pointers to the pages pinned. | ||
245 | * Should be at least nr_segs long. | ||
246 | * | ||
247 | * Returns number of pages pinned. This may be fewer than the number | ||
248 | * requested. If nr_segs is 0 or negative, returns 0. If no pages | ||
249 | * were pinned, returns -errno. Each page returned must be released | ||
250 | * with a put_page() call when it is finished with. | ||
251 | */ | ||
252 | int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, | ||
253 | struct page **pages) | ||
254 | { | ||
255 | int seg; | ||
256 | |||
257 | for (seg = 0; seg < nr_segs; seg++) { | ||
258 | if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE)) | ||
259 | return seg; | ||
260 | |||
261 | pages[seg] = kmap_to_page(kiov[seg].iov_base); | ||
262 | page_cache_get(pages[seg]); | ||
263 | } | ||
264 | |||
265 | return seg; | ||
266 | } | ||
267 | EXPORT_SYMBOL_GPL(get_kernel_pages); | ||
268 | |||
269 | /* | ||
270 | * get_kernel_page() - pin a kernel page in memory | ||
271 | * @start: starting kernel address | ||
272 | * @write: pinning for read/write, currently ignored | ||
273 | * @pages: array that receives pointer to the page pinned. | ||
274 | * Must have room for one page pointer. | ||
275 | * | ||
276 | * Returns 1 if page is pinned. If the page was not pinned, returns | ||
277 | * -errno. The page returned must be released with a put_page() call | ||
278 | * when it is finished with. | ||
279 | */ | ||
280 | int get_kernel_page(unsigned long start, int write, struct page **pages) | ||
281 | { | ||
282 | const struct kvec kiov = { | ||
283 | .iov_base = (void *)start, | ||
284 | .iov_len = PAGE_SIZE | ||
285 | }; | ||
286 | |||
287 | return get_kernel_pages(&kiov, 1, write, pages); | ||
288 | } | ||
289 | EXPORT_SYMBOL_GPL(get_kernel_page); | ||
290 | |||
239 | static void pagevec_lru_move_fn(struct pagevec *pvec, | 291 | static void pagevec_lru_move_fn(struct pagevec *pvec, |
240 | void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), | 292 | void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), |
241 | void *arg) | 293 | void *arg) |
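A short usage sketch for the new helper: pin the page behind a page-sized, page-aligned kernel buffer, use it as a struct page, then drop the extra reference. The function is illustrative and not taken from a caller in this series.

/* illustrative only */
static int my_pin_one_page(void)
{
	unsigned long addr = __get_free_page(GFP_KERNEL);  /* page-aligned by construction */
	struct page *page;
	int ret;

	if (!addr)
		return -ENOMEM;

	ret = get_kernel_page(addr, 0, &page);
	if (ret == 1) {
		/* ... hand 'page' to code that wants a struct page ... */
		put_page(page);  /* drop the reference get_kernel_page() took */
	}

	free_page(addr);
	return ret == 1 ? 0 : -EINVAL;
}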
diff --git a/mm/swap_state.c b/mm/swap_state.c index 4c5ff7f284d9..0cb36fb1f61c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/pagemap.h> | 15 | #include <linux/pagemap.h> |
16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
17 | #include <linux/blkdev.h> | ||
17 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
18 | #include <linux/migrate.h> | 19 | #include <linux/migrate.h> |
19 | #include <linux/page_cgroup.h> | 20 | #include <linux/page_cgroup.h> |
@@ -26,7 +27,7 @@ | |||
26 | */ | 27 | */ |
27 | static const struct address_space_operations swap_aops = { | 28 | static const struct address_space_operations swap_aops = { |
28 | .writepage = swap_writepage, | 29 | .writepage = swap_writepage, |
29 | .set_page_dirty = __set_page_dirty_no_writeback, | 30 | .set_page_dirty = swap_set_page_dirty, |
30 | .migratepage = migrate_page, | 31 | .migratepage = migrate_page, |
31 | }; | 32 | }; |
32 | 33 | ||
@@ -376,6 +377,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
376 | unsigned long offset = swp_offset(entry); | 377 | unsigned long offset = swp_offset(entry); |
377 | unsigned long start_offset, end_offset; | 378 | unsigned long start_offset, end_offset; |
378 | unsigned long mask = (1UL << page_cluster) - 1; | 379 | unsigned long mask = (1UL << page_cluster) - 1; |
380 | struct blk_plug plug; | ||
379 | 381 | ||
380 | /* Read a page_cluster sized and aligned cluster around offset. */ | 382 | /* Read a page_cluster sized and aligned cluster around offset. */ |
381 | start_offset = offset & ~mask; | 383 | start_offset = offset & ~mask; |
@@ -383,6 +385,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
383 | if (!start_offset) /* First page is swap header. */ | 385 | if (!start_offset) /* First page is swap header. */ |
384 | start_offset++; | 386 | start_offset++; |
385 | 387 | ||
388 | blk_start_plug(&plug); | ||
386 | for (offset = start_offset; offset <= end_offset ; offset++) { | 389 | for (offset = start_offset; offset <= end_offset ; offset++) { |
387 | /* Ok, do the async read-ahead now */ | 390 | /* Ok, do the async read-ahead now */ |
388 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), | 391 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), |
@@ -391,6 +394,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
391 | continue; | 394 | continue; |
392 | page_cache_release(page); | 395 | page_cache_release(page); |
393 | } | 396 | } |
397 | blk_finish_plug(&plug); | ||
398 | |||
394 | lru_add_drain(); /* Push any new pages onto the LRU now */ | 399 | lru_add_drain(); /* Push any new pages onto the LRU now */ |
395 | return read_swap_cache_async(entry, gfp_mask, vma, addr); | 400 | return read_swap_cache_async(entry, gfp_mask, vma, addr); |
396 | } | 401 | } |
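The plug wrapped around the readahead loop lets the block layer gather the queued requests and dispatch them together when the plug is released. A generic sketch of the pattern; submit_one_read() is a hypothetical stand-in for read_swap_cache_async() in the loop above.

void submit_one_read(int idx);  /* hypothetical helper that queues async I/O */

static void my_batched_reads(int nr_reads)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr_reads; i++)
		submit_one_read(i);
	blk_finish_plug(&plug);  /* queued requests are dispatched together here */
}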
diff --git a/mm/swapfile.c b/mm/swapfile.c index 71373d03fcee..14e254c768fc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/oom.h> | 33 | #include <linux/oom.h> |
34 | #include <linux/frontswap.h> | 34 | #include <linux/frontswap.h> |
35 | #include <linux/swapfile.h> | 35 | #include <linux/swapfile.h> |
36 | #include <linux/export.h> | ||
36 | 37 | ||
37 | #include <asm/pgtable.h> | 38 | #include <asm/pgtable.h> |
38 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
@@ -548,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
548 | 549 | ||
549 | /* free if no reference */ | 550 | /* free if no reference */ |
550 | if (!usage) { | 551 | if (!usage) { |
551 | struct gendisk *disk = p->bdev->bd_disk; | ||
552 | if (offset < p->lowest_bit) | 552 | if (offset < p->lowest_bit) |
553 | p->lowest_bit = offset; | 553 | p->lowest_bit = offset; |
554 | if (offset > p->highest_bit) | 554 | if (offset > p->highest_bit) |
@@ -559,9 +559,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
559 | nr_swap_pages++; | 559 | nr_swap_pages++; |
560 | p->inuse_pages--; | 560 | p->inuse_pages--; |
561 | frontswap_invalidate_page(p->type, offset); | 561 | frontswap_invalidate_page(p->type, offset); |
562 | if ((p->flags & SWP_BLKDEV) && | 562 | if (p->flags & SWP_BLKDEV) { |
563 | disk->fops->swap_slot_free_notify) | 563 | struct gendisk *disk = p->bdev->bd_disk; |
564 | disk->fops->swap_slot_free_notify(p->bdev, offset); | 564 | if (disk->fops->swap_slot_free_notify) |
565 | disk->fops->swap_slot_free_notify(p->bdev, | ||
566 | offset); | ||
567 | } | ||
565 | } | 568 | } |
566 | 569 | ||
567 | return usage; | 570 | return usage; |
@@ -832,8 +835,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
832 | 835 | ||
833 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 836 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
834 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { | 837 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { |
835 | if (ret > 0) | 838 | mem_cgroup_cancel_charge_swapin(memcg); |
836 | mem_cgroup_cancel_charge_swapin(memcg); | ||
837 | ret = 0; | 839 | ret = 0; |
838 | goto out; | 840 | goto out; |
839 | } | 841 | } |
@@ -1328,6 +1330,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis) | |||
1328 | list_del(&se->list); | 1330 | list_del(&se->list); |
1329 | kfree(se); | 1331 | kfree(se); |
1330 | } | 1332 | } |
1333 | |||
1334 | if (sis->flags & SWP_FILE) { | ||
1335 | struct file *swap_file = sis->swap_file; | ||
1336 | struct address_space *mapping = swap_file->f_mapping; | ||
1337 | |||
1338 | sis->flags &= ~SWP_FILE; | ||
1339 | mapping->a_ops->swap_deactivate(swap_file); | ||
1340 | } | ||
1331 | } | 1341 | } |
1332 | 1342 | ||
1333 | /* | 1343 | /* |
@@ -1336,7 +1346,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis) | |||
1336 | * | 1346 | * |
1337 | * This function rather assumes that it is called in ascending page order. | 1347 | * This function rather assumes that it is called in ascending page order. |
1338 | */ | 1348 | */ |
1339 | static int | 1349 | int |
1340 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | 1350 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, |
1341 | unsigned long nr_pages, sector_t start_block) | 1351 | unsigned long nr_pages, sector_t start_block) |
1342 | { | 1352 | { |
@@ -1409,98 +1419,28 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
1409 | */ | 1419 | */ |
1410 | static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | 1420 | static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) |
1411 | { | 1421 | { |
1412 | struct inode *inode; | 1422 | struct file *swap_file = sis->swap_file; |
1413 | unsigned blocks_per_page; | 1423 | struct address_space *mapping = swap_file->f_mapping; |
1414 | unsigned long page_no; | 1424 | struct inode *inode = mapping->host; |
1415 | unsigned blkbits; | ||
1416 | sector_t probe_block; | ||
1417 | sector_t last_block; | ||
1418 | sector_t lowest_block = -1; | ||
1419 | sector_t highest_block = 0; | ||
1420 | int nr_extents = 0; | ||
1421 | int ret; | 1425 | int ret; |
1422 | 1426 | ||
1423 | inode = sis->swap_file->f_mapping->host; | ||
1424 | if (S_ISBLK(inode->i_mode)) { | 1427 | if (S_ISBLK(inode->i_mode)) { |
1425 | ret = add_swap_extent(sis, 0, sis->max, 0); | 1428 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1426 | *span = sis->pages; | 1429 | *span = sis->pages; |
1427 | goto out; | 1430 | return ret; |
1428 | } | 1431 | } |
1429 | 1432 | ||
1430 | blkbits = inode->i_blkbits; | 1433 | if (mapping->a_ops->swap_activate) { |
1431 | blocks_per_page = PAGE_SIZE >> blkbits; | 1434 | ret = mapping->a_ops->swap_activate(sis, swap_file, span); |
1432 | 1435 | if (!ret) { | |
1433 | /* | 1436 | sis->flags |= SWP_FILE; |
1434 | * Map all the blocks into the extent list. This code doesn't try | 1437 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1435 | * to be very smart. | 1438 | *span = sis->pages; |
1436 | */ | ||
1437 | probe_block = 0; | ||
1438 | page_no = 0; | ||
1439 | last_block = i_size_read(inode) >> blkbits; | ||
1440 | while ((probe_block + blocks_per_page) <= last_block && | ||
1441 | page_no < sis->max) { | ||
1442 | unsigned block_in_page; | ||
1443 | sector_t first_block; | ||
1444 | |||
1445 | first_block = bmap(inode, probe_block); | ||
1446 | if (first_block == 0) | ||
1447 | goto bad_bmap; | ||
1448 | |||
1449 | /* | ||
1450 | * It must be PAGE_SIZE aligned on-disk | ||
1451 | */ | ||
1452 | if (first_block & (blocks_per_page - 1)) { | ||
1453 | probe_block++; | ||
1454 | goto reprobe; | ||
1455 | } | ||
1456 | |||
1457 | for (block_in_page = 1; block_in_page < blocks_per_page; | ||
1458 | block_in_page++) { | ||
1459 | sector_t block; | ||
1460 | |||
1461 | block = bmap(inode, probe_block + block_in_page); | ||
1462 | if (block == 0) | ||
1463 | goto bad_bmap; | ||
1464 | if (block != first_block + block_in_page) { | ||
1465 | /* Discontiguity */ | ||
1466 | probe_block++; | ||
1467 | goto reprobe; | ||
1468 | } | ||
1469 | } | ||
1470 | |||
1471 | first_block >>= (PAGE_SHIFT - blkbits); | ||
1472 | if (page_no) { /* exclude the header page */ | ||
1473 | if (first_block < lowest_block) | ||
1474 | lowest_block = first_block; | ||
1475 | if (first_block > highest_block) | ||
1476 | highest_block = first_block; | ||
1477 | } | 1439 | } |
1440 | return ret; | ||
1441 | } | ||
1478 | 1442 | ||
1479 | /* | 1443 | return generic_swapfile_activate(sis, swap_file, span); |
1480 | * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks | ||
1481 | */ | ||
1482 | ret = add_swap_extent(sis, page_no, 1, first_block); | ||
1483 | if (ret < 0) | ||
1484 | goto out; | ||
1485 | nr_extents += ret; | ||
1486 | page_no++; | ||
1487 | probe_block += blocks_per_page; | ||
1488 | reprobe: | ||
1489 | continue; | ||
1490 | } | ||
1491 | ret = nr_extents; | ||
1492 | *span = 1 + highest_block - lowest_block; | ||
1493 | if (page_no == 0) | ||
1494 | page_no = 1; /* force Empty message */ | ||
1495 | sis->max = page_no; | ||
1496 | sis->pages = page_no - 1; | ||
1497 | sis->highest_bit = page_no - 1; | ||
1498 | out: | ||
1499 | return ret; | ||
1500 | bad_bmap: | ||
1501 | printk(KERN_ERR "swapon: swapfile has holes\n"); | ||
1502 | ret = -EINVAL; | ||
1503 | goto out; | ||
1504 | } | 1444 | } |
1505 | 1445 | ||
1506 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1446 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
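For illustration only (not part of this patch): a minimal sketch of how a hypothetical filesystem could supply the new ->swap_activate() hook that setup_swap_extents() now tries before falling back to generic_swapfile_activate(). The callback signature is the one called in the hunk above; the examplefs_* names are invented, and a real implementation would also need to pin the file's blocks.

	static int examplefs_swap_activate(struct swap_info_struct *sis,
					   struct file *swap_file, sector_t *span)
	{
		struct inode *inode = swap_file->f_mapping->host;

		/* A swap file must not have holes; this is only a crude check. */
		if ((loff_t)inode->i_blocks << 9 < i_size_read(inode))
			return -EINVAL;

		/*
		 * Returning 0 lets setup_swap_extents() set SWP_FILE, add a
		 * single extent covering the whole area and fill *span itself.
		 */
		return 0;
	}

	static const struct address_space_operations examplefs_aops = {
		/* ...readpage, writepage and friends... */
		.swap_activate	= examplefs_swap_activate,
	};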
@@ -2285,6 +2225,31 @@ int swapcache_prepare(swp_entry_t entry) | |||
2285 | return __swap_duplicate(entry, SWAP_HAS_CACHE); | 2225 | return __swap_duplicate(entry, SWAP_HAS_CACHE); |
2286 | } | 2226 | } |
2287 | 2227 | ||
2228 | struct swap_info_struct *page_swap_info(struct page *page) | ||
2229 | { | ||
2230 | swp_entry_t swap = { .val = page_private(page) }; | ||
2231 | BUG_ON(!PageSwapCache(page)); | ||
2232 | return swap_info[swp_type(swap)]; | ||
2233 | } | ||
2234 | |||
2235 | /* | ||
2236 | * out-of-line __page_file_ methods to avoid include hell. | ||
2237 | */ | ||
2238 | struct address_space *__page_file_mapping(struct page *page) | ||
2239 | { | ||
2240 | VM_BUG_ON(!PageSwapCache(page)); | ||
2241 | return page_swap_info(page)->swap_file->f_mapping; | ||
2242 | } | ||
2243 | EXPORT_SYMBOL_GPL(__page_file_mapping); | ||
2244 | |||
2245 | pgoff_t __page_file_index(struct page *page) | ||
2246 | { | ||
2247 | swp_entry_t swap = { .val = page_private(page) }; | ||
2248 | VM_BUG_ON(!PageSwapCache(page)); | ||
2249 | return swp_offset(swap); | ||
2250 | } | ||
2251 | EXPORT_SYMBOL_GPL(__page_file_index); | ||
2252 | |||
2288 | /* | 2253 | /* |
2289 | * add_swap_count_continuation - called when a swap count is duplicated | 2254 | * add_swap_count_continuation - called when a swap count is duplicated |
2290 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's | 2255 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's |
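For illustration only: a hedged consistency check built from the helpers added above, showing how a swap-cache page resolves to its backing file and offset. Only the example_* name is invented; every other identifier appears in the hunk itself.

	static bool example_swapcache_page_consistent(struct page *page)
	{
		swp_entry_t entry = { .val = page_private(page) };
		struct swap_info_struct *sis = page_swap_info(page);

		return __page_file_mapping(page) == sis->swap_file->f_mapping &&
		       __page_file_index(page) == swp_offset(entry);
	}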
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e03f4c7307a5..2bb90b1d241c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -413,11 +413,11 @@ nocache: | |||
413 | if (addr + size - 1 < addr) | 413 | if (addr + size - 1 < addr) |
414 | goto overflow; | 414 | goto overflow; |
415 | 415 | ||
416 | n = rb_next(&first->rb_node); | 416 | if (list_is_last(&first->list, &vmap_area_list)) |
417 | if (n) | ||
418 | first = rb_entry(n, struct vmap_area, rb_node); | ||
419 | else | ||
420 | goto found; | 417 | goto found; |
418 | |||
419 | first = list_entry(first->list.next, | ||
420 | struct vmap_area, list); | ||
421 | } | 421 | } |
422 | 422 | ||
423 | found: | 423 | found: |
@@ -904,6 +904,14 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | |||
904 | 904 | ||
905 | BUG_ON(size & ~PAGE_MASK); | 905 | BUG_ON(size & ~PAGE_MASK); |
906 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | 906 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); |
907 | if (WARN_ON(size == 0)) { | ||
908 | /* | ||
909 | * Allocating 0 bytes isn't what the caller wants since | ||
910 | * get_order(0) returns a funny result. Just warn and terminate | ||
911 | * early. | ||
912 | */ | ||
913 | return NULL; | ||
914 | } | ||
907 | order = get_order(size); | 915 | order = get_order(size); |
908 | 916 | ||
909 | again: | 917 | again: |
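For illustration only: a userspace replica of the generic get_order() of this era (an assumption; architectures may provide their own implementation), showing the "funny result" the new WARN_ON(size == 0) guard in vb_alloc() protects against.

	#include <stdio.h>

	#define PAGE_SHIFT 12			/* assumes 4 KiB pages */

	static int get_order_replica(unsigned long size)
	{
		size--;				/* size == 0 underflows to ULONG_MAX here */
		size >>= PAGE_SHIFT;
		return size ? (int)(8 * sizeof(size) - __builtin_clzl(size)) : 0;
	}

	int main(void)
	{
		printf("get_order(4096) -> %d\n", get_order_replica(4096));	/* 0 */
		printf("get_order(8192) -> %d\n", get_order_replica(8192));	/* 1 */
		printf("get_order(0)    -> %d\n", get_order_replica(0));	/* 52 on 64-bit */
		return 0;
	}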
diff --git a/mm/vmscan.c b/mm/vmscan.c index 347b3ff2a478..8d01243d9560 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */ | |||
133 | static LIST_HEAD(shrinker_list); | 133 | static LIST_HEAD(shrinker_list); |
134 | static DECLARE_RWSEM(shrinker_rwsem); | 134 | static DECLARE_RWSEM(shrinker_rwsem); |
135 | 135 | ||
136 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 136 | #ifdef CONFIG_MEMCG |
137 | static bool global_reclaim(struct scan_control *sc) | 137 | static bool global_reclaim(struct scan_control *sc) |
138 | { | 138 | { |
139 | return !sc->target_mem_cgroup; | 139 | return !sc->target_mem_cgroup; |
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
687 | 687 | ||
688 | cond_resched(); | 688 | cond_resched(); |
689 | 689 | ||
690 | mem_cgroup_uncharge_start(); | ||
690 | while (!list_empty(page_list)) { | 691 | while (!list_empty(page_list)) { |
691 | enum page_references references; | 692 | enum page_references references; |
692 | struct address_space *mapping; | 693 | struct address_space *mapping; |
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
720 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); | 721 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); |
721 | 722 | ||
722 | if (PageWriteback(page)) { | 723 | if (PageWriteback(page)) { |
723 | nr_writeback++; | 724 | /* |
724 | unlock_page(page); | 725 | * memcg doesn't have any dirty pages throttling so we |
725 | goto keep; | 726 | * could easily OOM just because too many pages are in |
727 | * writeback and there is nothing else to reclaim. | ||
728 | * | ||
729 | * Check __GFP_IO, certainly because a loop driver | ||
730 | * thread might enter reclaim, and deadlock if it waits | ||
731 | * on a page for which it is needed to do the write | ||
732 | * (loop masks off __GFP_IO|__GFP_FS for this reason); | ||
733 | * but more thought would probably show more reasons. | ||
734 | * | ||
735 | * Don't require __GFP_FS, since we're not going into | ||
736 | * the FS, just waiting on its writeback completion. | ||
737 | * Worryingly, ext4 gfs2 and xfs allocate pages with | ||
738 | * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so | ||
739 | * testing may_enter_fs here is liable to OOM on them. | ||
740 | */ | ||
741 | if (global_reclaim(sc) || | ||
742 | !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { | ||
743 | /* | ||
744 | * This is slightly racy - end_page_writeback() | ||
745 | * might have just cleared PageReclaim, then | ||
746 | * setting PageReclaim here ends up interpreted | ||
747 | * as PageReadahead - but that does not matter | ||
748 | * enough to care. What we do want is for this | ||
749 | * page to have PageReclaim set next time memcg | ||
750 | * reclaim reaches the tests above, so it will | ||
751 | * then wait_on_page_writeback() to avoid OOM; | ||
752 | * and it's also appropriate in global reclaim. | ||
753 | */ | ||
754 | SetPageReclaim(page); | ||
755 | nr_writeback++; | ||
756 | goto keep_locked; | ||
757 | } | ||
758 | wait_on_page_writeback(page); | ||
726 | } | 759 | } |
727 | 760 | ||
728 | references = page_check_references(page, sc); | 761 | references = page_check_references(page, sc); |
@@ -921,6 +954,7 @@ keep: | |||
921 | 954 | ||
922 | list_splice(&ret_pages, page_list); | 955 | list_splice(&ret_pages, page_list); |
923 | count_vm_events(PGACTIVATE, pgactivate); | 956 | count_vm_events(PGACTIVATE, pgactivate); |
957 | mem_cgroup_uncharge_end(); | ||
924 | *ret_nr_dirty += nr_dirty; | 958 | *ret_nr_dirty += nr_dirty; |
925 | *ret_nr_writeback += nr_writeback; | 959 | *ret_nr_writeback += nr_writeback; |
926 | return nr_reclaimed; | 960 | return nr_reclaimed; |
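For illustration only: the two small hunks above wrap the shrink_page_list() loop in a memcg uncharge batch. The same idiom, sketched on a hypothetical release loop (example_* is invented; the mem_cgroup_* and list helpers are kernel API of this era):

	static void example_release_page_list(struct list_head *pages)
	{
		struct page *page, *next;

		mem_cgroup_uncharge_start();	/* start coalescing memcg uncharges */
		list_for_each_entry_safe(page, next, pages, lru) {
			list_del(&page->lru);
			put_page(page);		/* any path that may uncharge the page */
		}
		mem_cgroup_uncharge_end();	/* commit the batched uncharges once */
	}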
@@ -2112,6 +2146,83 @@ out: | |||
2112 | return 0; | 2146 | return 0; |
2113 | } | 2147 | } |
2114 | 2148 | ||
2149 | static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | ||
2150 | { | ||
2151 | struct zone *zone; | ||
2152 | unsigned long pfmemalloc_reserve = 0; | ||
2153 | unsigned long free_pages = 0; | ||
2154 | int i; | ||
2155 | bool wmark_ok; | ||
2156 | |||
2157 | for (i = 0; i <= ZONE_NORMAL; i++) { | ||
2158 | zone = &pgdat->node_zones[i]; | ||
2159 | pfmemalloc_reserve += min_wmark_pages(zone); | ||
2160 | free_pages += zone_page_state(zone, NR_FREE_PAGES); | ||
2161 | } | ||
2162 | |||
2163 | wmark_ok = free_pages > pfmemalloc_reserve / 2; | ||
2164 | |||
2165 | /* kswapd must be awake if processes are being throttled */ | ||
2166 | if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { | ||
2167 | pgdat->classzone_idx = min(pgdat->classzone_idx, | ||
2168 | (enum zone_type)ZONE_NORMAL); | ||
2169 | wake_up_interruptible(&pgdat->kswapd_wait); | ||
2170 | } | ||
2171 | |||
2172 | return wmark_ok; | ||
2173 | } | ||
2174 | |||
2175 | /* | ||
2176 | * Throttle direct reclaimers if backing storage is backed by the network | ||
2177 | * and the PFMEMALLOC reserve for the preferred node is getting dangerously | ||
2178 | * depleted. kswapd will continue to make progress and wake the processes | ||
2179 | * when the low watermark is reached | ||
2180 | */ | ||
2181 | static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | ||
2182 | nodemask_t *nodemask) | ||
2183 | { | ||
2184 | struct zone *zone; | ||
2185 | int high_zoneidx = gfp_zone(gfp_mask); | ||
2186 | pg_data_t *pgdat; | ||
2187 | |||
2188 | /* | ||
2189 | * Kernel threads should not be throttled as they may be indirectly | ||
2190 | * responsible for cleaning pages necessary for reclaim to make forward | ||
2191 | * progress. kjournald for example may enter direct reclaim while | ||
2192 | * committing a transaction where throttling it could force other | ||
2193 | * processes to block on log_wait_commit(). | ||
2194 | */ | ||
2195 | if (current->flags & PF_KTHREAD) | ||
2196 | return; | ||
2197 | |||
2198 | /* Check if the pfmemalloc reserves are ok */ | ||
2199 | first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); | ||
2200 | pgdat = zone->zone_pgdat; | ||
2201 | if (pfmemalloc_watermark_ok(pgdat)) | ||
2202 | return; | ||
2203 | |||
2204 | /* Account for the throttling */ | ||
2205 | count_vm_event(PGSCAN_DIRECT_THROTTLE); | ||
2206 | |||
2207 | /* | ||
2208 | * If the caller cannot enter the filesystem, it's possible that it | ||
2209 | * is due to the caller holding an FS lock or performing a journal | ||
2210 | * transaction in the case of a filesystem like ext[3|4]. In this case, | ||
2211 | * it is not safe to block on pfmemalloc_wait as kswapd could be | ||
2212 | * blocked waiting on the same lock. Instead, throttle for up to a | ||
2213 | * second before continuing. | ||
2214 | */ | ||
2215 | if (!(gfp_mask & __GFP_FS)) { | ||
2216 | wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, | ||
2217 | pfmemalloc_watermark_ok(pgdat), HZ); | ||
2218 | return; | ||
2219 | } | ||
2220 | |||
2221 | /* Throttle until kswapd wakes the process */ | ||
2222 | wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, | ||
2223 | pfmemalloc_watermark_ok(pgdat)); | ||
2224 | } | ||
2225 | |||
2115 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | 2226 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, |
2116 | gfp_t gfp_mask, nodemask_t *nodemask) | 2227 | gfp_t gfp_mask, nodemask_t *nodemask) |
2117 | { | 2228 | { |
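For illustration only: the arithmetic behind pfmemalloc_watermark_ok() above, as a small standalone program with made-up per-zone numbers. The node counts as OK while free pages exceed half of the summed min watermarks of the zones up to ZONE_NORMAL.

	#include <stdbool.h>
	#include <stdio.h>

	int main(void)
	{
		/* made-up values for { ZONE_DMA, ZONE_NORMAL } */
		unsigned long min_wmark[] = { 128, 1024 };
		unsigned long nr_free[]   = { 200,  400 };
		unsigned long reserve = 0, free_pages = 0;

		for (int i = 0; i < 2; i++) {
			reserve += min_wmark[i];
			free_pages += nr_free[i];
		}

		bool wmark_ok = free_pages > reserve / 2;	/* 600 > 576: no throttling */
		printf("reserve=%lu free=%lu wmark_ok=%d\n", reserve, free_pages, wmark_ok);
		return 0;
	}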
@@ -2131,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2131 | .gfp_mask = sc.gfp_mask, | 2242 | .gfp_mask = sc.gfp_mask, |
2132 | }; | 2243 | }; |
2133 | 2244 | ||
2245 | throttle_direct_reclaim(gfp_mask, zonelist, nodemask); | ||
2246 | |||
2247 | /* | ||
2248 | * Do not enter reclaim if fatal signal is pending. 1 is returned so | ||
2249 | * that the page allocator does not consider triggering OOM | ||
2250 | */ | ||
2251 | if (fatal_signal_pending(current)) | ||
2252 | return 1; | ||
2253 | |||
2134 | trace_mm_vmscan_direct_reclaim_begin(order, | 2254 | trace_mm_vmscan_direct_reclaim_begin(order, |
2135 | sc.may_writepage, | 2255 | sc.may_writepage, |
2136 | gfp_mask); | 2256 | gfp_mask); |
@@ -2142,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2142 | return nr_reclaimed; | 2262 | return nr_reclaimed; |
2143 | } | 2263 | } |
2144 | 2264 | ||
2145 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 2265 | #ifdef CONFIG_MEMCG |
2146 | 2266 | ||
2147 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | 2267 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, |
2148 | gfp_t gfp_mask, bool noswap, | 2268 | gfp_t gfp_mask, bool noswap, |
@@ -2275,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | |||
2275 | return balanced_pages >= (present_pages >> 2); | 2395 | return balanced_pages >= (present_pages >> 2); |
2276 | } | 2396 | } |
2277 | 2397 | ||
2278 | /* is kswapd sleeping prematurely? */ | 2398 | /* |
2279 | static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | 2399 | * Prepare kswapd for sleeping. This verifies that there are no processes |
2400 | * waiting in throttle_direct_reclaim() and that watermarks have been met. | ||
2401 | * | ||
2402 | * Returns true if kswapd is ready to sleep | ||
2403 | */ | ||
2404 | static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | ||
2280 | int classzone_idx) | 2405 | int classzone_idx) |
2281 | { | 2406 | { |
2282 | int i; | 2407 | int i; |
@@ -2285,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2285 | 2410 | ||
2286 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | 2411 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ |
2287 | if (remaining) | 2412 | if (remaining) |
2288 | return true; | 2413 | return false; |
2414 | |||
2415 | /* | ||
2416 | * There is a potential race between when kswapd checks its watermarks | ||
2417 | * and a process gets throttled. There is also a potential race if | ||
2418 | * processes get throttled, kswapd wakes, a large process exits thereby | ||
2419 | * balancing the zones that causes kswapd to miss a wakeup. If kswapd | ||
2420 | * is going to sleep, no process should be sleeping on pfmemalloc_wait | ||
2421 | * so wake them now if necessary. If necessary, processes will wake | ||
2422 | * kswapd and get throttled again | ||
2423 | */ | ||
2424 | if (waitqueue_active(&pgdat->pfmemalloc_wait)) { | ||
2425 | wake_up(&pgdat->pfmemalloc_wait); | ||
2426 | return false; | ||
2427 | } | ||
2289 | 2428 | ||
2290 | /* Check the watermark levels */ | 2429 | /* Check the watermark levels */ |
2291 | for (i = 0; i <= classzone_idx; i++) { | 2430 | for (i = 0; i <= classzone_idx; i++) { |
@@ -2318,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2318 | * must be balanced | 2457 | * must be balanced |
2319 | */ | 2458 | */ |
2320 | if (order) | 2459 | if (order) |
2321 | return !pgdat_balanced(pgdat, balanced, classzone_idx); | 2460 | return pgdat_balanced(pgdat, balanced, classzone_idx); |
2322 | else | 2461 | else |
2323 | return !all_zones_ok; | 2462 | return all_zones_ok; |
2324 | } | 2463 | } |
2325 | 2464 | ||
2326 | /* | 2465 | /* |
@@ -2546,6 +2685,16 @@ loop_again: | |||
2546 | } | 2685 | } |
2547 | 2686 | ||
2548 | } | 2687 | } |
2688 | |||
2689 | /* | ||
2690 | * If the low watermark is met there is no need for processes | ||
2691 | * to be throttled on pfmemalloc_wait as they should now be | ||
2692 | * able to safely make forward progress. Wake them | ||
2693 | */ | ||
2694 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && | ||
2695 | pfmemalloc_watermark_ok(pgdat)) | ||
2696 | wake_up(&pgdat->pfmemalloc_wait); | ||
2697 | |||
2549 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) | 2698 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) |
2550 | break; /* kswapd: all done */ | 2699 | break; /* kswapd: all done */ |
2551 | /* | 2700 | /* |
@@ -2647,7 +2796,7 @@ out: | |||
2647 | } | 2796 | } |
2648 | 2797 | ||
2649 | /* | 2798 | /* |
2650 | * Return the order we were reclaiming at so sleeping_prematurely() | 2799 | * Return the order we were reclaiming at so prepare_kswapd_sleep() |
2651 | * makes a decision on the order we were last reclaiming at. However, | 2800 | * makes a decision on the order we were last reclaiming at. However, |
2652 | * if another caller entered the allocator slow path while kswapd | 2801 | * if another caller entered the allocator slow path while kswapd |
2653 | * was awake, order will remain at the higher level | 2802 | * was awake, order will remain at the higher level |
@@ -2667,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2667 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2816 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
2668 | 2817 | ||
2669 | /* Try to sleep for a short interval */ | 2818 | /* Try to sleep for a short interval */ |
2670 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | 2819 | if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { |
2671 | remaining = schedule_timeout(HZ/10); | 2820 | remaining = schedule_timeout(HZ/10); |
2672 | finish_wait(&pgdat->kswapd_wait, &wait); | 2821 | finish_wait(&pgdat->kswapd_wait, &wait); |
2673 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2822 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
@@ -2677,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2677 | * After a short sleep, check if it was a premature sleep. If not, then | 2826 | * After a short sleep, check if it was a premature sleep. If not, then |
2678 | * go fully to sleep until explicitly woken up. | 2827 | * go fully to sleep until explicitly woken up. |
2679 | */ | 2828 | */ |
2680 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | 2829 | if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { |
2681 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | 2830 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); |
2682 | 2831 | ||
2683 | /* | 2832 | /* |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 1bbbbd9776ad..df7a6748231d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -745,6 +745,7 @@ const char * const vmstat_text[] = { | |||
745 | TEXTS_FOR_ZONES("pgsteal_direct") | 745 | TEXTS_FOR_ZONES("pgsteal_direct") |
746 | TEXTS_FOR_ZONES("pgscan_kswapd") | 746 | TEXTS_FOR_ZONES("pgscan_kswapd") |
747 | TEXTS_FOR_ZONES("pgscan_direct") | 747 | TEXTS_FOR_ZONES("pgscan_direct") |
748 | "pgscan_direct_throttle", | ||
748 | 749 | ||
749 | #ifdef CONFIG_NUMA | 750 | #ifdef CONFIG_NUMA |
750 | "zone_reclaim_failed", | 751 | "zone_reclaim_failed", |