path: root/mm
author		Linus Torvalds <torvalds@linux-foundation.org>	2012-07-31 22:25:39 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-07-31 22:25:39 -0400
commit		ac694dbdbc403c00e2c14d10bc7b8412cc378259 (patch)
tree		e37328cfbeaf43716dd5914cad9179e57e84df76 /mm
parent		a40a1d3d0a2fd613fdec6d89d3c053268ced76ed (diff)
parent		437ea90cc3afdca5229b41c6b1d38c4842756cb9 (diff)
Merge branch 'akpm' (Andrew's patch-bomb)
Merge Andrew's second set of patches:
 - MM
 - a few random fixes
 - a couple of RTC leftovers

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (120 commits)
  rtc/rtc-88pm80x: remove unneed devm_kfree
  rtc/rtc-88pm80x: assign ret only when rtc_register_driver fails
  mm: hugetlbfs: close race during teardown of hugetlbfs shared page tables
  tmpfs: distribute interleave better across nodes
  mm: remove redundant initialization
  mm: warn if pg_data_t isn't initialized with zero
  mips: zero out pg_data_t when it's allocated
  memcg: gix memory accounting scalability in shrink_page_list
  mm/sparse: remove index_init_lock
  mm/sparse: more checks on mem_section number
  mm/sparse: optimize sparse_index_alloc
  memcg: add mem_cgroup_from_css() helper
  memcg: further prevent OOM with too many dirty pages
  memcg: prevent OOM with too many dirty pages
  mm: mmu_notifier: fix freed page still mapped in secondary MMU
  mm: memcg: only check anon swapin page charges for swap cache
  mm: memcg: only check swap cache pages for repeated charging
  mm: memcg: split swapin charge function into private and public part
  mm: memcg: remove needless !mm fixup to init_mm when charging
  mm: memcg: remove unneeded shmem charge type
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   5
-rw-r--r--  mm/Makefile           |   8
-rw-r--r--  mm/backing-dev.c      |  20
-rw-r--r--  mm/compaction.c       |  63
-rw-r--r--  mm/fadvise.c          |  18
-rw-r--r--  mm/highmem.c          |  12
-rw-r--r--  mm/hugetlb.c          | 195
-rw-r--r--  mm/hugetlb_cgroup.c   | 418
-rw-r--r--  mm/hwpoison-inject.c  |   2
-rw-r--r--  mm/internal.h         |   8
-rw-r--r--  mm/memblock.c         |  35
-rw-r--r--  mm/memcontrol.c       | 390
-rw-r--r--  mm/memory-failure.c   |  17
-rw-r--r--  mm/memory.c           |   9
-rw-r--r--  mm/memory_hotplug.c   |  20
-rw-r--r--  mm/migrate.c          |  81
-rw-r--r--  mm/mmap.c             |   5
-rw-r--r--  mm/mmu_notifier.c     |  45
-rw-r--r--  mm/mmzone.c           |   2
-rw-r--r--  mm/mremap.c           |   2
-rw-r--r--  mm/oom_kill.c         | 223
-rw-r--r--  mm/page_alloc.c       | 318
-rw-r--r--  mm/page_cgroup.c      |   2
-rw-r--r--  mm/page_io.c          | 145
-rw-r--r--  mm/page_isolation.c   |  93
-rw-r--r--  mm/shmem.c            |   6
-rw-r--r--  mm/slab.c             | 216
-rw-r--r--  mm/slub.c             |  30
-rw-r--r--  mm/sparse.c           |  29
-rw-r--r--  mm/swap.c             |  52
-rw-r--r--  mm/swap_state.c       |   7
-rw-r--r--  mm/swapfile.c         | 145
-rw-r--r--  mm/vmalloc.c          |  16
-rw-r--r--  mm/vmscan.c           | 175
-rw-r--r--  mm/vmstat.c           |   1
35 files changed, 2043 insertions(+), 770 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 82fed4eb2b6f..d5c8019c6627 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK
 config NO_BOOTMEM
 	boolean
 
+config MEMORY_ISOLATION
+	boolean
+
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
+	select MEMORY_ISOLATION
 	depends on SPARSEMEM || X86_64_ACPI_NUMA
 	depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
 	depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -272,6 +276,7 @@ config MEMORY_FAILURE
 	depends on MMU
 	depends on ARCH_SUPPORTS_MEMORY_FAILURE
 	bool "Enable recovery from hardware memory errors"
+	select MEMORY_ISOLATION
 	help
 	  Enables code to recover from some memory failures on systems
 	  with MCA recovery. This allows a system to continue running
diff --git a/mm/Makefile b/mm/Makefile
index 8e81fe263c94..92753e2d82da 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,8 +15,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-			   page_isolation.o mm_init.o mmu_context.o percpu.o \
-			   compaction.o slab_common.o $(mmu-y)
+			   mm_init.o mmu_context.o percpu.o slab_common.o \
+			   compaction.o $(mmu-y)
 
 obj-y += init-mm.o
 
@@ -49,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3387aea11209..6b4718e2ee34 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -886,3 +886,23 @@ out:
 	return ret;
 }
 EXPORT_SYMBOL(wait_iff_congested);
+
+int pdflush_proc_obsolete(struct ctl_table *table, int write,
+			void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	char kbuf[] = "0\n";
+
+	if (*ppos) {
+		*lenp = 0;
+		return 0;
+	}
+
+	if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
+		return -EFAULT;
+	printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
+			table->procname);
+
+	*lenp = 2;
+	*ppos += *lenp;
+	return 2;
+}
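The new pdflush_proc_obsolete() handler keeps the retired pdflush sysctls readable (they always report "0") while warning once that they will be removed. A minimal user-space check of that behaviour might look like the sketch below; it assumes the historical /proc/sys/vm/nr_pdflush_threads knob is the one wired to this handler, which is an assumption for illustration and not part of the patch.

/* Illustrative only: read a legacy pdflush knob kept alive by the stub above. */
#include <stdio.h>

int main(void)
{
	char buf[16];
	FILE *f = fopen("/proc/sys/vm/nr_pdflush_threads", "r");	/* assumed path */

	if (!f) {
		perror("fopen");	/* the knob may already be gone on newer kernels */
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("nr_pdflush_threads reads as: %s", buf);	/* expected: "0" */
	fclose(f);
	return 0;
}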
diff --git a/mm/compaction.c b/mm/compaction.c
index 2f42d9528539..e78cb9688421 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -422,6 +422,17 @@ static void isolate_freepages(struct zone *zone,
 			pfn -= pageblock_nr_pages) {
 		unsigned long isolated;
 
+		/*
+		 * Skip ahead if another thread is compacting in the area
+		 * simultaneously. If we wrapped around, we can only skip
+		 * ahead if zone->compact_cached_free_pfn also wrapped to
+		 * above our starting point.
+		 */
+		if (cc->order > 0 && (!cc->wrapped ||
+				      zone->compact_cached_free_pfn >
+				      cc->start_free_pfn))
+			pfn = min(pfn, zone->compact_cached_free_pfn);
+
 		if (!pfn_valid(pfn))
 			continue;
 
@@ -461,8 +472,11 @@ static void isolate_freepages(struct zone *zone,
 		 * looking for free pages, the search will restart here as
 		 * page migration may have returned some pages to the allocator
 		 */
-		if (isolated)
+		if (isolated) {
 			high_pfn = max(high_pfn, pfn);
+			if (cc->order > 0)
+				zone->compact_cached_free_pfn = high_pfn;
+		}
 	}
 
 	/* split_free_page does not map the pages */
@@ -556,6 +570,20 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	return ISOLATE_SUCCESS;
 }
 
+/*
+ * Returns the start pfn of the last page block in a zone. This is the starting
+ * point for full compaction of a zone. Compaction searches for free pages from
+ * the end of each zone, while isolate_freepages_block scans forward inside each
+ * page block.
+ */
+static unsigned long start_free_pfn(struct zone *zone)
+{
+	unsigned long free_pfn;
+	free_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	free_pfn &= ~(pageblock_nr_pages-1);
+	return free_pfn;
+}
+
 static int compact_finished(struct zone *zone,
 			    struct compact_control *cc)
 {
@@ -565,8 +593,26 @@ static int compact_finished(struct zone *zone,
 	if (fatal_signal_pending(current))
 		return COMPACT_PARTIAL;
 
-	/* Compaction run completes if the migrate and free scanner meet */
-	if (cc->free_pfn <= cc->migrate_pfn)
+	/*
+	 * A full (order == -1) compaction run starts at the beginning and
+	 * end of a zone; it completes when the migrate and free scanner meet.
+	 * A partial (order > 0) compaction can start with the free scanner
+	 * at a random point in the zone, and may have to restart.
+	 */
+	if (cc->free_pfn <= cc->migrate_pfn) {
+		if (cc->order > 0 && !cc->wrapped) {
+			/* We started partway through; restart at the end. */
+			unsigned long free_pfn = start_free_pfn(zone);
+			zone->compact_cached_free_pfn = free_pfn;
+			cc->free_pfn = free_pfn;
+			cc->wrapped = 1;
+			return COMPACT_CONTINUE;
+		}
+		return COMPACT_COMPLETE;
+	}
+
+	/* We wrapped around and ended up where we started. */
+	if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
 		return COMPACT_COMPLETE;
 
 	/*
@@ -664,8 +710,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 	/* Setup to move all movable pages to the end of the zone */
 	cc->migrate_pfn = zone->zone_start_pfn;
-	cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
-	cc->free_pfn &= ~(pageblock_nr_pages-1);
+
+	if (cc->order > 0) {
+		/* Incremental compaction. Start where the last one stopped. */
+		cc->free_pfn = zone->compact_cached_free_pfn;
+		cc->start_free_pfn = cc->free_pfn;
+	} else {
+		/* Order == -1 starts at the end of the zone. */
+		cc->free_pfn = start_free_pfn(zone);
+	}
 
 	migrate_prep_local();
 
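The compaction hunks above cache where the free-page scanner stopped (zone->compact_cached_free_pfn), resume partial compactions from that point, and wrap around once before declaring the run complete. The following stand-alone C sketch models only that bookkeeping; the toy_zone and toy_cc types are invented stand-ins for the kernel structures, and pageblock rounding is omitted, so this is an illustration under those assumptions rather than kernel code.

/* User-space model of the wrap-around bookkeeping added to compaction. */
#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
	unsigned long start_pfn;
	unsigned long spanned;
	unsigned long compact_cached_free_pfn;	/* where the last run stopped */
};

struct toy_cc {
	unsigned long free_pfn;
	unsigned long start_free_pfn;
	unsigned long migrate_pfn;
	int order;
	bool wrapped;
};

static unsigned long toy_start_free_pfn(struct toy_zone *z)
{
	return z->start_pfn + z->spanned;	/* pageblock rounding omitted */
}

/* Mirrors the compact_finished() logic: wrap once, then stop where we started. */
static bool scanners_done(struct toy_zone *z, struct toy_cc *cc)
{
	if (cc->free_pfn <= cc->migrate_pfn) {
		if (cc->order > 0 && !cc->wrapped) {
			cc->free_pfn = toy_start_free_pfn(z);
			z->compact_cached_free_pfn = cc->free_pfn;
			cc->wrapped = true;
			return false;		/* keep scanning from the top of the zone */
		}
		return true;
	}
	return cc->wrapped && cc->free_pfn <= cc->start_free_pfn;
}

int main(void)
{
	struct toy_zone z = { 0, 1024, 512 };
	struct toy_cc cc = { .order = 3, .migrate_pfn = 0 };

	cc.free_pfn = z.compact_cached_free_pfn;	/* resume mid-zone, like cc->order > 0 */
	cc.start_free_pfn = cc.free_pfn;

	cc.free_pfn = 0;				/* pretend the scanners met */
	printf("done after first pass? %d\n", scanners_done(&z, &cc));	/* 0: we wrapped */
	cc.free_pfn = cc.start_free_pfn;		/* pretend we came back to the start */
	printf("done after wrapping?   %d\n", scanners_done(&z, &cc));	/* 1 */
	return 0;
}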
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 469491e0af79..9b75a045dbf4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_WILLNEED:
-		if (!mapping->a_ops->readpage) {
-			ret = -EINVAL;
-			break;
-		}
-
 		/* First and last PARTIAL page! */
 		start_index = offset >> PAGE_CACHE_SHIFT;
 		end_index = endbyte >> PAGE_CACHE_SHIFT;
@@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 		nrpages = end_index - start_index + 1;
 		if (!nrpages)
 			nrpages = ~0UL;
 
-		ret = force_page_cache_readahead(mapping, file,
-						start_index,
-						nrpages);
-		if (ret > 0)
-			ret = 0;
+		/*
+		 * Ignore return value because fadvise() shall return
+		 * success even if filesystem can't retrieve a hint,
+		 */
+		force_page_cache_readahead(mapping, file, start_index,
+					   nrpages);
 		break;
 	case POSIX_FADV_NOREUSE:
 		break;
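The fadvise change above makes POSIX_FADV_WILLNEED report success even when the filesystem cannot act on the hint. A minimal user-space caller of the affected syscall, included here purely as an illustration of the interface and not as part of the patch, is:

/* Illustrative caller of the fadvise path changed above. */
#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	int fd, ret;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/*
	 * With this patch the kernel no longer fails the hint just because the
	 * backing filesystem cannot do readahead; 0 is the expected result.
	 */
	ret = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
	printf("posix_fadvise(WILLNEED) -> %d\n", ret);
	return ret ? 1 : 0;
}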
diff --git a/mm/highmem.c b/mm/highmem.c
index 57d82c6250c3..d517cd16a6eb 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
 	do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
 #endif
 
+struct page *kmap_to_page(void *vaddr)
+{
+	unsigned long addr = (unsigned long)vaddr;
+
+	if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
+		int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
+		return pte_page(pkmap_page_table[i]);
+	}
+
+	return virt_to_page(addr);
+}
+
 static void flush_all_zero_pkmaps(void)
 {
 	int i;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e198831276a3..bc727122dd44 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,17 +24,20 @@
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
-#include <linux/io.h>
+#include <asm/tlb.h>
 
+#include <linux/io.h>
 #include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
+#include <linux/hugetlb_cgroup.h>
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
 
-static int max_hstate;
+int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
 
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate;
 static unsigned long __initdata default_hstate_max_huge_pages;
 static unsigned long __initdata default_hstate_size;
 
-#define for_each_hstate(h) \
-	for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
-
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
  */
-static DEFINE_SPINLOCK(hugetlb_lock);
+DEFINE_SPINLOCK(hugetlb_lock);
 
 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 {
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src)
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
 	int nid = page_to_nid(page);
-	list_add(&page->lru, &h->hugepage_freelists[nid]);
+	list_move(&page->lru, &h->hugepage_freelists[nid]);
 	h->free_huge_pages++;
 	h->free_huge_pages_node[nid]++;
 }
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
 	if (list_empty(&h->hugepage_freelists[nid]))
 		return NULL;
 	page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
-	list_del(&page->lru);
+	list_move(&page->lru, &h->hugepage_activelist);
 	set_page_refcounted(page);
 	h->free_huge_pages--;
 	h->free_huge_pages_node[nid]--;
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 				1 << PG_active | 1 << PG_reserved |
 				1 << PG_private | 1 << PG_writeback);
 	}
+	VM_BUG_ON(hugetlb_cgroup_from_page(page));
 	set_compound_page_dtor(page, NULL);
 	set_page_refcounted(page);
 	arch_release_hugepage(page);
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page)
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
 	BUG_ON(page_mapcount(page));
-	INIT_LIST_HEAD(&page->lru);
 
 	spin_lock(&hugetlb_lock);
+	hugetlb_cgroup_uncharge_page(hstate_index(h),
+				     pages_per_huge_page(h), page);
 	if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+		/* remove the page from active list */
+		list_del(&page->lru);
 		update_and_free_page(h, page);
 		h->surplus_huge_pages--;
 		h->surplus_huge_pages_node[nid]--;
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page)
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
+	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, free_huge_page);
 	spin_lock(&hugetlb_lock);
+	set_hugetlb_cgroup(page, NULL);
 	h->nr_huge_pages++;
 	h->nr_huge_pages_node[nid]++;
 	spin_unlock(&hugetlb_lock);
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 
 	spin_lock(&hugetlb_lock);
 	if (page) {
+		INIT_LIST_HEAD(&page->lru);
 		r_nid = page_to_nid(page);
 		set_compound_page_dtor(page, free_huge_page);
+		set_hugetlb_cgroup(page, NULL);
 		/*
 		 * We incremented the global counters already
 		 */
@@ -993,7 +1001,6 @@ retry:
 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 		if ((--needed) < 0)
 			break;
-		list_del(&page->lru);
 		/*
 		 * This page is now managed by the hugetlb allocator and has
 		 * no users -- drop the buddy allocator's reference.
@@ -1008,7 +1015,6 @@ free:
 	/* Free unnecessary surplus pages to the buddy allocator */
 	if (!list_empty(&surplus_list)) {
 		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
-			list_del(&page->lru);
 			put_page(page);
 		}
 	}
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	struct hstate *h = hstate_vma(vma);
 	struct page *page;
 	long chg;
+	int ret, idx;
+	struct hugetlb_cgroup *h_cg;
 
+	idx = hstate_index(h);
 	/*
 	 * Processes that did not create the mapping will have no
 	 * reserves and will not have accounted against subpool
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 */
 	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
-		return ERR_PTR(-VM_FAULT_OOM);
+		return ERR_PTR(-ENOMEM);
 	if (chg)
 		if (hugepage_subpool_get_pages(spool, chg))
-			return ERR_PTR(-VM_FAULT_SIGBUS);
+			return ERR_PTR(-ENOSPC);
 
+	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+	if (ret) {
+		hugepage_subpool_put_pages(spool, chg);
+		return ERR_PTR(-ENOSPC);
+	}
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
-	spin_unlock(&hugetlb_lock);
-
-	if (!page) {
+	if (page) {
+		/* update page cgroup details */
+		hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+					     h_cg, page);
+		spin_unlock(&hugetlb_lock);
+	} else {
+		spin_unlock(&hugetlb_lock);
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
+			hugetlb_cgroup_uncharge_cgroup(idx,
+						       pages_per_huge_page(h),
+						       h_cg);
 			hugepage_subpool_put_pages(spool, chg);
-			return ERR_PTR(-VM_FAULT_SIGBUS);
+			return ERR_PTR(-ENOSPC);
 		}
+		spin_lock(&hugetlb_lock);
+		hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+					     h_cg, page);
+		list_move(&page->lru, &h->hugepage_activelist);
+		spin_unlock(&hugetlb_lock);
 	}
 
 	set_page_private(page, (unsigned long)spool);
 
 	vma_commit_reservation(h, vma, addr);
-
 	return page;
 }
 
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
 				    struct attribute_group *hstate_attr_group)
 {
 	int retval;
-	int hi = h - hstates;
+	int hi = hstate_index(h);
 
 	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
 	if (!hstate_kobjs[hi])
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node)
 	if (!nhs->hugepages_kobj)
 		return;		/* no hstate attributes */
 
-	for_each_hstate(h)
-		if (nhs->hstate_kobjs[h - hstates]) {
-			kobject_put(nhs->hstate_kobjs[h - hstates]);
-			nhs->hstate_kobjs[h - hstates] = NULL;
+	for_each_hstate(h) {
+		int idx = hstate_index(h);
+		if (nhs->hstate_kobjs[idx]) {
+			kobject_put(nhs->hstate_kobjs[idx]);
+			nhs->hstate_kobjs[idx] = NULL;
 		}
+	}
 
 	kobject_put(nhs->hugepages_kobj);
 	nhs->hugepages_kobj = NULL;
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void)
 	hugetlb_unregister_all_nodes();
 
 	for_each_hstate(h) {
-		kobject_put(hstate_kobjs[h - hstates]);
+		kobject_put(hstate_kobjs[hstate_index(h)]);
 	}
 
 	kobject_put(hugepages_kobj);
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void)
 		if (!size_to_hstate(default_hstate_size))
 			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
 	}
-	default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
+	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
 	if (default_hstate_max_huge_pages)
 		default_hstate.max_huge_pages = default_hstate_max_huge_pages;
 
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order)
 		printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
 		return;
 	}
-	BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+	BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
 	BUG_ON(order == 0);
-	h = &hstates[max_hstate++];
+	h = &hstates[hugetlb_max_hstate++];
 	h->order = order;
 	h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
 	h->nr_huge_pages = 0;
 	h->free_huge_pages = 0;
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+	INIT_LIST_HEAD(&h->hugepage_activelist);
 	h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
 	h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
+	/*
+	 * Add cgroup control files only if the huge page consists
+	 * of more than two normal pages. This is because we use
+	 * page[2].lru.next for storing cgoup details.
+	 */
+	if (order >= HUGETLB_CGROUP_MIN_ORDER)
+		hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
 
 	parsed_hstate = h;
 }
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s)
 	static unsigned long *last_mhp;
 
 	/*
-	 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
+	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
 	 * so this hugepages= parameter goes to the "default hstate".
 	 */
-	if (!max_hstate)
+	if (!hugetlb_max_hstate)
 		mhp = &default_hstate_max_huge_pages;
 	else
 		mhp = &parsed_hstate->max_huge_pages;
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s)
 	 * But we need to allocate >= MAX_ORDER hstates here early to still
 	 * use the bootmem allocator.
 	 */
-	if (max_hstate && parsed_hstate->order >= MAX_ORDER)
+	if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
 		hugetlb_hstate_alloc_pages(parsed_hstate);
 
 	last_mhp = mhp;
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 	return 0;
 }
 
-void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-			    unsigned long end, struct page *ref_page)
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+			    unsigned long start, unsigned long end,
+			    struct page *ref_page)
 {
+	int force_flush = 0;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long address;
 	pte_t *ptep;
 	pte_t pte;
 	struct page *page;
-	struct page *tmp;
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
 
-	/*
-	 * A page gathering list, protected by per file i_mmap_mutex. The
-	 * lock is used to avoid list corruption from multiple unmapping
-	 * of the same page since we are using page->lru.
-	 */
-	LIST_HEAD(page_list);
-
 	WARN_ON(!is_vm_hugetlb_page(vma));
 	BUG_ON(start & ~huge_page_mask(h));
 	BUG_ON(end & ~huge_page_mask(h));
 
+	tlb_start_vma(tlb, vma);
 	mmu_notifier_invalidate_range_start(mm, start, end);
+again:
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += sz) {
 		ptep = huge_pte_offset(mm, address);
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 		}
 
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
+		tlb_remove_tlb_entry(tlb, ptep, address);
 		if (pte_dirty(pte))
 			set_page_dirty(page);
-		list_add(&page->lru, &page_list);
 
+		page_remove_rmap(page);
+		force_flush = !__tlb_remove_page(tlb, page);
+		if (force_flush)
+			break;
 		/* Bail out after unmapping reference page if supplied */
 		if (ref_page)
 			break;
 	}
-	flush_tlb_range(vma, start, end);
 	spin_unlock(&mm->page_table_lock);
-	mmu_notifier_invalidate_range_end(mm, start, end);
-	list_for_each_entry_safe(page, tmp, &page_list, lru) {
-		page_remove_rmap(page);
-		list_del(&page->lru);
-		put_page(page);
+	/*
+	 * mmu_gather ran out of room to batch pages, we break out of
+	 * the PTE lock to avoid doing the potential expensive TLB invalidate
+	 * and page-free while holding it.
+	 */
+	if (force_flush) {
+		force_flush = 0;
+		tlb_flush_mmu(tlb);
+		if (address < end && !ref_page)
+			goto again;
 	}
+	mmu_notifier_invalidate_range_end(mm, start, end);
+	tlb_end_vma(tlb, vma);
+}
+
+void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+			  struct vm_area_struct *vma, unsigned long start,
+			  unsigned long end, struct page *ref_page)
+{
+	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
+
+	/*
+	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
+	 * test will fail on a vma being torn down, and not grab a page table
+	 * on its way out. We're lucky that the flag has such an appropriate
+	 * name, and can in fact be safely cleared here. We could clear it
+	 * before the __unmap_hugepage_range above, but all that's necessary
+	 * is to clear it before releasing the i_mmap_mutex. This works
+	 * because in the context this is called, the VMA is about to be
+	 * destroyed and the i_mmap_mutex is held.
+	 */
+	vma->vm_flags &= ~VM_MAYSHARE;
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			  unsigned long end, struct page *ref_page)
 {
-	mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
-	__unmap_hugepage_range(vma, start, end, ref_page);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+	struct mm_struct *mm;
+	struct mmu_gather tlb;
+
+	mm = vma->vm_mm;
+
+	tlb_gather_mmu(&tlb, mm, 0);
+	__unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+	tlb_finish_mmu(&tlb, start, end);
 }
 
 /*
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * from the time of fork. This would look like data corruption
 		 */
 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
-			__unmap_hugepage_range(iter_vma,
-				address, address + huge_page_size(h),
-				page);
+			unmap_hugepage_range(iter_vma, address,
+					     address + huge_page_size(h), page);
 	}
 	mutex_unlock(&mapping->i_mmap_mutex);
 
@@ -2496,6 +2560,7 @@ retry_avoidcopy:
 	new_page = alloc_huge_page(vma, address, outside_reserve);
 
 	if (IS_ERR(new_page)) {
+		long err = PTR_ERR(new_page);
 		page_cache_release(old_page);
 
 		/*
@@ -2524,7 +2589,10 @@ retry_avoidcopy:
 
 		/* Caller expects lock to be held */
 		spin_lock(&mm->page_table_lock);
-		return -PTR_ERR(new_page);
+		if (err == -ENOMEM)
+			return VM_FAULT_OOM;
+		else
+			return VM_FAULT_SIGBUS;
 	}
 
 	/*
@@ -2642,7 +2710,11 @@ retry:
 			goto out;
 		page = alloc_huge_page(vma, address, 0);
 		if (IS_ERR(page)) {
-			ret = -PTR_ERR(page);
+			ret = PTR_ERR(page);
+			if (ret == -ENOMEM)
+				ret = VM_FAULT_OOM;
+			else
+				ret = VM_FAULT_SIGBUS;
 			goto out;
 		}
 		clear_huge_page(page, address, pages_per_huge_page(h));
@@ -2679,7 +2751,7 @@ retry:
 		 */
 		if (unlikely(PageHWPoison(page))) {
 			ret = VM_FAULT_HWPOISON |
-				VM_FAULT_SET_HINDEX(h - hstates);
+				VM_FAULT_SET_HINDEX(hstate_index(h));
 			goto backout_unlocked;
 		}
 	}
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		return 0;
 	} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 		return VM_FAULT_HWPOISON_LARGE |
-			VM_FAULT_SET_HINDEX(hstate_index(h));
 	}
 
 	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
-	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+	/*
+	 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+	 * may have cleared our pud entry and done put_page on the page table:
+	 * once we release i_mmap_mutex, another task can do the final put_page
+	 * and that page table be reused and filled with junk.
+	 */
 	flush_tlb_range(vma, start, end);
+	mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
 }
 
 int hugetlb_reserve_pages(struct inode *inode,
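One thread running through the hugetlb hunks above is the error-passing change: alloc_huge_page() now returns ERR_PTR(-ENOMEM) or ERR_PTR(-ENOSPC), and the fault paths translate those errnos into VM_FAULT_OOM or VM_FAULT_SIGBUS. The following stand-alone C sketch models only that convention; ERR_PTR, PTR_ERR, IS_ERR and the FAULT_* values are re-implemented here purely for illustration and are not the kernel definitions.

/* User-space model of the ERR_PTR error-code convention used above. */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095
#define FAULT_OOM	1	/* illustrative stand-in for VM_FAULT_OOM */
#define FAULT_SIGBUS	2	/* illustrative stand-in for VM_FAULT_SIGBUS */

static void *ERR_PTR(long error)      { return (void *)error; }
static long  PTR_ERR(const void *ptr) { return (long)ptr; }
static int   IS_ERR(const void *ptr)  { return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO; }

/* Plays the role of alloc_huge_page(): report *why* allocation failed. */
static void *toy_alloc(int out_of_memory, int over_quota)
{
	static char page[4096];

	if (out_of_memory)
		return ERR_PTR(-ENOMEM);
	if (over_quota)
		return ERR_PTR(-ENOSPC);	/* reservation or cgroup limit hit */
	return page;
}

/* Plays the role of the fault handler: map the errno to a fault result. */
static int toy_fault(int out_of_memory, int over_quota)
{
	void *page = toy_alloc(out_of_memory, over_quota);

	if (IS_ERR(page))
		return PTR_ERR(page) == -ENOMEM ? FAULT_OOM : FAULT_SIGBUS;
	return 0;
}

int main(void)
{
	printf("success   -> %d\n", toy_fault(0, 0));	/* 0 */
	printf("no memory -> %d\n", toy_fault(1, 0));	/* FAULT_OOM */
	printf("no quota  -> %d\n", toy_fault(0, 1));	/* FAULT_SIGBUS */
	return 0;
}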
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
new file mode 100644
index 000000000000..a3f358fb8a0c
--- /dev/null
+++ b/mm/hugetlb_cgroup.c
@@ -0,0 +1,418 @@
+/*
+ *
+ * Copyright IBM Corporation, 2012
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
+
+struct hugetlb_cgroup {
+	struct cgroup_subsys_state css;
+	/*
+	 * the counter to account for hugepages from hugetlb.
+	 */
+	struct res_counter hugepage[HUGE_MAX_HSTATE];
+};
+
+#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
+#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val)	((val) & 0xffff)
+
+struct cgroup_subsys hugetlb_subsys __read_mostly;
+static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
+{
+	return container_of(s, struct hugetlb_cgroup, css);
+}
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup)
+{
+	return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup,
+							   hugetlb_subsys_id));
+}
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
+{
+	return hugetlb_cgroup_from_css(task_subsys_state(task,
+							 hugetlb_subsys_id));
+}
+
+static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
+{
+	return (h_cg == root_h_cgroup);
+}
+
+static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg)
+{
+	if (!cg->parent)
+		return NULL;
+	return hugetlb_cgroup_from_cgroup(cg->parent);
+}
+
+static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
+{
+	int idx;
+	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg);
+
+	for (idx = 0; idx < hugetlb_max_hstate; idx++) {
+		if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
+			return true;
+	}
+	return false;
+}
+
+static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
+{
+	int idx;
+	struct cgroup *parent_cgroup;
+	struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup;
+
+	h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
+	if (!h_cgroup)
+		return ERR_PTR(-ENOMEM);
+
+	parent_cgroup = cgroup->parent;
+	if (parent_cgroup) {
+		parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup);
+		for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
+			res_counter_init(&h_cgroup->hugepage[idx],
+					 &parent_h_cgroup->hugepage[idx]);
+	} else {
+		root_h_cgroup = h_cgroup;
+		for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
+			res_counter_init(&h_cgroup->hugepage[idx], NULL);
+	}
+	return &h_cgroup->css;
+}
+
+static void hugetlb_cgroup_destroy(struct cgroup *cgroup)
+{
+	struct hugetlb_cgroup *h_cgroup;
+
+	h_cgroup = hugetlb_cgroup_from_cgroup(cgroup);
+	kfree(h_cgroup);
+}
+
+
+/*
+ * Should be called with hugetlb_lock held.
+ * Since we are holding hugetlb_lock, pages cannot get moved from
+ * active list or uncharged from the cgroup, So no need to get
+ * page reference and test for page active here. This function
+ * cannot fail.
+ */
+static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup,
+				       struct page *page)
+{
+	int csize;
+	struct res_counter *counter;
+	struct res_counter *fail_res;
+	struct hugetlb_cgroup *page_hcg;
+	struct hugetlb_cgroup *h_cg   = hugetlb_cgroup_from_cgroup(cgroup);
+	struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup);
+
+	page_hcg = hugetlb_cgroup_from_page(page);
+	/*
+	 * We can have pages in active list without any cgroup
+	 * ie, hugepage with less than 3 pages. We can safely
+	 * ignore those pages.
+	 */
+	if (!page_hcg || page_hcg != h_cg)
+		goto out;
+
+	csize = PAGE_SIZE << compound_order(page);
+	if (!parent) {
+		parent = root_h_cgroup;
+		/* root has no limit */
+		res_counter_charge_nofail(&parent->hugepage[idx],
+					  csize, &fail_res);
+	}
+	counter = &h_cg->hugepage[idx];
+	res_counter_uncharge_until(counter, counter->parent, csize);
+
+	set_hugetlb_cgroup(page, parent);
+out:
+	return;
+}
+
+/*
+ * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
+ * the parent cgroup.
+ */
+static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
+{
+	struct hstate *h;
+	struct page *page;
+	int ret = 0, idx = 0;
+
+	do {
+		if (cgroup_task_count(cgroup) ||
+		    !list_empty(&cgroup->children)) {
+			ret = -EBUSY;
+			goto out;
+		}
+		for_each_hstate(h) {
+			spin_lock(&hugetlb_lock);
+			list_for_each_entry(page, &h->hugepage_activelist, lru)
+				hugetlb_cgroup_move_parent(idx, cgroup, page);
+
+			spin_unlock(&hugetlb_lock);
+			idx++;
+		}
+		cond_resched();
+	} while (hugetlb_cgroup_have_usage(cgroup));
+out:
+	return ret;
+}
+
+int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
+				 struct hugetlb_cgroup **ptr)
+{
+	int ret = 0;
+	struct res_counter *fail_res;
+	struct hugetlb_cgroup *h_cg = NULL;
+	unsigned long csize = nr_pages * PAGE_SIZE;
+
+	if (hugetlb_cgroup_disabled())
+		goto done;
+	/*
+	 * We don't charge any cgroup if the compound page have less
+	 * than 3 pages.
+	 */
+	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
+		goto done;
+again:
+	rcu_read_lock();
+	h_cg = hugetlb_cgroup_from_task(current);
+	if (!css_tryget(&h_cg->css)) {
+		rcu_read_unlock();
+		goto again;
+	}
+	rcu_read_unlock();
+
+	ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res);
+	css_put(&h_cg->css);
+done:
+	*ptr = h_cg;
+	return ret;
+}
+
+/* Should be called with hugetlb_lock held */
+void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
+				  struct hugetlb_cgroup *h_cg,
+				  struct page *page)
+{
+	if (hugetlb_cgroup_disabled() || !h_cg)
+		return;
+
+	set_hugetlb_cgroup(page, h_cg);
+	return;
+}
+
+/*
+ * Should be called with hugetlb_lock held
+ */
+void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
+				  struct page *page)
+{
+	struct hugetlb_cgroup *h_cg;
+	unsigned long csize = nr_pages * PAGE_SIZE;
+
+	if (hugetlb_cgroup_disabled())
+		return;
+	VM_BUG_ON(!spin_is_locked(&hugetlb_lock));
+	h_cg = hugetlb_cgroup_from_page(page);
+	if (unlikely(!h_cg))
+		return;
+	set_hugetlb_cgroup(page, NULL);
+	res_counter_uncharge(&h_cg->hugepage[idx], csize);
+	return;
+}
+
+void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
+				    struct hugetlb_cgroup *h_cg)
+{
+	unsigned long csize = nr_pages * PAGE_SIZE;
+
+	if (hugetlb_cgroup_disabled() || !h_cg)
+		return;
+
+	if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
+		return;
+
+	res_counter_uncharge(&h_cg->hugepage[idx], csize);
+	return;
+}
+
+static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft,
+				   struct file *file, char __user *buf,
+				   size_t nbytes, loff_t *ppos)
+{
+	u64 val;
+	char str[64];
+	int idx, name, len;
+	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
+
+	idx = MEMFILE_IDX(cft->private);
+	name = MEMFILE_ATTR(cft->private);
+
+	val = res_counter_read_u64(&h_cg->hugepage[idx], name);
+	len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft,
+				const char *buffer)
+{
+	int idx, name, ret;
+	unsigned long long val;
+	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
+
+	idx = MEMFILE_IDX(cft->private);
+	name = MEMFILE_ATTR(cft->private);
+
+	switch (name) {
+	case RES_LIMIT:
+		if (hugetlb_cgroup_is_root(h_cg)) {
+			/* Can't set limit on root */
+			ret = -EINVAL;
+			break;
+		}
+		/* This function does all necessary parse...reuse it */
+		ret = res_counter_memparse_write_strategy(buffer, &val);
+		if (ret)
+			break;
+		ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event)
+{
+	int idx, name, ret = 0;
+	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
+
+	idx = MEMFILE_IDX(event);
+	name = MEMFILE_ATTR(event);
+
+	switch (name) {
+	case RES_MAX_USAGE:
+		res_counter_reset_max(&h_cg->hugepage[idx]);
+		break;
+	case RES_FAILCNT:
+		res_counter_reset_failcnt(&h_cg->hugepage[idx]);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+static char *mem_fmt(char *buf, int size, unsigned long hsize)
+{
+	if (hsize >= (1UL << 30))
+		snprintf(buf, size, "%luGB", hsize >> 30);
+	else if (hsize >= (1UL << 20))
+		snprintf(buf, size, "%luMB", hsize >> 20);
+	else
+		snprintf(buf, size, "%luKB", hsize >> 10);
+	return buf;
+}
+
+int __init hugetlb_cgroup_file_init(int idx)
+{
+	char buf[32];
+	struct cftype *cft;
+	struct hstate *h = &hstates[idx];
+
+	/* format the size */
+	mem_fmt(buf, 32, huge_page_size(h));
+
+	/* Add the limit file */
+	cft = &h->cgroup_files[0];
+	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
+	cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
+	cft->read = hugetlb_cgroup_read;
+	cft->write_string = hugetlb_cgroup_write;
+
+	/* Add the usage file */
+	cft = &h->cgroup_files[1];
+	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
+	cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
+	cft->read = hugetlb_cgroup_read;
+
+	/* Add the MAX usage file */
+	cft = &h->cgroup_files[2];
+	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
+	cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
+	cft->trigger = hugetlb_cgroup_reset;
+	cft->read = hugetlb_cgroup_read;
+
+	/* Add the failcntfile */
+	cft = &h->cgroup_files[3];
+	snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
+	cft->private  = MEMFILE_PRIVATE(idx, RES_FAILCNT);
+	cft->trigger  = hugetlb_cgroup_reset;
+	cft->read = hugetlb_cgroup_read;
+
+	/* NULL terminate the last cft */
+	cft = &h->cgroup_files[4];
+	memset(cft, 0, sizeof(*cft));
+
+	WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
+
+	return 0;
+}
+
+/*
+ * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
+ * when we migrate hugepages
+ */
+void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
+{
+	struct hugetlb_cgroup *h_cg;
+	struct hstate *h = page_hstate(oldhpage);
+
+	if (hugetlb_cgroup_disabled())
+		return;
+
+	VM_BUG_ON(!PageHuge(oldhpage));
+	spin_lock(&hugetlb_lock);
+	h_cg = hugetlb_cgroup_from_page(oldhpage);
+	set_hugetlb_cgroup(oldhpage, NULL);
+
+	/* move the h_cg details to new cgroup */
+	set_hugetlb_cgroup(newhpage, h_cg);
+	list_move(&newhpage->lru, &h->hugepage_activelist);
+	spin_unlock(&hugetlb_lock);
+	return;
+}
+
+struct cgroup_subsys hugetlb_subsys = {
+	.name = "hugetlb",
+	.create     = hugetlb_cgroup_create,
+	.pre_destroy = hugetlb_cgroup_pre_destroy,
+	.destroy    = hugetlb_cgroup_destroy,
+	.subsys_id  = hugetlb_subsys_id,
+};
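The new file stores both an hstate index and a res_counter member id in the single cft->private integer via the MEMFILE_* macros. The short stand-alone check below copies those three macros from the hunk above to show the packing; the RES_LIMIT value used here is only a placeholder for the illustration, not the kernel's enum value.

/* Stand-alone check of the MEMFILE_* packing used by hugetlb_cgroup.c. */
#include <assert.h>
#include <stdio.h>

#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_IDX(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)

#define RES_LIMIT 2	/* placeholder id for the illustration */

int main(void)
{
	int idx = 1;					/* e.g. the second hstate */
	int priv = MEMFILE_PRIVATE(idx, RES_LIMIT);	/* what gets stored in cft->private */

	assert(MEMFILE_IDX(priv) == idx);		/* upper 16 bits: hstate index */
	assert(MEMFILE_ATTR(priv) == RES_LIMIT);	/* lower 16 bits: resource id */
	printf("private=%#x -> idx=%d attr=%d\n",
	       priv, MEMFILE_IDX(priv), MEMFILE_ATTR(priv));
	return 0;
}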
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index cc448bb983ba..3a61efc518d5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -123,7 +123,7 @@ static int pfn_inject_init(void)
 	if (!dentry)
 		goto fail;
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#ifdef CONFIG_MEMCG_SWAP
 	dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
 				    hwpoison_dir, &hwpoison_filter_memcg);
 	if (!dentry)
diff --git a/mm/internal.h b/mm/internal.h
index 2ba87fbfb75b..3314f79d775a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -118,8 +118,14 @@ struct compact_control {
 	unsigned long nr_freepages;	/* Number of isolated free pages */
 	unsigned long nr_migratepages;	/* Number of pages to migrate */
 	unsigned long free_pfn;		/* isolate_freepages search base */
+	unsigned long start_free_pfn;	/* where we started the search */
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
 	bool sync;			/* Synchronous migration */
+	bool wrapped;			/* Order > 0 compactions are
+					   incremental, once free_pfn
+					   and migrate_pfn meet, we restart
+					   from the top of the zone;
+					   remember we wrapped around. */
 
 	int order;			/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
@@ -347,3 +353,5 @@ extern u32 hwpoison_filter_enable;
 extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);
+
+extern void set_pageblock_order(void);
diff --git a/mm/memblock.c b/mm/memblock.c
index 5cc6731b00cc..4d9393c7edc9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -222,13 +222,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
 	/* Try to find some space for it.
 	 *
 	 * WARNING: We assume that either slab_is_available() and we use it or
-	 * we use MEMBLOCK for allocations. That means that this is unsafe to use
-	 * when bootmem is currently active (unless bootmem itself is implemented
-	 * on top of MEMBLOCK which isn't the case yet)
+	 * we use MEMBLOCK for allocations. That means that this is unsafe to
+	 * use when bootmem is currently active (unless bootmem itself is
+	 * implemented on top of MEMBLOCK which isn't the case yet)
 	 *
 	 * This should however not be an issue for now, as we currently only
-	 * call into MEMBLOCK while it's still active, or much later when slab is
-	 * active for memory hotplug operations
+	 * call into MEMBLOCK while it's still active, or much later when slab
+	 * is active for memory hotplug operations
 	 */
 	if (use_slab) {
 		new_array = kmalloc(new_size, GFP_KERNEL);
@@ -243,8 +243,8 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
 						new_alloc_size, PAGE_SIZE);
 		if (!addr && new_area_size)
 			addr = memblock_find_in_range(0,
-					min(new_area_start, memblock.current_limit),
-					new_alloc_size, PAGE_SIZE);
+				min(new_area_start, memblock.current_limit),
+				new_alloc_size, PAGE_SIZE);
 
 		new_array = addr ? __va(addr) : 0;
 	}
@@ -254,12 +254,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
 		return -1;
 	}
 
-	memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
-			memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
+	memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
+			memblock_type_name(type), type->max * 2, (u64)addr,
+			(u64)addr + new_size - 1);
 
-	/* Found space, we now need to move the array over before
-	 * we add the reserved region since it may be our reserved
-	 * array itself that is full.
+	/*
+	 * Found space, we now need to move the array over before we add the
+	 * reserved region since it may be our reserved array itself that is
+	 * full.
 	 */
 	memcpy(new_array, type->regions, old_size);
 	memset(new_array + type->max, 0, old_size);
@@ -267,17 +269,16 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
 	type->regions = new_array;
 	type->max <<= 1;
 
-	/* Free old array. We needn't free it if the array is the
-	 * static one
-	 */
+	/* Free old array. We needn't free it if the array is the static one */
 	if (*in_slab)
 		kfree(old_array);
 	else if (old_array != memblock_memory_init_regions &&
 		 old_array != memblock_reserved_init_regions)
 		memblock_free(__pa(old_array), old_alloc_size);
 
-	/* Reserve the new array if that comes from the memblock.
-	 * Otherwise, we needn't do it
+	/*
+	 * Reserve the new array if that comes from the memblock. Otherwise, we
+	 * needn't do it
 	 */
 	if (!use_slab)
 		BUG_ON(memblock_reserve(addr, new_alloc_size));
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f72b5e52451a..795e525afaba 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,12 +61,12 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 static struct mem_cgroup *root_mem_cgroup __read_mostly;
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#ifdef CONFIG_MEMCG_SWAP
 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
 int do_swap_account __read_mostly;
 
 /* for remember boot option*/
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
+#ifdef CONFIG_MEMCG_SWAP_ENABLED
 static int really_do_swap_account __initdata = 1;
 #else
 static int really_do_swap_account __initdata = 0;
@@ -87,7 +87,7 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_CACHE,	   /* # of pages charged as cache */
 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
 	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
-	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
+	MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
 	MEM_CGROUP_STAT_NSTATS,
 };
 
@@ -378,9 +378,7 @@ static bool move_file(void)
 
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
-	MEM_CGROUP_CHARGE_TYPE_MAPPED,
-	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
-	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
+	MEM_CGROUP_CHARGE_TYPE_ANON,
 	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
 	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
 	NR_CHARGE_TYPE,
@@ -407,8 +405,14 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *memcg);
 static void mem_cgroup_put(struct mem_cgroup *memcg);
 
+static inline
+struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
+{
+	return container_of(s, struct mem_cgroup, css);
+}
+
 /* Writing them here to avoid exposing memcg's inner layout */
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
 #include <net/sock.h>
 #include <net/ip.h>
 
@@ -467,9 +471,9 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(tcp_proto_cgroup);
 #endif /* CONFIG_INET */
-#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
+#endif /* CONFIG_MEMCG_KMEM */
 
-#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
+#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 static void disarm_sock_keys(struct mem_cgroup *memcg)
 {
 	if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
@@ -703,7 +707,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
 					 bool charge)
 {
 	int val = (charge) ? 1 : -1;
-	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
+	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
 }
 
 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
@@ -864,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 
 struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 {
-	return container_of(cgroup_subsys_state(cont,
-				mem_cgroup_subsys_id), struct mem_cgroup,
-				css);
+	return mem_cgroup_from_css(
+		cgroup_subsys_state(cont, mem_cgroup_subsys_id));
 }
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@ -879,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 	if (unlikely(!p))
 		return NULL;
 
-	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
-			    struct mem_cgroup, css);
+	return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
 }
 
 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -966,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 		css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
 		if (css) {
 			if (css == &root->css || css_tryget(css))
-				memcg = container_of(css,
-						struct mem_cgroup, css);
+				memcg = mem_cgroup_from_css(css);
 		} else
 			id = 0;
 		rcu_read_unlock();
@@ -1454,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
 /*
  * Return the memory (and swap, if configured) limit for a memcg.
  */
-u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1458{ 1459{
1459 u64 limit; 1460 u64 limit;
1460 u64 memsw; 1461 u64 memsw;
@@ -1470,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1470 return min(limit, memsw); 1471 return min(limit, memsw);
1471} 1472}
1472 1473
1474void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1475 int order)
1476{
1477 struct mem_cgroup *iter;
1478 unsigned long chosen_points = 0;
1479 unsigned long totalpages;
1480 unsigned int points = 0;
1481 struct task_struct *chosen = NULL;
1482
1483 /*
1484 * If current has a pending SIGKILL, then automatically select it. The
1485 * goal is to allow it to allocate so that it may quickly exit and free
1486 * its memory.
1487 */
1488 if (fatal_signal_pending(current)) {
1489 set_thread_flag(TIF_MEMDIE);
1490 return;
1491 }
1492
1493 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1494 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1495 for_each_mem_cgroup_tree(iter, memcg) {
1496 struct cgroup *cgroup = iter->css.cgroup;
1497 struct cgroup_iter it;
1498 struct task_struct *task;
1499
1500 cgroup_iter_start(cgroup, &it);
1501 while ((task = cgroup_iter_next(cgroup, &it))) {
1502 switch (oom_scan_process_thread(task, totalpages, NULL,
1503 false)) {
1504 case OOM_SCAN_SELECT:
1505 if (chosen)
1506 put_task_struct(chosen);
1507 chosen = task;
1508 chosen_points = ULONG_MAX;
1509 get_task_struct(chosen);
1510 /* fall through */
1511 case OOM_SCAN_CONTINUE:
1512 continue;
1513 case OOM_SCAN_ABORT:
1514 cgroup_iter_end(cgroup, &it);
1515 mem_cgroup_iter_break(memcg, iter);
1516 if (chosen)
1517 put_task_struct(chosen);
1518 return;
1519 case OOM_SCAN_OK:
1520 break;
1521 };
1522 points = oom_badness(task, memcg, NULL, totalpages);
1523 if (points > chosen_points) {
1524 if (chosen)
1525 put_task_struct(chosen);
1526 chosen = task;
1527 chosen_points = points;
1528 get_task_struct(chosen);
1529 }
1530 }
1531 cgroup_iter_end(cgroup, &it);
1532 }
1533
1534 if (!chosen)
1535 return;
1536 points = chosen_points * 1000 / totalpages;
1537 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1538 NULL, "Memory cgroup out of memory");
1539}
1540
1473static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1541static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1474 gfp_t gfp_mask, 1542 gfp_t gfp_mask,
1475 unsigned long flags) 1543 unsigned long flags)
@@ -1899,7 +1967,7 @@ again:
1899 return; 1967 return;
1900 /* 1968 /*
1901 * If this memory cgroup is not under account moving, we don't 1969 * If this memory cgroup is not under account moving, we don't
1902 * need to take move_lock_page_cgroup(). Because we already hold 1970 * need to take move_lock_mem_cgroup(). Because we already hold
1903 * rcu_read_lock(), any calls to move_account will be delayed until 1971 * rcu_read_lock(), any calls to move_account will be delayed until
1904 * rcu_read_unlock() if mem_cgroup_stolen() == true. 1972 * rcu_read_unlock() if mem_cgroup_stolen() == true.
1905 */ 1973 */
@@ -1921,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
1921 /* 1989 /*
1922 * It's guaranteed that pc->mem_cgroup never changes while 1990 * It's guaranteed that pc->mem_cgroup never changes while
1923 * lock is held because a routine modifies pc->mem_cgroup 1991 * lock is held because a routine modifies pc->mem_cgroup
1924 * should take move_lock_page_cgroup(). 1992 * should take move_lock_mem_cgroup().
1925 */ 1993 */
1926 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 1994 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
1927} 1995}
@@ -2268,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2268 * We always charge the cgroup the mm_struct belongs to. 2336 * We always charge the cgroup the mm_struct belongs to.
2269 * The mm_struct's mem_cgroup changes on task migration if the 2337 * The mm_struct's mem_cgroup changes on task migration if the
2270 * thread group leader migrates. It's possible that mm is not 2338 * thread group leader migrates. It's possible that mm is not
2271 * set, if so charge the init_mm (happens for pagecache usage). 2339 * set, if so charge the root memcg (happens for pagecache usage).
2272 */ 2340 */
2273 if (!*ptr && !mm) 2341 if (!*ptr && !mm)
2274 *ptr = root_mem_cgroup; 2342 *ptr = root_mem_cgroup;
@@ -2429,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2429 css = css_lookup(&mem_cgroup_subsys, id); 2497 css = css_lookup(&mem_cgroup_subsys, id);
2430 if (!css) 2498 if (!css)
2431 return NULL; 2499 return NULL;
2432 return container_of(css, struct mem_cgroup, css); 2500 return mem_cgroup_from_css(css);
2433} 2501}
2434 2502
2435struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2503struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2473,11 +2541,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2473 bool anon; 2541 bool anon;
2474 2542
2475 lock_page_cgroup(pc); 2543 lock_page_cgroup(pc);
2476 if (unlikely(PageCgroupUsed(pc))) { 2544 VM_BUG_ON(PageCgroupUsed(pc));
2477 unlock_page_cgroup(pc);
2478 __mem_cgroup_cancel_charge(memcg, nr_pages);
2479 return;
2480 }
2481 /* 2545 /*
2482 * we don't need page_cgroup_lock about tail pages, because they are not 2546 * we don't need page_cgroup_lock about tail pages, because they are not
2483 * accessed by any other context at this point. 2547 * accessed by any other context at this point.
@@ -2519,7 +2583,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2519 spin_unlock_irq(&zone->lru_lock); 2583 spin_unlock_irq(&zone->lru_lock);
2520 } 2584 }
2521 2585
2522 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2586 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2523 anon = true; 2587 anon = true;
2524 else 2588 else
2525 anon = false; 2589 anon = false;
@@ -2644,8 +2708,7 @@ out:
2644 2708
2645static int mem_cgroup_move_parent(struct page *page, 2709static int mem_cgroup_move_parent(struct page *page,
2646 struct page_cgroup *pc, 2710 struct page_cgroup *pc,
2647 struct mem_cgroup *child, 2711 struct mem_cgroup *child)
2648 gfp_t gfp_mask)
2649{ 2712{
2650 struct mem_cgroup *parent; 2713 struct mem_cgroup *parent;
2651 unsigned int nr_pages; 2714 unsigned int nr_pages;
@@ -2728,38 +2791,7 @@ int mem_cgroup_newpage_charge(struct page *page,
2728 VM_BUG_ON(page->mapping && !PageAnon(page)); 2791 VM_BUG_ON(page->mapping && !PageAnon(page));
2729 VM_BUG_ON(!mm); 2792 VM_BUG_ON(!mm);
2730 return mem_cgroup_charge_common(page, mm, gfp_mask, 2793 return mem_cgroup_charge_common(page, mm, gfp_mask,
2731 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2794 MEM_CGROUP_CHARGE_TYPE_ANON);
2732}
2733
2734static void
2735__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2736 enum charge_type ctype);
2737
2738int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2739 gfp_t gfp_mask)
2740{
2741 struct mem_cgroup *memcg = NULL;
2742 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2743 int ret;
2744
2745 if (mem_cgroup_disabled())
2746 return 0;
2747 if (PageCompound(page))
2748 return 0;
2749
2750 if (unlikely(!mm))
2751 mm = &init_mm;
2752 if (!page_is_file_cache(page))
2753 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2754
2755 if (!PageSwapCache(page))
2756 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2757 else { /* page is swapcache/shmem */
2758 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2759 if (!ret)
2760 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2761 }
2762 return ret;
2763} 2795}
2764 2796
2765/* 2797/*
@@ -2768,27 +2800,26 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2768 * struct page_cgroup is acquired. This refcnt will be consumed by 2800 * struct page_cgroup is acquired. This refcnt will be consumed by
2769 * "commit()" or removed by "cancel()" 2801 * "commit()" or removed by "cancel()"
2770 */ 2802 */
2771int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2803static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2772 struct page *page, 2804 struct page *page,
2773 gfp_t mask, struct mem_cgroup **memcgp) 2805 gfp_t mask,
2806 struct mem_cgroup **memcgp)
2774{ 2807{
2775 struct mem_cgroup *memcg; 2808 struct mem_cgroup *memcg;
2809 struct page_cgroup *pc;
2776 int ret; 2810 int ret;
2777 2811
2778 *memcgp = NULL; 2812 pc = lookup_page_cgroup(page);
2779
2780 if (mem_cgroup_disabled())
2781 return 0;
2782
2783 if (!do_swap_account)
2784 goto charge_cur_mm;
2785 /* 2813 /*
2786 * A racing thread's fault, or swapoff, may have already updated 2814 * Every swap fault against a single page tries to charge the
2787 * the pte, and even removed page from swap cache: in those cases 2815 * page, bail as early as possible. shmem_unuse() encounters
2788 * do_swap_page()'s pte_same() test will fail; but there's also a 2816 * already charged pages, too. The USED bit is protected by
2789 * KSM case which does need to charge the page. 2817 * the page lock, which serializes swap cache removal, which
2818 * in turn serializes uncharging.
2790 */ 2819 */
2791 if (!PageSwapCache(page)) 2820 if (PageCgroupUsed(pc))
2821 return 0;
2822 if (!do_swap_account)
2792 goto charge_cur_mm; 2823 goto charge_cur_mm;
2793 memcg = try_get_mem_cgroup_from_page(page); 2824 memcg = try_get_mem_cgroup_from_page(page);
2794 if (!memcg) 2825 if (!memcg)
@@ -2800,14 +2831,44 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2800 ret = 0; 2831 ret = 0;
2801 return ret; 2832 return ret;
2802charge_cur_mm: 2833charge_cur_mm:
2803 if (unlikely(!mm))
2804 mm = &init_mm;
2805 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 2834 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
2806 if (ret == -EINTR) 2835 if (ret == -EINTR)
2807 ret = 0; 2836 ret = 0;
2808 return ret; 2837 return ret;
2809} 2838}
2810 2839
2840int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
2841 gfp_t gfp_mask, struct mem_cgroup **memcgp)
2842{
2843 *memcgp = NULL;
2844 if (mem_cgroup_disabled())
2845 return 0;
2846 /*
2847 * A racing thread's fault, or swapoff, may have already
2848 * updated the pte, and even removed page from swap cache: in
2849 * those cases unuse_pte()'s pte_same() test will fail; but
2850 * there's also a KSM case which does need to charge the page.
2851 */
2852 if (!PageSwapCache(page)) {
2853 int ret;
2854
2855 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
2856 if (ret == -EINTR)
2857 ret = 0;
2858 return ret;
2859 }
2860 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
2861}
2862
2863void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
2864{
2865 if (mem_cgroup_disabled())
2866 return;
2867 if (!memcg)
2868 return;
2869 __mem_cgroup_cancel_charge(memcg, 1);
2870}
2871
2811static void 2872static void
2812__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 2873__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2813 enum charge_type ctype) 2874 enum charge_type ctype)
@@ -2842,16 +2903,30 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
2842 struct mem_cgroup *memcg) 2903 struct mem_cgroup *memcg)
2843{ 2904{
2844 __mem_cgroup_commit_charge_swapin(page, memcg, 2905 __mem_cgroup_commit_charge_swapin(page, memcg,
2845 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2906 MEM_CGROUP_CHARGE_TYPE_ANON);
2846} 2907}
2847 2908
2848void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 2909int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2910 gfp_t gfp_mask)
2849{ 2911{
2912 struct mem_cgroup *memcg = NULL;
2913 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2914 int ret;
2915
2850 if (mem_cgroup_disabled()) 2916 if (mem_cgroup_disabled())
2851 return; 2917 return 0;
2852 if (!memcg) 2918 if (PageCompound(page))
2853 return; 2919 return 0;
2854 __mem_cgroup_cancel_charge(memcg, 1); 2920
2921 if (!PageSwapCache(page))
2922 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2923 else { /* page is swapcache/shmem */
2924 ret = __mem_cgroup_try_charge_swapin(mm, page,
2925 gfp_mask, &memcg);
2926 if (!ret)
2927 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2928 }
2929 return ret;
2855} 2930}
2856 2931
2857static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 2932static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -2911,7 +2986,8 @@ direct_uncharge:
2911 * uncharge if !page_mapped(page) 2986 * uncharge if !page_mapped(page)
2912 */ 2987 */
2913static struct mem_cgroup * 2988static struct mem_cgroup *
2914__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2989__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
2990 bool end_migration)
2915{ 2991{
2916 struct mem_cgroup *memcg = NULL; 2992 struct mem_cgroup *memcg = NULL;
2917 unsigned int nr_pages = 1; 2993 unsigned int nr_pages = 1;
@@ -2921,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2921 if (mem_cgroup_disabled()) 2997 if (mem_cgroup_disabled())
2922 return NULL; 2998 return NULL;
2923 2999
2924 if (PageSwapCache(page)) 3000 VM_BUG_ON(PageSwapCache(page));
2925 return NULL;
2926 3001
2927 if (PageTransHuge(page)) { 3002 if (PageTransHuge(page)) {
2928 nr_pages <<= compound_order(page); 3003 nr_pages <<= compound_order(page);
@@ -2945,7 +3020,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2945 anon = PageAnon(page); 3020 anon = PageAnon(page);
2946 3021
2947 switch (ctype) { 3022 switch (ctype) {
2948 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 3023 case MEM_CGROUP_CHARGE_TYPE_ANON:
2949 /* 3024 /*
2950 * Generally PageAnon tells if it's the anon statistics to be 3025 * Generally PageAnon tells if it's the anon statistics to be
2951 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 3026 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
@@ -2955,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2955 /* fallthrough */ 3030 /* fallthrough */
2956 case MEM_CGROUP_CHARGE_TYPE_DROP: 3031 case MEM_CGROUP_CHARGE_TYPE_DROP:
2957 /* See mem_cgroup_prepare_migration() */ 3032 /* See mem_cgroup_prepare_migration() */
2958 if (page_mapped(page) || PageCgroupMigration(pc)) 3033 if (page_mapped(page))
3034 goto unlock_out;
3035 /*
3036 * Pages under migration may not be uncharged. But
3037 * end_migration() /must/ be the one uncharging the
3038 * unused post-migration page and so it has to call
3039 * here with the migration bit still set. See the
3040 * res_counter handling below.
3041 */
3042 if (!end_migration && PageCgroupMigration(pc))
2959 goto unlock_out; 3043 goto unlock_out;
2960 break; 3044 break;
2961 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 3045 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
@@ -2989,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2989 mem_cgroup_swap_statistics(memcg, true); 3073 mem_cgroup_swap_statistics(memcg, true);
2990 mem_cgroup_get(memcg); 3074 mem_cgroup_get(memcg);
2991 } 3075 }
2992 if (!mem_cgroup_is_root(memcg)) 3076 /*
3077 * Migration does not charge the res_counter for the
3078 * replacement page, so leave it alone when phasing out the
3079 * page that is unused after the migration.
3080 */
3081 if (!end_migration && !mem_cgroup_is_root(memcg))
2993 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 3082 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
2994 3083
2995 return memcg; 3084 return memcg;
@@ -3005,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page)
3005 if (page_mapped(page)) 3094 if (page_mapped(page))
3006 return; 3095 return;
3007 VM_BUG_ON(page->mapping && !PageAnon(page)); 3096 VM_BUG_ON(page->mapping && !PageAnon(page));
3008 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 3097 if (PageSwapCache(page))
3098 return;
3099 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
3009} 3100}
3010 3101
3011void mem_cgroup_uncharge_cache_page(struct page *page) 3102void mem_cgroup_uncharge_cache_page(struct page *page)
3012{ 3103{
3013 VM_BUG_ON(page_mapped(page)); 3104 VM_BUG_ON(page_mapped(page));
3014 VM_BUG_ON(page->mapping); 3105 VM_BUG_ON(page->mapping);
3015 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 3106 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
3016} 3107}
3017 3108
3018/* 3109/*
@@ -3076,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3076 if (!swapout) /* this was a swap cache but the swap is unused ! */ 3167 if (!swapout) /* this was a swap cache but the swap is unused ! */
3077 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3168 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3078 3169
3079 memcg = __mem_cgroup_uncharge_common(page, ctype); 3170 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
3080 3171
3081 /* 3172 /*
3082 * record memcg information, if swapout && memcg != NULL, 3173 * record memcg information, if swapout && memcg != NULL,
@@ -3087,7 +3178,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3087} 3178}
3088#endif 3179#endif
3089 3180
3090#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3181#ifdef CONFIG_MEMCG_SWAP
3091/* 3182/*
3092 * called from swap_entry_free(). remove record in swap_cgroup and 3183 * called from swap_entry_free(). remove record in swap_cgroup and
3093 * uncharge "memsw" account. 3184 * uncharge "memsw" account.
@@ -3166,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3166 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 3257 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
3167 * page belongs to. 3258 * page belongs to.
3168 */ 3259 */
3169int mem_cgroup_prepare_migration(struct page *page, 3260void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3170 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) 3261 struct mem_cgroup **memcgp)
3171{ 3262{
3172 struct mem_cgroup *memcg = NULL; 3263 struct mem_cgroup *memcg = NULL;
3173 struct page_cgroup *pc; 3264 struct page_cgroup *pc;
3174 enum charge_type ctype; 3265 enum charge_type ctype;
3175 int ret = 0;
3176 3266
3177 *memcgp = NULL; 3267 *memcgp = NULL;
3178 3268
3179 VM_BUG_ON(PageTransHuge(page)); 3269 VM_BUG_ON(PageTransHuge(page));
3180 if (mem_cgroup_disabled()) 3270 if (mem_cgroup_disabled())
3181 return 0; 3271 return;
3182 3272
3183 pc = lookup_page_cgroup(page); 3273 pc = lookup_page_cgroup(page);
3184 lock_page_cgroup(pc); 3274 lock_page_cgroup(pc);
@@ -3223,24 +3313,9 @@ int mem_cgroup_prepare_migration(struct page *page,
3223 * we return here. 3313 * we return here.
3224 */ 3314 */
3225 if (!memcg) 3315 if (!memcg)
3226 return 0; 3316 return;
3227 3317
3228 *memcgp = memcg; 3318 *memcgp = memcg;
3229 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
3230 css_put(&memcg->css);/* drop extra refcnt */
3231 if (ret) {
3232 if (PageAnon(page)) {
3233 lock_page_cgroup(pc);
3234 ClearPageCgroupMigration(pc);
3235 unlock_page_cgroup(pc);
3236 /*
3237 * The old page may be fully unmapped while we kept it.
3238 */
3239 mem_cgroup_uncharge_page(page);
3240 }
3241 /* we'll need to revisit this error code (we have -EINTR) */
3242 return -ENOMEM;
3243 }
3244 /* 3319 /*
3245 * We charge new page before it's used/mapped. So, even if unlock_page() 3320 * We charge new page before it's used/mapped. So, even if unlock_page()
3246 * is called before end_migration, we can catch all events on this new 3321 * is called before end_migration, we can catch all events on this new
@@ -3248,13 +3323,15 @@ int mem_cgroup_prepare_migration(struct page *page,
3248 * mapcount will be finally 0 and we call uncharge in end_migration(). 3323 * mapcount will be finally 0 and we call uncharge in end_migration().
3249 */ 3324 */
3250 if (PageAnon(page)) 3325 if (PageAnon(page))
3251 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3326 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
3252 else if (page_is_file_cache(page))
3253 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3254 else 3327 else
3255 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3328 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3329 /*
3330 * The page is committed to the memcg, but it's not actually
3331 * charged to the res_counter since we plan on replacing the
3332 * old one and only one page is going to be left afterwards.
3333 */
3256 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); 3334 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
3257 return ret;
3258} 3335}
3259 3336
3260/* remove redundant charge if migration failed*/ 3337/* remove redundant charge if migration failed*/
@@ -3276,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3276 used = newpage; 3353 used = newpage;
3277 unused = oldpage; 3354 unused = oldpage;
3278 } 3355 }
3356 anon = PageAnon(used);
3357 __mem_cgroup_uncharge_common(unused,
3358 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
3359 : MEM_CGROUP_CHARGE_TYPE_CACHE,
3360 true);
3361 css_put(&memcg->css);
3279 /* 3362 /*
3280 * We disallowed uncharge of pages under migration because mapcount 3363 * We disallowed uncharge of pages under migration because mapcount
3281 * of the page goes down to zero, temporarily. 3364 * of the page goes down to zero, temporarily.
@@ -3285,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3285 lock_page_cgroup(pc); 3368 lock_page_cgroup(pc);
3286 ClearPageCgroupMigration(pc); 3369 ClearPageCgroupMigration(pc);
3287 unlock_page_cgroup(pc); 3370 unlock_page_cgroup(pc);
3288 anon = PageAnon(used);
3289 __mem_cgroup_uncharge_common(unused,
3290 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
3291 : MEM_CGROUP_CHARGE_TYPE_CACHE);
3292 3371
3293 /* 3372 /*
3294 * If a page is a file cache, radix-tree replacement is very atomic 3373 * If a page is a file cache, radix-tree replacement is very atomic
@@ -3340,10 +3419,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3340 */ 3419 */
3341 if (!memcg) 3420 if (!memcg)
3342 return; 3421 return;
3343
3344 if (PageSwapBacked(oldpage))
3345 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3346
3347 /* 3422 /*
3348 * Even if newpage->mapping was NULL before starting replacement, 3423 * Even if newpage->mapping was NULL before starting replacement,
3349 * the newpage may be on LRU(or pagevec for LRU) already. We lock 3424 * the newpage may be on LRU(or pagevec for LRU) already. We lock
@@ -3418,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3418 /* 3493 /*
3419 * Rather than hide all in some function, I do this in 3494 * Rather than hide all in some function, I do this in
3420 * open coded manner. You see what this really does. 3495 * open coded manner. You see what this really does.
3421 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3496 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3422 */ 3497 */
3423 mutex_lock(&set_limit_mutex); 3498 mutex_lock(&set_limit_mutex);
3424 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3499 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3479,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3479 /* 3554 /*
3480 * Rather than hide all in some function, I do this in 3555 * Rather than hide all in some function, I do this in
3481 * open coded manner. You see what this really does. 3556 * open coded manner. You see what this really does.
3482 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3557 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3483 */ 3558 */
3484 mutex_lock(&set_limit_mutex); 3559 mutex_lock(&set_limit_mutex);
3485 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3560 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -3611,10 +3686,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3611} 3686}
3612 3687
3613/* 3688/*
3614 * This routine traverse page_cgroup in given list and drop them all. 3689 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3615 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 3690 * reclaim the pages page themselves - it just removes the page_cgroups.
3691 * Returns true if some page_cgroups were not freed, indicating that the caller
3692 * must retry this operation.
3616 */ 3693 */
3617static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3694static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3618 int node, int zid, enum lru_list lru) 3695 int node, int zid, enum lru_list lru)
3619{ 3696{
3620 struct mem_cgroup_per_zone *mz; 3697 struct mem_cgroup_per_zone *mz;
@@ -3622,7 +3699,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3622 struct list_head *list; 3699 struct list_head *list;
3623 struct page *busy; 3700 struct page *busy;
3624 struct zone *zone; 3701 struct zone *zone;
3625 int ret = 0;
3626 3702
3627 zone = &NODE_DATA(node)->node_zones[zid]; 3703 zone = &NODE_DATA(node)->node_zones[zid];
3628 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3704 mz = mem_cgroup_zoneinfo(memcg, node, zid);
@@ -3636,7 +3712,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3636 struct page_cgroup *pc; 3712 struct page_cgroup *pc;
3637 struct page *page; 3713 struct page *page;
3638 3714
3639 ret = 0;
3640 spin_lock_irqsave(&zone->lru_lock, flags); 3715 spin_lock_irqsave(&zone->lru_lock, flags);
3641 if (list_empty(list)) { 3716 if (list_empty(list)) {
3642 spin_unlock_irqrestore(&zone->lru_lock, flags); 3717 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -3653,21 +3728,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3653 3728
3654 pc = lookup_page_cgroup(page); 3729 pc = lookup_page_cgroup(page);
3655 3730
3656 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); 3731 if (mem_cgroup_move_parent(page, pc, memcg)) {
3657 if (ret == -ENOMEM || ret == -EINTR)
3658 break;
3659
3660 if (ret == -EBUSY || ret == -EINVAL) {
3661 /* found lock contention or "pc" is obsolete. */ 3732 /* found lock contention or "pc" is obsolete. */
3662 busy = page; 3733 busy = page;
3663 cond_resched(); 3734 cond_resched();
3664 } else 3735 } else
3665 busy = NULL; 3736 busy = NULL;
3666 } 3737 }
3667 3738 return !list_empty(list);
3668 if (!ret && !list_empty(list))
3669 return -EBUSY;
3670 return ret;
3671} 3739}
3672 3740
3673/* 3741/*
@@ -3692,9 +3760,6 @@ move_account:
3692 ret = -EBUSY; 3760 ret = -EBUSY;
3693 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3761 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3694 goto out; 3762 goto out;
3695 ret = -EINTR;
3696 if (signal_pending(current))
3697 goto out;
3698 /* This is for making all *used* pages to be on LRU. */ 3763 /* This is for making all *used* pages to be on LRU. */
3699 lru_add_drain_all(); 3764 lru_add_drain_all();
3700 drain_all_stock_sync(memcg); 3765 drain_all_stock_sync(memcg);
@@ -3715,9 +3780,6 @@ move_account:
3715 } 3780 }
3716 mem_cgroup_end_move(memcg); 3781 mem_cgroup_end_move(memcg);
3717 memcg_oom_recover(memcg); 3782 memcg_oom_recover(memcg);
3718 /* it seems parent cgroup doesn't have enough mem */
3719 if (ret == -ENOMEM)
3720 goto try_to_free;
3721 cond_resched(); 3783 cond_resched();
3722 /* "ret" should also be checked to ensure all lists are empty. */ 3784 /* "ret" should also be checked to ensure all lists are empty. */
3723 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); 3785 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
@@ -3779,6 +3841,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3779 parent_memcg = mem_cgroup_from_cont(parent); 3841 parent_memcg = mem_cgroup_from_cont(parent);
3780 3842
3781 cgroup_lock(); 3843 cgroup_lock();
3844
3845 if (memcg->use_hierarchy == val)
3846 goto out;
3847
3782 /* 3848 /*
3783 * If parent's use_hierarchy is set, we can't make any modifications 3849 * If parent's use_hierarchy is set, we can't make any modifications
3784 * in the child subtrees. If it is unset, then the change can 3850 * in the child subtrees. If it is unset, then the change can
@@ -3795,6 +3861,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3795 retval = -EBUSY; 3861 retval = -EBUSY;
3796 } else 3862 } else
3797 retval = -EINVAL; 3863 retval = -EINVAL;
3864
3865out:
3798 cgroup_unlock(); 3866 cgroup_unlock();
3799 3867
3800 return retval; 3868 return retval;
@@ -3831,7 +3899,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3831 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 3899 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3832 3900
3833 if (swap) 3901 if (swap)
3834 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); 3902 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
3835 3903
3836 return val << PAGE_SHIFT; 3904 return val << PAGE_SHIFT;
3837} 3905}
@@ -4015,7 +4083,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4015#endif 4083#endif
4016 4084
4017#ifdef CONFIG_NUMA 4085#ifdef CONFIG_NUMA
4018static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft, 4086static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4019 struct seq_file *m) 4087 struct seq_file *m)
4020{ 4088{
4021 int nid; 4089 int nid;
@@ -4074,7 +4142,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
4074 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 4142 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
4075} 4143}
4076 4144
4077static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4145static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
4078 struct seq_file *m) 4146 struct seq_file *m)
4079{ 4147{
4080 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4148 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
@@ -4082,7 +4150,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4082 unsigned int i; 4150 unsigned int i;
4083 4151
4084 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4152 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4085 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) 4153 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4086 continue; 4154 continue;
4087 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 4155 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
4088 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 4156 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
@@ -4109,7 +4177,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4109 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4177 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4110 long long val = 0; 4178 long long val = 0;
4111 4179
4112 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) 4180 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4113 continue; 4181 continue;
4114 for_each_mem_cgroup_tree(mi, memcg) 4182 for_each_mem_cgroup_tree(mi, memcg)
4115 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 4183 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
@@ -4533,7 +4601,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4533 return 0; 4601 return 0;
4534} 4602}
4535 4603
4536#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 4604#ifdef CONFIG_MEMCG_KMEM
4537static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4605static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4538{ 4606{
4539 return mem_cgroup_sockets_init(memcg, ss); 4607 return mem_cgroup_sockets_init(memcg, ss);
@@ -4588,7 +4656,7 @@ static struct cftype mem_cgroup_files[] = {
4588 }, 4656 },
4589 { 4657 {
4590 .name = "stat", 4658 .name = "stat",
4591 .read_seq_string = mem_control_stat_show, 4659 .read_seq_string = memcg_stat_show,
4592 }, 4660 },
4593 { 4661 {
4594 .name = "force_empty", 4662 .name = "force_empty",
@@ -4620,10 +4688,10 @@ static struct cftype mem_cgroup_files[] = {
4620#ifdef CONFIG_NUMA 4688#ifdef CONFIG_NUMA
4621 { 4689 {
4622 .name = "numa_stat", 4690 .name = "numa_stat",
4623 .read_seq_string = mem_control_numa_stat_show, 4691 .read_seq_string = memcg_numa_stat_show,
4624 }, 4692 },
4625#endif 4693#endif
4626#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4694#ifdef CONFIG_MEMCG_SWAP
4627 { 4695 {
4628 .name = "memsw.usage_in_bytes", 4696 .name = "memsw.usage_in_bytes",
4629 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4697 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
@@ -4810,7 +4878,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4810} 4878}
4811EXPORT_SYMBOL(parent_mem_cgroup); 4879EXPORT_SYMBOL(parent_mem_cgroup);
4812 4880
4813#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4881#ifdef CONFIG_MEMCG_SWAP
4814static void __init enable_swap_cgroup(void) 4882static void __init enable_swap_cgroup(void)
4815{ 4883{
4816 if (!mem_cgroup_disabled() && really_do_swap_account) 4884 if (!mem_cgroup_disabled() && really_do_swap_account)
@@ -5541,7 +5609,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
5541 .__DEPRECATED_clear_css_refs = true, 5609 .__DEPRECATED_clear_css_refs = true,
5542}; 5610};
5543 5611
5544#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5612#ifdef CONFIG_MEMCG_SWAP
5545static int __init enable_swap_account(char *s) 5613static int __init enable_swap_account(char *s)
5546{ 5614{
5547 /* consider enabled if no parameter or 1 is given */ 5615 /* consider enabled if no parameter or 1 is given */
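An editor-added caller sketch (hypothetical function, not in this patch) of the try/commit/cancel protocol that the swap-in charge split above keeps intact for the public entry points: the try step may pre-charge a memcg, and the caller must then either commit the page to it or cancel the pre-charge.

/* demo_charge_swapped_page() is illustrative only */
static int demo_charge_swapped_page(struct mm_struct *mm, struct page *page,
				    bool still_wanted)
{
	struct mem_cgroup *memcg;
	int ret;

	ret = mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &memcg);
	if (ret)
		return ret;			/* charge refused */

	if (!still_wanted) {			/* e.g. a pte_same() re-check failed */
		mem_cgroup_cancel_charge_swapin(memcg);	/* undo the pre-charge */
		return -EAGAIN;
	}

	mem_cgroup_commit_charge_swapin(page, memcg);	/* page now accounted */
	return 0;
}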
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6de0d613bbe6..a6e2141a6610 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p)
128 * can only guarantee that the page either belongs to the memcg tasks, or is 128 * can only guarantee that the page either belongs to the memcg tasks, or is
129 * a freed page. 129 * a freed page.
130 */ 130 */
131#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 131#ifdef CONFIG_MEMCG_SWAP
132u64 hwpoison_filter_memcg; 132u64 hwpoison_filter_memcg;
133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); 133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
134static int hwpoison_filter_task(struct page *p) 134static int hwpoison_filter_task(struct page *p)
@@ -1416,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
1416 int ret; 1416 int ret;
1417 unsigned long pfn = page_to_pfn(page); 1417 unsigned long pfn = page_to_pfn(page);
1418 struct page *hpage = compound_head(page); 1418 struct page *hpage = compound_head(page);
1419 LIST_HEAD(pagelist);
1420 1419
1421 ret = get_any_page(page, pfn, flags); 1420 ret = get_any_page(page, pfn, flags);
1422 if (ret < 0) 1421 if (ret < 0)
@@ -1431,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
1431 } 1430 }
1432 1431
1433 /* Keep page count to indicate a given hugepage is isolated. */ 1432 /* Keep page count to indicate a given hugepage is isolated. */
1434 1433 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
1435 list_add(&hpage->lru, &pagelist);
1436 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false,
1437 MIGRATE_SYNC); 1434 MIGRATE_SYNC);
1435 put_page(hpage);
1438 if (ret) { 1436 if (ret) {
1439 struct page *page1, *page2;
1440 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1441 put_page(page1);
1442
1443 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1437 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1444 pfn, ret, page->flags); 1438 pfn, ret, page->flags);
1445 if (ret > 0)
1446 ret = -EIO;
1447 return ret; 1439 return ret;
1448 } 1440 }
1449done: 1441done:
1450 if (!PageHWPoison(hpage)) 1442 if (!PageHWPoison(hpage))
1451 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); 1443 atomic_long_add(1 << compound_trans_order(hpage),
1444 &mce_bad_pages);
1452 set_page_hwpoison_huge_page(hpage); 1445 set_page_hwpoison_huge_page(hpage);
1453 dequeue_hwpoisoned_huge_page(hpage); 1446 dequeue_hwpoisoned_huge_page(hpage);
1454 /* keep elevated page count for bad page */ 1447 /* keep elevated page count for bad page */
diff --git a/mm/memory.c b/mm/memory.c
index 91f69459d3e8..482f089765ff 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1343,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1343 * Since no pte has actually been setup, it is 1343 * Since no pte has actually been setup, it is
1344 * safe to do nothing in this case. 1344 * safe to do nothing in this case.
1345 */ 1345 */
1346 if (vma->vm_file) 1346 if (vma->vm_file) {
1347 unmap_hugepage_range(vma, start, end, NULL); 1347 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
1348 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1349 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
1350 }
1348 } else 1351 } else
1349 unmap_page_range(tlb, vma, start, end, details); 1352 unmap_page_range(tlb, vma, start, end, details);
1350 } 1353 }
@@ -3938,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
3938 free_page((unsigned long)buf); 3941 free_page((unsigned long)buf);
3939 } 3942 }
3940 } 3943 }
3941 up_read(&current->mm->mmap_sem); 3944 up_read(&mm->mmap_sem);
3942} 3945}
3943 3946
3944#ifdef CONFIG_PROVE_LOCKING 3947#ifdef CONFIG_PROVE_LOCKING
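An editor-added sketch (hypothetical helper, not in this patch) of the ordering the memory.c hunk above establishes for hugetlb VMAs: the final range teardown now runs under the mapping's i_mmap_mutex, so it cannot race with other users of page-table pages shared through the same hugetlbfs file.

static void demo_unmap_hugetlb_vma(struct mmu_gather *tlb,
				   struct vm_area_struct *vma,
				   unsigned long start, unsigned long end)
{
	struct address_space *mapping = vma->vm_file->f_mapping;

	mutex_lock(&mapping->i_mmap_mutex);
	__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
	mutex_unlock(&mapping->i_mmap_mutex);
}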
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 427bb291dd0f..3ad25f9d1fc1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -512,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
512 512
513 zone->present_pages += onlined_pages; 513 zone->present_pages += onlined_pages;
514 zone->zone_pgdat->node_present_pages += onlined_pages; 514 zone->zone_pgdat->node_present_pages += onlined_pages;
515 if (need_zonelists_rebuild) 515 if (onlined_pages) {
516 build_all_zonelists(zone); 516 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
517 else 517 if (need_zonelists_rebuild)
518 zone_pcp_update(zone); 518 build_all_zonelists(NULL, zone);
519 else
520 zone_pcp_update(zone);
521 }
519 522
520 mutex_unlock(&zonelists_mutex); 523 mutex_unlock(&zonelists_mutex);
521 524
522 init_per_zone_wmark_min(); 525 init_per_zone_wmark_min();
523 526
524 if (onlined_pages) { 527 if (onlined_pages)
525 kswapd_run(zone_to_nid(zone)); 528 kswapd_run(zone_to_nid(zone));
526 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
527 }
528 529
529 vm_total_pages = nr_free_pagecache_pages(); 530 vm_total_pages = nr_free_pagecache_pages();
530 531
@@ -562,7 +563,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
562 * to access not-initialized zonelist, build here. 563 * to access not-initialized zonelist, build here.
563 */ 564 */
564 mutex_lock(&zonelists_mutex); 565 mutex_lock(&zonelists_mutex);
565 build_all_zonelists(NULL); 566 build_all_zonelists(pgdat, NULL);
566 mutex_unlock(&zonelists_mutex); 567 mutex_unlock(&zonelists_mutex);
567 568
568 return pgdat; 569 return pgdat;
@@ -965,6 +966,9 @@ repeat:
965 966
966 init_per_zone_wmark_min(); 967 init_per_zone_wmark_min();
967 968
969 if (!populated_zone(zone))
970 zone_pcp_reset(zone);
971
968 if (!node_present_pages(node)) { 972 if (!node_present_pages(node)) {
969 node_clear_state(node, N_HIGH_MEMORY); 973 node_clear_state(node, N_HIGH_MEMORY);
970 kswapd_stop(node); 974 kswapd_stop(node);
diff --git a/mm/migrate.c b/mm/migrate.c
index be26d5cbe56b..77ed2d773705 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -33,6 +33,7 @@
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h>
36#include <linux/gfp.h> 37#include <linux/gfp.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -682,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
682{ 683{
683 int rc = -EAGAIN; 684 int rc = -EAGAIN;
684 int remap_swapcache = 1; 685 int remap_swapcache = 1;
685 int charge = 0;
686 struct mem_cgroup *mem; 686 struct mem_cgroup *mem;
687 struct anon_vma *anon_vma = NULL; 687 struct anon_vma *anon_vma = NULL;
688 688
@@ -724,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
724 } 724 }
725 725
726 /* charge against new page */ 726 /* charge against new page */
727 charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); 727 mem_cgroup_prepare_migration(page, newpage, &mem);
728 if (charge == -ENOMEM) {
729 rc = -ENOMEM;
730 goto unlock;
731 }
732 BUG_ON(charge);
733 728
734 if (PageWriteback(page)) { 729 if (PageWriteback(page)) {
735 /* 730 /*
@@ -819,8 +814,7 @@ skip_unmap:
819 put_anon_vma(anon_vma); 814 put_anon_vma(anon_vma);
820 815
821uncharge: 816uncharge:
822 if (!charge) 817 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
823 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
824unlock: 818unlock:
825 unlock_page(page); 819 unlock_page(page);
826out: 820out:
@@ -931,16 +925,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
931 925
932 if (anon_vma) 926 if (anon_vma)
933 put_anon_vma(anon_vma); 927 put_anon_vma(anon_vma);
934 unlock_page(hpage);
935 928
936out: 929 if (!rc)
937 if (rc != -EAGAIN) { 930 hugetlb_cgroup_migrate(hpage, new_hpage);
938 list_del(&hpage->lru);
939 put_page(hpage);
940 }
941 931
932 unlock_page(hpage);
933out:
942 put_page(new_hpage); 934 put_page(new_hpage);
943
944 if (result) { 935 if (result) {
945 if (rc) 936 if (rc)
946 *result = rc; 937 *result = rc;
@@ -1016,48 +1007,32 @@ out:
1016 return nr_failed + retry; 1007 return nr_failed + retry;
1017} 1008}
1018 1009
1019int migrate_huge_pages(struct list_head *from, 1010int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1020 new_page_t get_new_page, unsigned long private, bool offlining, 1011 unsigned long private, bool offlining,
1021 enum migrate_mode mode) 1012 enum migrate_mode mode)
1022{ 1013{
1023 int retry = 1; 1014 int pass, rc;
1024 int nr_failed = 0; 1015
1025 int pass = 0; 1016 for (pass = 0; pass < 10; pass++) {
1026 struct page *page; 1017 rc = unmap_and_move_huge_page(get_new_page,
1027 struct page *page2; 1018 private, hpage, pass > 2, offlining,
1028 int rc; 1019 mode);
1029 1020 switch (rc) {
1030 for (pass = 0; pass < 10 && retry; pass++) { 1021 case -ENOMEM:
1031 retry = 0; 1022 goto out;
1032 1023 case -EAGAIN:
1033 list_for_each_entry_safe(page, page2, from, lru) { 1024 /* try again */
1034 cond_resched(); 1025 cond_resched();
1035 1026 break;
1036 rc = unmap_and_move_huge_page(get_new_page, 1027 case 0:
1037 private, page, pass > 2, offlining, 1028 goto out;
1038 mode); 1029 default:
1039 1030 rc = -EIO;
1040 switch(rc) { 1031 goto out;
1041 case -ENOMEM:
1042 goto out;
1043 case -EAGAIN:
1044 retry++;
1045 break;
1046 case 0:
1047 break;
1048 default:
1049 /* Permanent failure */
1050 nr_failed++;
1051 break;
1052 }
1053 } 1032 }
1054 } 1033 }
1055 rc = 0;
1056out: 1034out:
1057 if (rc) 1035 return rc;
1058 return rc;
1059
1060 return nr_failed + retry;
1061} 1036}
1062 1037
1063#ifdef CONFIG_NUMA 1038#ifdef CONFIG_NUMA
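An editor-added caller sketch of the new single-page interface that replaces migrate_huge_pages(); it mirrors the soft-offline call site in the memory-failure.c hunk earlier. demo_migrate_one_hugepage() and its zero private argument are illustrative only.

static int demo_migrate_one_hugepage(struct page *hpage, new_page_t get_new_page)
{
	int ret;

	/* retries up to ten passes internally; 0 on success, negative on failure */
	ret = migrate_huge_page(hpage, get_new_page, 0, false, MIGRATE_SYNC);

	/* the caller keeps, and must drop, its own reference either way */
	put_page(hpage);
	return ret;
}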
diff --git a/mm/mmap.c b/mm/mmap.c
index 4fe2697339ed..e3e86914f11a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -943,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
943 const unsigned long stack_flags 943 const unsigned long stack_flags
944 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 944 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
945 945
946 mm->total_vm += pages;
947
946 if (file) { 948 if (file) {
947 mm->shared_vm += pages; 949 mm->shared_vm += pages;
948 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 950 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
@@ -1347,7 +1349,6 @@ munmap_back:
1347out: 1349out:
1348 perf_event_mmap(vma); 1350 perf_event_mmap(vma);
1349 1351
1350 mm->total_vm += len >> PAGE_SHIFT;
1351 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1352 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1352 if (vm_flags & VM_LOCKED) { 1353 if (vm_flags & VM_LOCKED) {
1353 if (!mlock_vma_pages_range(vma, addr, addr + len)) 1354 if (!mlock_vma_pages_range(vma, addr, addr + len))
@@ -1707,7 +1708,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1707 return -ENOMEM; 1708 return -ENOMEM;
1708 1709
1709 /* Ok, everything looks good - let it rip */ 1710 /* Ok, everything looks good - let it rip */
1710 mm->total_vm += grow;
1711 if (vma->vm_flags & VM_LOCKED) 1711 if (vma->vm_flags & VM_LOCKED)
1712 mm->locked_vm += grow; 1712 mm->locked_vm += grow;
1713 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 1713 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
@@ -1889,7 +1889,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1889 1889
1890 if (vma->vm_flags & VM_ACCOUNT) 1890 if (vma->vm_flags & VM_ACCOUNT)
1891 nr_accounted += nrpages; 1891 nr_accounted += nrpages;
1892 mm->total_vm -= nrpages;
1893 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 1892 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1894 vma = remove_vma(vma); 1893 vma = remove_vma(vma);
1895 } while (vma); 1894 } while (vma);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 9a611d3a1848..862b60822d9f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
33void __mmu_notifier_release(struct mm_struct *mm) 33void __mmu_notifier_release(struct mm_struct *mm)
34{ 34{
35 struct mmu_notifier *mn; 35 struct mmu_notifier *mn;
36 struct hlist_node *n;
37
38 /*
39 * RCU here will block mmu_notifier_unregister until
40 * ->release returns.
41 */
42 rcu_read_lock();
43 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 /*
45 * if ->release runs before mmu_notifier_unregister it
46 * must be handled as it's the only way for the driver
47 * to flush all existing sptes and stop the driver
48 * from establishing any more sptes before all the
49 * pages in the mm are freed.
50 */
51 if (mn->ops->release)
52 mn->ops->release(mn, mm);
53 rcu_read_unlock();
36 54
37 spin_lock(&mm->mmu_notifier_mm->lock); 55 spin_lock(&mm->mmu_notifier_mm->lock);
38 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
46 * mmu_notifier_unregister to return. 64 * mmu_notifier_unregister to return.
47 */ 65 */
48 hlist_del_init_rcu(&mn->hlist); 66 hlist_del_init_rcu(&mn->hlist);
49 /*
50 * RCU here will block mmu_notifier_unregister until
51 * ->release returns.
52 */
53 rcu_read_lock();
54 spin_unlock(&mm->mmu_notifier_mm->lock);
55 /*
56 * if ->release runs before mmu_notifier_unregister it
57 * must be handled as it's the only way for the driver
58 * to flush all existing sptes and stop the driver
59 * from establishing any more sptes before all the
60 * pages in the mm are freed.
61 */
62 if (mn->ops->release)
63 mn->ops->release(mn, mm);
64 rcu_read_unlock();
65 spin_lock(&mm->mmu_notifier_mm->lock);
66 } 67 }
67 spin_unlock(&mm->mmu_notifier_mm->lock); 68 spin_unlock(&mm->mmu_notifier_mm->lock);
68 69
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
284{ 285{
285 BUG_ON(atomic_read(&mm->mm_count) <= 0); 286 BUG_ON(atomic_read(&mm->mm_count) <= 0);
286 287
287 spin_lock(&mm->mmu_notifier_mm->lock);
288 if (!hlist_unhashed(&mn->hlist)) { 288 if (!hlist_unhashed(&mn->hlist)) {
289 hlist_del_rcu(&mn->hlist);
290
291 /* 289 /*
292 * RCU here will force exit_mmap to wait ->release to finish 290 * RCU here will force exit_mmap to wait ->release to finish
293 * before freeing the pages. 291 * before freeing the pages.
294 */ 292 */
295 rcu_read_lock(); 293 rcu_read_lock();
296 spin_unlock(&mm->mmu_notifier_mm->lock); 294
297 /* 295 /*
298 * exit_mmap will block in mmu_notifier_release to 296 * exit_mmap will block in mmu_notifier_release to
299 * guarantee ->release is called before freeing the 297 * guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
302 if (mn->ops->release) 300 if (mn->ops->release)
303 mn->ops->release(mn, mm); 301 mn->ops->release(mn, mm);
304 rcu_read_unlock(); 302 rcu_read_unlock();
305 } else 303
304 spin_lock(&mm->mmu_notifier_mm->lock);
305 hlist_del_rcu(&mn->hlist);
306 spin_unlock(&mm->mmu_notifier_mm->lock); 306 spin_unlock(&mm->mmu_notifier_mm->lock);
307 }
307 308
308 /* 309 /*
309 * Wait any running method to finish, of course including 310 * Wait any running method to finish, of course including
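An editor-added sketch of a minimal notifier user (not part of this patch) showing the contract the reordering above preserves: ->release may be invoked from exit_mmap() before the driver ever calls mmu_notifier_unregister(), so all secondary-MMU mappings must be torn down in ->release, and unregister remains safe afterwards. The demo_* names are illustrative.

#include <linux/mmu_notifier.h>

static void demo_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* invalidate every secondary-MMU mapping for @mm here; once this
	 * returns, the pages of @mm may be freed. */
}

static const struct mmu_notifier_ops demo_ops = {
	.release = demo_release,
};

static struct mmu_notifier demo_notifier = { .ops = &demo_ops };

static int demo_attach(struct mm_struct *mm)
{
	return mmu_notifier_register(&demo_notifier, mm);
}

static void demo_detach(struct mm_struct *mm)
{
	/* safe even if ->release already ran: unregister only unhashes the
	 * notifier and waits for running callbacks to finish. */
	mmu_notifier_unregister(&demo_notifier, mm);
}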
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 6830eab5bf09..3cef80f6ac79 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -96,7 +96,7 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone)
96 for_each_lru(lru) 96 for_each_lru(lru)
97 INIT_LIST_HEAD(&lruvec->lists[lru]); 97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98 98
99#ifdef CONFIG_CGROUP_MEM_RES_CTLR 99#ifdef CONFIG_MEMCG
100 lruvec->zone = zone; 100 lruvec->zone = zone;
101#endif 101#endif
102} 102}
diff --git a/mm/mremap.c b/mm/mremap.c
index 21fed202ddad..cc06d0e48d05 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
260 * If this were a serious issue, we'd add a flag to do_munmap(). 260 * If this were a serious issue, we'd add a flag to do_munmap().
261 */ 261 */
262 hiwater_vm = mm->hiwater_vm; 262 hiwater_vm = mm->hiwater_vm;
263 mm->total_vm += new_len >> PAGE_SHIFT;
264 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); 263 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
265 264
266 if (do_munmap(mm, old_addr, old_len) < 0) { 265 if (do_munmap(mm, old_addr, old_len) < 0) {
@@ -497,7 +496,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
497 goto out; 496 goto out;
498 } 497 }
499 498
500 mm->total_vm += pages;
501 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 499 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
502 if (vma->vm_flags & VM_LOCKED) { 500 if (vma->vm_flags & VM_LOCKED) {
503 mm->locked_vm += pages; 501 mm->locked_vm += pages;
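Editor's illustration of the accounting theme shared by the mmap.c and mremap.c hunks above: mm->total_vm maintenance moves into vm_stat_account() itself, so call sites no longer adjust it separately. Before/after of a representative call site (excerpt-style, not literal patch content):

	/* before this series: two updates per call site */
	mm->total_vm += len >> PAGE_SHIFT;
	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);

	/* after: one call maintains total_vm and the per-type counters */
	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);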
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ac300c99baf6..198600861638 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -288,76 +288,93 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
288} 288}
289#endif 289#endif
290 290
291enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
292 unsigned long totalpages, const nodemask_t *nodemask,
293 bool force_kill)
294{
295 if (task->exit_state)
296 return OOM_SCAN_CONTINUE;
297 if (oom_unkillable_task(task, NULL, nodemask))
298 return OOM_SCAN_CONTINUE;
299
300 /*
301 * This task already has access to memory reserves and is being killed.
302 * Don't allow any other task to have access to the reserves.
303 */
304 if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
305 if (unlikely(frozen(task)))
306 __thaw_task(task);
307 if (!force_kill)
308 return OOM_SCAN_ABORT;
309 }
310 if (!task->mm)
311 return OOM_SCAN_CONTINUE;
312
313 if (task->flags & PF_EXITING) {
314 /*
315 * If task is current and is in the process of releasing memory,
316 * allow the "kill" to set TIF_MEMDIE, which will allow it to
317 * access memory reserves. Otherwise, it may stall forever.
318 *
319 * The iteration isn't broken here, however, in case other
320 * threads are found to have already been oom killed.
321 */
322 if (task == current)
323 return OOM_SCAN_SELECT;
324 else if (!force_kill) {
325 /*
326 * If this task is not being ptraced on exit, then wait
327 * for it to finish before killing some other task
328 * unnecessarily.
329 */
330 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
331 return OOM_SCAN_ABORT;
332 }
333 }
334 return OOM_SCAN_OK;
335}
336
291/* 337/*
292 * Simple selection loop. We choose the process with the highest 338 * Simple selection loop. We choose the process with the highest
293 * number of 'points'. We expect the caller will lock the tasklist. 339 * number of 'points'.
294 * 340 *
295 * (not docbooked, we don't want this one cluttering up the manual) 341 * (not docbooked, we don't want this one cluttering up the manual)
296 */ 342 */
297static struct task_struct *select_bad_process(unsigned int *ppoints, 343static struct task_struct *select_bad_process(unsigned int *ppoints,
298 unsigned long totalpages, struct mem_cgroup *memcg, 344 unsigned long totalpages, const nodemask_t *nodemask,
299 const nodemask_t *nodemask, bool force_kill) 345 bool force_kill)
300{ 346{
301 struct task_struct *g, *p; 347 struct task_struct *g, *p;
302 struct task_struct *chosen = NULL; 348 struct task_struct *chosen = NULL;
303 unsigned long chosen_points = 0; 349 unsigned long chosen_points = 0;
304 350
351 rcu_read_lock();
305 do_each_thread(g, p) { 352 do_each_thread(g, p) {
306 unsigned int points; 353 unsigned int points;
307 354
308 if (p->exit_state) 355 switch (oom_scan_process_thread(p, totalpages, nodemask,
309 continue; 356 force_kill)) {
310 if (oom_unkillable_task(p, memcg, nodemask)) 357 case OOM_SCAN_SELECT:
311 continue; 358 chosen = p;
312 359 chosen_points = ULONG_MAX;
313 /* 360 /* fall through */
314 * This task already has access to memory reserves and is 361 case OOM_SCAN_CONTINUE:
315 * being killed. Don't allow any other task access to the
316 * memory reserve.
317 *
318 * Note: this may have a chance of deadlock if it gets
319 * blocked waiting for another task which itself is waiting
320 * for memory. Is there a better alternative?
321 */
322 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
323 if (unlikely(frozen(p)))
324 __thaw_task(p);
325 if (!force_kill)
326 return ERR_PTR(-1UL);
327 }
328 if (!p->mm)
329 continue; 362 continue;
330 363 case OOM_SCAN_ABORT:
331 if (p->flags & PF_EXITING) { 364 rcu_read_unlock();
332 /* 365 return ERR_PTR(-1UL);
333 * If p is the current task and is in the process of 366 case OOM_SCAN_OK:
334 * releasing memory, we allow the "kill" to set 367 break;
335 * TIF_MEMDIE, which will allow it to gain access to 368 };
336 * memory reserves. Otherwise, it may stall forever. 369 points = oom_badness(p, NULL, nodemask, totalpages);
337 *
338 * The loop isn't broken here, however, in case other
339 * threads are found to have already been oom killed.
340 */
341 if (p == current) {
342 chosen = p;
343 chosen_points = ULONG_MAX;
344 } else if (!force_kill) {
345 /*
346 * If this task is not being ptraced on exit,
347 * then wait for it to finish before killing
348 * some other task unnecessarily.
349 */
350 if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
351 return ERR_PTR(-1UL);
352 }
353 }
354
355 points = oom_badness(p, memcg, nodemask, totalpages);
356 if (points > chosen_points) { 370 if (points > chosen_points) {
357 chosen = p; 371 chosen = p;
358 chosen_points = points; 372 chosen_points = points;
359 } 373 }
360 } while_each_thread(g, p); 374 } while_each_thread(g, p);
375 if (chosen)
376 get_task_struct(chosen);
377 rcu_read_unlock();
361 378
362 *ppoints = chosen_points * 1000 / totalpages; 379 *ppoints = chosen_points * 1000 / totalpages;
363 return chosen; 380 return chosen;
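The selection loop above now runs under rcu_read_lock() instead of tasklist_lock, which is why the chosen task has to be pinned with get_task_struct() before the read-side section ends. A minimal sketch of that pattern, with a hypothetical some_predicate() standing in for the real oom_badness() scoring:

	/* Sketch only: some_predicate() is a made-up stand-in for oom_badness(). */
	static struct task_struct *pick_some_task(void)
	{
		struct task_struct *p, *found = NULL;

		rcu_read_lock();
		for_each_process(p) {
			if (some_predicate(p)) {
				found = p;
				get_task_struct(found);	/* pin before rcu_read_unlock() */
				break;
			}
		}
		rcu_read_unlock();

		return found;	/* caller must drop the reference with put_task_struct() */
	}

Without the get_task_struct() the task_struct could be freed as soon as rcu_read_unlock() returns.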
@@ -371,17 +388,16 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
371 * Dumps the current memory state of all eligible tasks. Tasks not in the same 388 * Dumps the current memory state of all eligible tasks. Tasks not in the same
372 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes 389 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
373 * are not shown. 390 * are not shown.
374 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj 391 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
375 * value, oom_score_adj value, and name. 392 * swapents, oom_score_adj value, and name.
376 *
377 * Call with tasklist_lock read-locked.
378 */ 393 */
379static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) 394static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
380{ 395{
381 struct task_struct *p; 396 struct task_struct *p;
382 struct task_struct *task; 397 struct task_struct *task;
383 398
384 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); 399 pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
400 rcu_read_lock();
385 for_each_process(p) { 401 for_each_process(p) {
386 if (oom_unkillable_task(p, memcg, nodemask)) 402 if (oom_unkillable_task(p, memcg, nodemask))
387 continue; 403 continue;
@@ -396,13 +412,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
396 continue; 412 continue;
397 } 413 }
398 414
399 pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", 415 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
400 task->pid, from_kuid(&init_user_ns, task_uid(task)), 416 task->pid, from_kuid(&init_user_ns, task_uid(task)),
401 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 417 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
402 task_cpu(task), task->signal->oom_adj, 418 task->mm->nr_ptes,
419 get_mm_counter(task->mm, MM_SWAPENTS),
403 task->signal->oom_score_adj, task->comm); 420 task->signal->oom_score_adj, task->comm);
404 task_unlock(task); 421 task_unlock(task);
405 } 422 }
423 rcu_read_unlock();
406} 424}
407 425
408static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 426static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
@@ -423,10 +441,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
423} 441}
424 442
425#define K(x) ((x) << (PAGE_SHIFT-10)) 443#define K(x) ((x) << (PAGE_SHIFT-10))
426static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 444/*
427 unsigned int points, unsigned long totalpages, 445 * Must be called while holding a reference to p, which will be released upon
428 struct mem_cgroup *memcg, nodemask_t *nodemask, 446 * returning.
429 const char *message) 447 */
448void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
449 unsigned int points, unsigned long totalpages,
450 struct mem_cgroup *memcg, nodemask_t *nodemask,
451 const char *message)
430{ 452{
431 struct task_struct *victim = p; 453 struct task_struct *victim = p;
432 struct task_struct *child; 454 struct task_struct *child;
@@ -442,6 +464,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
442 */ 464 */
443 if (p->flags & PF_EXITING) { 465 if (p->flags & PF_EXITING) {
444 set_tsk_thread_flag(p, TIF_MEMDIE); 466 set_tsk_thread_flag(p, TIF_MEMDIE);
467 put_task_struct(p);
445 return; 468 return;
446 } 469 }
447 470
@@ -459,6 +482,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
459 * parent. This attempts to lose the minimal amount of work done while 482 * parent. This attempts to lose the minimal amount of work done while
460 * still freeing memory. 483 * still freeing memory.
461 */ 484 */
485 read_lock(&tasklist_lock);
462 do { 486 do {
463 list_for_each_entry(child, &t->children, sibling) { 487 list_for_each_entry(child, &t->children, sibling) {
464 unsigned int child_points; 488 unsigned int child_points;
@@ -471,15 +495,26 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
471 child_points = oom_badness(child, memcg, nodemask, 495 child_points = oom_badness(child, memcg, nodemask,
472 totalpages); 496 totalpages);
473 if (child_points > victim_points) { 497 if (child_points > victim_points) {
498 put_task_struct(victim);
474 victim = child; 499 victim = child;
475 victim_points = child_points; 500 victim_points = child_points;
501 get_task_struct(victim);
476 } 502 }
477 } 503 }
478 } while_each_thread(p, t); 504 } while_each_thread(p, t);
505 read_unlock(&tasklist_lock);
479 506
480 victim = find_lock_task_mm(victim); 507 rcu_read_lock();
481 if (!victim) 508 p = find_lock_task_mm(victim);
509 if (!p) {
510 rcu_read_unlock();
511 put_task_struct(victim);
482 return; 512 return;
513 } else if (victim != p) {
514 get_task_struct(p);
515 put_task_struct(victim);
516 victim = p;
517 }
483 518
484 /* mm cannot safely be dereferenced after task_unlock(victim) */ 519 /* mm cannot safely be dereferenced after task_unlock(victim) */
485 mm = victim->mm; 520 mm = victim->mm;
@@ -510,17 +545,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
510 task_unlock(p); 545 task_unlock(p);
511 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); 546 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
512 } 547 }
548 rcu_read_unlock();
513 549
514 set_tsk_thread_flag(victim, TIF_MEMDIE); 550 set_tsk_thread_flag(victim, TIF_MEMDIE);
515 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); 551 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
552 put_task_struct(victim);
516} 553}
517#undef K 554#undef K
518 555
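With the reference counting added above, the pairing between selection and kill becomes: select_bad_process() returns a task with a reference held (or ERR_PTR(-1UL) to abort), and oom_kill_process() releases that reference on every exit path. A condensed sketch of a caller, modelled on out_of_memory() further down rather than copied from it:

	static void example_oom(gfp_t gfp_mask, int order, unsigned long totalpages,
				const nodemask_t *mpol_mask, bool force_kill)
	{
		unsigned int points;
		struct task_struct *p;

		p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
		if (!p)				/* nothing killable at all */
			panic("Out of memory and no killable processes...\n");
		if (PTR_ERR(p) != -1UL)		/* -1UL means another kill is already in flight */
			oom_kill_process(p, gfp_mask, order, points, totalpages,
					 NULL, mpol_mask, "Out of memory");
		/* no put_task_struct() here: oom_kill_process() drops the reference */
	}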
519/* 556/*
520 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 557 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
521 */ 558 */
522static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 559void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
523 int order, const nodemask_t *nodemask) 560 int order, const nodemask_t *nodemask)
524{ 561{
525 if (likely(!sysctl_panic_on_oom)) 562 if (likely(!sysctl_panic_on_oom))
526 return; 563 return;
@@ -533,42 +570,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
533 if (constraint != CONSTRAINT_NONE) 570 if (constraint != CONSTRAINT_NONE)
534 return; 571 return;
535 } 572 }
536 read_lock(&tasklist_lock);
537 dump_header(NULL, gfp_mask, order, NULL, nodemask); 573 dump_header(NULL, gfp_mask, order, NULL, nodemask);
538 read_unlock(&tasklist_lock);
539 panic("Out of memory: %s panic_on_oom is enabled\n", 574 panic("Out of memory: %s panic_on_oom is enabled\n",
540 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 575 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
541} 576}
542 577
543#ifdef CONFIG_CGROUP_MEM_RES_CTLR
544void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
545 int order)
546{
547 unsigned long limit;
548 unsigned int points = 0;
549 struct task_struct *p;
550
551 /*
552 * If current has a pending SIGKILL, then automatically select it. The
553 * goal is to allow it to allocate so that it may quickly exit and free
554 * its memory.
555 */
556 if (fatal_signal_pending(current)) {
557 set_thread_flag(TIF_MEMDIE);
558 return;
559 }
560
561 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
562 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
563 read_lock(&tasklist_lock);
564 p = select_bad_process(&points, limit, memcg, NULL, false);
565 if (p && PTR_ERR(p) != -1UL)
566 oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
567 "Memory cgroup out of memory");
568 read_unlock(&tasklist_lock);
569}
570#endif
571
572static BLOCKING_NOTIFIER_HEAD(oom_notify_list); 578static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
573 579
574int register_oom_notifier(struct notifier_block *nb) 580int register_oom_notifier(struct notifier_block *nb)
@@ -690,7 +696,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
690 struct task_struct *p; 696 struct task_struct *p;
691 unsigned long totalpages; 697 unsigned long totalpages;
692 unsigned long freed = 0; 698 unsigned long freed = 0;
693 unsigned int points; 699 unsigned int uninitialized_var(points);
694 enum oom_constraint constraint = CONSTRAINT_NONE; 700 enum oom_constraint constraint = CONSTRAINT_NONE;
695 int killed = 0; 701 int killed = 0;
696 702
@@ -718,22 +724,20 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
718 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; 724 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
719 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); 725 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
720 726
721 read_lock(&tasklist_lock); 727 if (sysctl_oom_kill_allocating_task && current->mm &&
722 if (sysctl_oom_kill_allocating_task &&
723 !oom_unkillable_task(current, NULL, nodemask) && 728 !oom_unkillable_task(current, NULL, nodemask) &&
724 current->mm) { 729 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
730 get_task_struct(current);
725 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, 731 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
726 nodemask, 732 nodemask,
727 "Out of memory (oom_kill_allocating_task)"); 733 "Out of memory (oom_kill_allocating_task)");
728 goto out; 734 goto out;
729 } 735 }
730 736
731 p = select_bad_process(&points, totalpages, NULL, mpol_mask, 737 p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
732 force_kill);
733 /* Found nothing?!?! Either we hang forever, or we panic. */ 738 /* Found nothing?!?! Either we hang forever, or we panic. */
734 if (!p) { 739 if (!p) {
735 dump_header(NULL, gfp_mask, order, NULL, mpol_mask); 740 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
736 read_unlock(&tasklist_lock);
737 panic("Out of memory and no killable processes...\n"); 741 panic("Out of memory and no killable processes...\n");
738 } 742 }
739 if (PTR_ERR(p) != -1UL) { 743 if (PTR_ERR(p) != -1UL) {
@@ -742,14 +746,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
742 killed = 1; 746 killed = 1;
743 } 747 }
744out: 748out:
745 read_unlock(&tasklist_lock);
746
747 /* 749 /*
748 * Give "p" a good chance of killing itself before we 750 * Give the killed threads a good chance of exiting before trying to
749 * retry to allocate memory unless "p" is current 751 * allocate memory again.
750 */ 752 */
751 if (killed && !test_thread_flag(TIF_MEMDIE)) 753 if (killed)
752 schedule_timeout_uninterruptible(1); 754 schedule_timeout_killable(1);
753} 755}
754 756
755/* 757/*
@@ -764,6 +766,5 @@ void pagefault_out_of_memory(void)
764 out_of_memory(NULL, 0, 0, NULL, false); 766 out_of_memory(NULL, 0, 0, NULL, false);
765 clear_system_oom(); 767 clear_system_oom();
766 } 768 }
767 if (!test_thread_flag(TIF_MEMDIE)) 769 schedule_timeout_killable(1);
768 schedule_timeout_uninterruptible(1);
769} 770}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4a4f9219683f..889532b8e6c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@
51#include <linux/page_cgroup.h> 51#include <linux/page_cgroup.h>
52#include <linux/debugobjects.h> 52#include <linux/debugobjects.h>
53#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
54#include <linux/memory.h>
55#include <linux/compaction.h> 54#include <linux/compaction.h>
56#include <trace/events/kmem.h> 55#include <trace/events/kmem.h>
57#include <linux/ftrace_event.h> 56#include <linux/ftrace_event.h>
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);
219 218
220int page_group_by_mobility_disabled __read_mostly; 219int page_group_by_mobility_disabled __read_mostly;
221 220
222static void set_pageblock_migratetype(struct page *page, int migratetype) 221/*
222 * NOTE:
223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
224 * Instead, use {un}set_pageblock_isolate.
225 */
226void set_pageblock_migratetype(struct page *page, int migratetype)
223{ 227{
224 228
225 if (unlikely(page_group_by_mobility_disabled)) 229 if (unlikely(page_group_by_mobility_disabled))
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone,
954 return pages_moved; 958 return pages_moved;
955} 959}
956 960
957static int move_freepages_block(struct zone *zone, struct page *page, 961int move_freepages_block(struct zone *zone, struct page *page,
958 int migratetype) 962 int migratetype)
959{ 963{
960 unsigned long start_pfn, end_pfn; 964 unsigned long start_pfn, end_pfn;
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1158 to_drain = pcp->batch; 1162 to_drain = pcp->batch;
1159 else 1163 else
1160 to_drain = pcp->count; 1164 to_drain = pcp->count;
1161 free_pcppages_bulk(zone, to_drain, pcp); 1165 if (to_drain > 0) {
1162 pcp->count -= to_drain; 1166 free_pcppages_bulk(zone, to_drain, pcp);
1167 pcp->count -= to_drain;
1168 }
1163 local_irq_restore(flags); 1169 local_irq_restore(flags);
1164} 1170}
1165#endif 1171#endif
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str)
1529} 1535}
1530__setup("fail_page_alloc=", setup_fail_page_alloc); 1536__setup("fail_page_alloc=", setup_fail_page_alloc);
1531 1537
1532static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1538static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1533{ 1539{
1534 if (order < fail_page_alloc.min_order) 1540 if (order < fail_page_alloc.min_order)
1535 return 0; 1541 return false;
1536 if (gfp_mask & __GFP_NOFAIL) 1542 if (gfp_mask & __GFP_NOFAIL)
1537 return 0; 1543 return false;
1538 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1544 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1539 return 0; 1545 return false;
1540 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1546 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1541 return 0; 1547 return false;
1542 1548
1543 return should_fail(&fail_page_alloc.attr, 1 << order); 1549 return should_fail(&fail_page_alloc.attr, 1 << order);
1544} 1550}
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs);
1578 1584
1579#else /* CONFIG_FAIL_PAGE_ALLOC */ 1585#else /* CONFIG_FAIL_PAGE_ALLOC */
1580 1586
1581static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1587static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1582{ 1588{
1583 return 0; 1589 return false;
1584} 1590}
1585 1591
1586#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1592#endif /* CONFIG_FAIL_PAGE_ALLOC */
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1594{ 1600{
1595 /* free_pages may go negative - that's OK */ 1601 /* free_pages may go negative - that's OK */
1596 long min = mark; 1602 long min = mark;
1603 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1597 int o; 1604 int o;
1598 1605
1599 free_pages -= (1 << order) - 1; 1606 free_pages -= (1 << order) - 1;
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1602 if (alloc_flags & ALLOC_HARDER) 1609 if (alloc_flags & ALLOC_HARDER)
1603 min -= min / 4; 1610 min -= min / 4;
1604 1611
1605 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1612 if (free_pages <= min + lowmem_reserve)
1606 return false; 1613 return false;
1607 for (o = 0; o < order; o++) { 1614 for (o = 0; o < order; o++) {
1608 /* At the next order, this order's pages become unavailable */ 1615 /* At the next order, this order's pages become unavailable */
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1617 return true; 1624 return true;
1618} 1625}
1619 1626
1627#ifdef CONFIG_MEMORY_ISOLATION
1628static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1629{
1630 if (unlikely(zone->nr_pageblock_isolate))
1631 return zone->nr_pageblock_isolate * pageblock_nr_pages;
1632 return 0;
1633}
1634#else
1635static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1636{
1637 return 0;
1638}
1639#endif
1640
1620bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1641bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1621 int classzone_idx, int alloc_flags) 1642 int classzone_idx, int alloc_flags)
1622{ 1643{
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1632 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1653 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1633 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1654 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1634 1655
1656 /*
1657 * If the zone has MIGRATE_ISOLATE-type free pages, take them into
1658 * account. nr_zone_isolate_freepages is never accurate, so kswapd
1659 * might stay awake when it could sleep; for memory hotplug that is
1660 * preferable to sleeping, which can cause a livelock in the direct
1661 * reclaim path.
1662 */
1663 free_pages -= nr_zone_isolate_freepages(z);
1635 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1664 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1636 free_pages); 1665 free_pages);
1637} 1666}
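The effect of the subtraction above can be illustrated with invented numbers: free pages that sit in isolated pageblocks cannot satisfy allocations, so they are removed from the count before the watermark comparison. A standalone C illustration (all values made up):

	#include <stdio.h>

	int main(void)
	{
		unsigned long free_pages = 5000;		/* NR_FREE_PAGES snapshot */
		unsigned long nr_pageblock_isolate = 2;		/* isolated pageblocks in the zone */
		unsigned long pageblock_nr_pages = 1024;	/* pages per pageblock */
		unsigned long mark = 4096;			/* watermark to test against */

		free_pages -= nr_pageblock_isolate * pageblock_nr_pages;
		printf("usable free pages: %lu (%s the %lu-page watermark)\n",
		       free_pages, free_pages > mark ? "above" : "below", mark);
		return 0;
	}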
@@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2087 2116
2088 page = get_page_from_freelist(gfp_mask, nodemask, 2117 page = get_page_from_freelist(gfp_mask, nodemask,
2089 order, zonelist, high_zoneidx, 2118 order, zonelist, high_zoneidx,
2090 alloc_flags, preferred_zone, 2119 alloc_flags & ~ALLOC_NO_WATERMARKS,
2091 migratetype); 2120 preferred_zone, migratetype);
2092 if (page) { 2121 if (page) {
2093 preferred_zone->compact_considered = 0; 2122 preferred_zone->compact_considered = 0;
2094 preferred_zone->compact_defer_shift = 0; 2123 preferred_zone->compact_defer_shift = 0;
@@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2180retry: 2209retry:
2181 page = get_page_from_freelist(gfp_mask, nodemask, order, 2210 page = get_page_from_freelist(gfp_mask, nodemask, order,
2182 zonelist, high_zoneidx, 2211 zonelist, high_zoneidx,
2183 alloc_flags, preferred_zone, 2212 alloc_flags & ~ALLOC_NO_WATERMARKS,
2184 migratetype); 2213 preferred_zone, migratetype);
2185 2214
2186 /* 2215 /*
2187 * If an allocation failed after direct reclaim, it could be because 2216 * If an allocation failed after direct reclaim, it could be because
@@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2265 alloc_flags |= ALLOC_HARDER; 2294 alloc_flags |= ALLOC_HARDER;
2266 2295
2267 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2296 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2268 if (!in_interrupt() && 2297 if (gfp_mask & __GFP_MEMALLOC)
2269 ((current->flags & PF_MEMALLOC) || 2298 alloc_flags |= ALLOC_NO_WATERMARKS;
2270 unlikely(test_thread_flag(TIF_MEMDIE)))) 2299 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2300 alloc_flags |= ALLOC_NO_WATERMARKS;
2301 else if (!in_interrupt() &&
2302 ((current->flags & PF_MEMALLOC) ||
2303 unlikely(test_thread_flag(TIF_MEMDIE))))
2271 alloc_flags |= ALLOC_NO_WATERMARKS; 2304 alloc_flags |= ALLOC_NO_WATERMARKS;
2272 } 2305 }
2273 2306
2274 return alloc_flags; 2307 return alloc_flags;
2275} 2308}
2276 2309
2310bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2311{
2312 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2313}
2314
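gfp_pfmemalloc_allowed() gives other subsystems a way to ask whether a given allocation context would be entitled to the watermark-free reserves; the slab changes later in this patch use it to keep reserve-backed objects away from ordinary callers. A minimal, hypothetical consumer (not part of the patch):

	static void *hand_out_object(void *obj, bool obj_from_reserves, gfp_t flags)
	{
		/* Reserve-backed objects only go to contexts allowed to use the reserves. */
		if (obj_from_reserves && !gfp_pfmemalloc_allowed(flags))
			return NULL;	/* caller should fall back to a fresh allocation */
		return obj;
	}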
2277static inline struct page * 2315static inline struct page *
2278__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2316__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2279 struct zonelist *zonelist, enum zone_type high_zoneidx, 2317 struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2340,11 +2378,27 @@ rebalance:
2340 2378
2341 /* Allocate without watermarks if the context allows */ 2379 /* Allocate without watermarks if the context allows */
2342 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2380 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2381 /*
2382 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2383 * the allocation is high priority and these types of
2384 * allocations are system rather than user oriented
2385 */
2386 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2387
2343 page = __alloc_pages_high_priority(gfp_mask, order, 2388 page = __alloc_pages_high_priority(gfp_mask, order,
2344 zonelist, high_zoneidx, nodemask, 2389 zonelist, high_zoneidx, nodemask,
2345 preferred_zone, migratetype); 2390 preferred_zone, migratetype);
2346 if (page) 2391 if (page) {
2392 /*
2393 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2394 * necessary to allocate the page. The expectation is
2395 * that the caller is taking steps that will free more
2396 * memory. The caller should avoid the page being used
2397 * for !PFMEMALLOC purposes.
2398 */
2399 page->pfmemalloc = true;
2347 goto got_pg; 2400 goto got_pg;
2401 }
2348 } 2402 }
2349 2403
2350 /* Atomic allocations - we can't balance anything */ 2404 /* Atomic allocations - we can't balance anything */
@@ -2463,8 +2517,8 @@ nopage:
2463got_pg: 2517got_pg:
2464 if (kmemcheck_enabled) 2518 if (kmemcheck_enabled)
2465 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2519 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2466 return page;
2467 2520
2521 return page;
2468} 2522}
2469 2523
2470/* 2524/*
@@ -2515,6 +2569,8 @@ retry_cpuset:
2515 page = __alloc_pages_slowpath(gfp_mask, order, 2569 page = __alloc_pages_slowpath(gfp_mask, order,
2516 zonelist, high_zoneidx, nodemask, 2570 zonelist, high_zoneidx, nodemask,
2517 preferred_zone, migratetype); 2571 preferred_zone, migratetype);
2572 else
2573 page->pfmemalloc = false;
2518 2574
2519 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2575 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2520 2576
@@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
3030 user_zonelist_order = oldval; 3086 user_zonelist_order = oldval;
3031 } else if (oldval != user_zonelist_order) { 3087 } else if (oldval != user_zonelist_order) {
3032 mutex_lock(&zonelists_mutex); 3088 mutex_lock(&zonelists_mutex);
3033 build_all_zonelists(NULL); 3089 build_all_zonelists(NULL, NULL);
3034 mutex_unlock(&zonelists_mutex); 3090 mutex_unlock(&zonelists_mutex);
3035 } 3091 }
3036 } 3092 }
@@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone);
3409DEFINE_MUTEX(zonelists_mutex); 3465DEFINE_MUTEX(zonelists_mutex);
3410 3466
3411/* return values int ....just for stop_machine() */ 3467/* return values int ....just for stop_machine() */
3412static __init_refok int __build_all_zonelists(void *data) 3468static int __build_all_zonelists(void *data)
3413{ 3469{
3414 int nid; 3470 int nid;
3415 int cpu; 3471 int cpu;
3472 pg_data_t *self = data;
3416 3473
3417#ifdef CONFIG_NUMA 3474#ifdef CONFIG_NUMA
3418 memset(node_load, 0, sizeof(node_load)); 3475 memset(node_load, 0, sizeof(node_load));
3419#endif 3476#endif
3477
3478 if (self && !node_online(self->node_id)) {
3479 build_zonelists(self);
3480 build_zonelist_cache(self);
3481 }
3482
3420 for_each_online_node(nid) { 3483 for_each_online_node(nid) {
3421 pg_data_t *pgdat = NODE_DATA(nid); 3484 pg_data_t *pgdat = NODE_DATA(nid);
3422 3485
@@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data)
3461 * Called with zonelists_mutex held always 3524 * Called with zonelists_mutex held always
3462 * unless system_state == SYSTEM_BOOTING. 3525 * unless system_state == SYSTEM_BOOTING.
3463 */ 3526 */
3464void __ref build_all_zonelists(void *data) 3527void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3465{ 3528{
3466 set_zonelist_order(); 3529 set_zonelist_order();
3467 3530
@@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(void *data)
3473 /* we have to stop all cpus to guarantee there is no user 3536 /* we have to stop all cpus to guarantee there is no user
3474 of zonelist */ 3537 of zonelist */
3475#ifdef CONFIG_MEMORY_HOTPLUG 3538#ifdef CONFIG_MEMORY_HOTPLUG
3476 if (data) 3539 if (zone)
3477 setup_zone_pageset((struct zone *)data); 3540 setup_zone_pageset(zone);
3478#endif 3541#endif
3479 stop_machine(__build_all_zonelists, NULL, NULL); 3542 stop_machine(__build_all_zonelists, pgdat, NULL);
3480 /* cpuset refresh routine should be here */ 3543 /* cpuset refresh routine should be here */
3481 } 3544 }
3482 vm_total_pages = nr_free_pagecache_pages(); 3545 vm_total_pages = nr_free_pagecache_pages();
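The signature change replaces the old void * cookie with an explicit (pgdat, zone) pair: at boot both are NULL, while a memory hot-add path can pass the node being brought online, so its zonelists are built before the node is marked online, plus the zone whose per-cpu pagesets need setting up. A rough sketch of the hot-add call shape; the real call sites live elsewhere (e.g. memory_hotplug.c):

	static void example_bring_node_online(pg_data_t *pgdat, struct zone *zone)
	{
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(pgdat, zone);	/* builds zonelists for the not-yet-online node */
		mutex_unlock(&zonelists_mutex);
	}

Boot-time callers keep the old behaviour by passing build_all_zonelists(NULL, NULL).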
@@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
3746 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 3809 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3747#endif 3810#endif
3748 3811
3749static int zone_batchsize(struct zone *zone) 3812static int __meminit zone_batchsize(struct zone *zone)
3750{ 3813{
3751#ifdef CONFIG_MMU 3814#ifdef CONFIG_MMU
3752 int batch; 3815 int batch;
@@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3828 pcp->batch = PAGE_SHIFT * 8; 3891 pcp->batch = PAGE_SHIFT * 8;
3829} 3892}
3830 3893
3831static void setup_zone_pageset(struct zone *zone) 3894static void __meminit setup_zone_pageset(struct zone *zone)
3832{ 3895{
3833 int cpu; 3896 int cpu;
3834 3897
@@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3901 return 0; 3964 return 0;
3902} 3965}
3903 3966
3904static int __zone_pcp_update(void *data)
3905{
3906 struct zone *zone = data;
3907 int cpu;
3908 unsigned long batch = zone_batchsize(zone), flags;
3909
3910 for_each_possible_cpu(cpu) {
3911 struct per_cpu_pageset *pset;
3912 struct per_cpu_pages *pcp;
3913
3914 pset = per_cpu_ptr(zone->pageset, cpu);
3915 pcp = &pset->pcp;
3916
3917 local_irq_save(flags);
3918 free_pcppages_bulk(zone, pcp->count, pcp);
3919 setup_pageset(pset, batch);
3920 local_irq_restore(flags);
3921 }
3922 return 0;
3923}
3924
3925void zone_pcp_update(struct zone *zone)
3926{
3927 stop_machine(__zone_pcp_update, zone, NULL);
3928}
3929
3930static __meminit void zone_pcp_init(struct zone *zone) 3967static __meminit void zone_pcp_init(struct zone *zone)
3931{ 3968{
3932 /* 3969 /*
@@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
3942 zone_batchsize(zone)); 3979 zone_batchsize(zone));
3943} 3980}
3944 3981
3945__meminit int init_currently_empty_zone(struct zone *zone, 3982int __meminit init_currently_empty_zone(struct zone *zone,
3946 unsigned long zone_start_pfn, 3983 unsigned long zone_start_pfn,
3947 unsigned long size, 3984 unsigned long size,
3948 enum memmap_context context) 3985 enum memmap_context context)
@@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat,
4301#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4338#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4302 4339
4303/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4340/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4304static inline void __init set_pageblock_order(void) 4341void __init set_pageblock_order(void)
4305{ 4342{
4306 unsigned int order; 4343 unsigned int order;
4307 4344
@@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void)
4329 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4366 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4330 * the kernel config 4367 * the kernel config
4331 */ 4368 */
4332static inline void set_pageblock_order(void) 4369void __init set_pageblock_order(void)
4333{ 4370{
4334} 4371}
4335 4372
@@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void)
4340 * - mark all pages reserved 4377 * - mark all pages reserved
4341 * - mark all memory queues empty 4378 * - mark all memory queues empty
4342 * - clear the memory bitmaps 4379 * - clear the memory bitmaps
4380 *
4381 * NOTE: pgdat should get zeroed by caller.
4343 */ 4382 */
4344static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4383static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4345 unsigned long *zones_size, unsigned long *zholes_size) 4384 unsigned long *zones_size, unsigned long *zholes_size)
@@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4350 int ret; 4389 int ret;
4351 4390
4352 pgdat_resize_init(pgdat); 4391 pgdat_resize_init(pgdat);
4353 pgdat->nr_zones = 0;
4354 init_waitqueue_head(&pgdat->kswapd_wait); 4392 init_waitqueue_head(&pgdat->kswapd_wait);
4355 pgdat->kswapd_max_order = 0; 4393 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4356 pgdat_page_cgroup_init(pgdat); 4394 pgdat_page_cgroup_init(pgdat);
4357 4395
4358 for (j = 0; j < MAX_NR_ZONES; j++) { 4396 for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4394 4432
4395 zone->spanned_pages = size; 4433 zone->spanned_pages = size;
4396 zone->present_pages = realsize; 4434 zone->present_pages = realsize;
4435#if defined CONFIG_COMPACTION || defined CONFIG_CMA
4436 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4437 zone->spanned_pages;
4438 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4439#endif
4397#ifdef CONFIG_NUMA 4440#ifdef CONFIG_NUMA
4398 zone->node = nid; 4441 zone->node = nid;
4399 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4442 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4408 4451
4409 zone_pcp_init(zone); 4452 zone_pcp_init(zone);
4410 lruvec_init(&zone->lruvec, zone); 4453 lruvec_init(&zone->lruvec, zone);
4411 zap_zone_vm_stats(zone);
4412 zone->flags = 0;
4413 if (!size) 4454 if (!size)
4414 continue; 4455 continue;
4415 4456
@@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4469{ 4510{
4470 pg_data_t *pgdat = NODE_DATA(nid); 4511 pg_data_t *pgdat = NODE_DATA(nid);
4471 4512
4513 /* pg_data_t should be reset to zero when it's allocated */
4514 WARN_ON(pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx);
4515
4472 pgdat->node_id = nid; 4516 pgdat->node_id = nid;
4473 pgdat->node_start_pfn = node_start_pfn; 4517 pgdat->node_start_pfn = node_start_pfn;
4474 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4518 calculate_node_totalpages(pgdat, zones_size, zholes_size);
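The WARN_ON documents a new contract: since free_area_init_core() no longer clears individual fields, the architecture must hand over a fully zeroed pg_data_t. A sketch of what a node-setup path is now expected to look like; the allocation itself is architecture-specific and only hinted at here:

	void __init example_setup_node(int nid, unsigned long *zones_size,
				       unsigned long node_start_pfn,
				       unsigned long *zholes_size)
	{
		/* however NODE_DATA(nid) was allocated, it must arrive zeroed */
		memset(NODE_DATA(nid), 0, sizeof(pg_data_t));

		free_area_init_node(nid, zones_size, node_start_pfn, zholes_size);
	}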
@@ -4750,7 +4794,7 @@ out:
4750} 4794}
4751 4795
4752/* Any regular memory on that node ? */ 4796/* Any regular memory on that node ? */
4753static void check_for_regular_memory(pg_data_t *pgdat) 4797static void __init check_for_regular_memory(pg_data_t *pgdat)
4754{ 4798{
4755#ifdef CONFIG_HIGHMEM 4799#ifdef CONFIG_HIGHMEM
4756 enum zone_type zone_type; 4800 enum zone_type zone_type;
@@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5468} 5512}
5469 5513
5470/* 5514/*
5471 * This is designed as sub function...plz see page_isolation.c also. 5515 * This function checks whether the pageblock includes unmovable pages or not.
5472 * set/clear page block's type to be ISOLATE. 5516 * If @count is not zero, it is okay to include fewer than @count unmovable pages.
5473 * page allocater never alloc memory from ISOLATE block. 5517 *
5518 * A PageLRU check without isolation or the lru_lock could race, so a
5519 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5520 * expect this function to be exact.
5474 */ 5521 */
5475 5522bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5476static int
5477__count_immobile_pages(struct zone *zone, struct page *page, int count)
5478{ 5523{
5479 unsigned long pfn, iter, found; 5524 unsigned long pfn, iter, found;
5480 int mt; 5525 int mt;
5481 5526
5482 /* 5527 /*
5483 * To avoid noisy data, lru_add_drain_all() should be called first. 5528 * To avoid noisy data, lru_add_drain_all() should be called first.
5484 * If ZONE_MOVABLE, the zone never contains immobile pages 5529 * If ZONE_MOVABLE, the zone never contains unmovable pages
5485 */ 5530 */
5486 if (zone_idx(zone) == ZONE_MOVABLE) 5531 if (zone_idx(zone) == ZONE_MOVABLE)
5487 return true; 5532 return false;
5488 mt = get_pageblock_migratetype(page); 5533 mt = get_pageblock_migratetype(page);
5489 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 5534 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5490 return true; 5535 return false;
5491 5536
5492 pfn = page_to_pfn(page); 5537 pfn = page_to_pfn(page);
5493 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5538 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
@@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5497 continue; 5542 continue;
5498 5543
5499 page = pfn_to_page(check); 5544 page = pfn_to_page(check);
5500 if (!page_count(page)) { 5545 /*
5546 * We can't use page_count() without pinning the page
5547 * because another CPU can free the compound page.
5548 * This check already skips compound tails of THP
5549 * because their page->_count is zero at all times.
5550 */
5551 if (!atomic_read(&page->_count)) {
5501 if (PageBuddy(page)) 5552 if (PageBuddy(page))
5502 iter += (1 << page_order(page)) - 1; 5553 iter += (1 << page_order(page)) - 1;
5503 continue; 5554 continue;
5504 } 5555 }
5556
5505 if (!PageLRU(page)) 5557 if (!PageLRU(page))
5506 found++; 5558 found++;
5507 /* 5559 /*
@@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5518 * page at boot. 5570 * page at boot.
5519 */ 5571 */
5520 if (found > count) 5572 if (found > count)
5521 return false; 5573 return true;
5522 } 5574 }
5523 return true; 5575 return false;
5524} 5576}
5525 5577
5526bool is_pageblock_removable_nolock(struct page *page) 5578bool is_pageblock_removable_nolock(struct page *page)
@@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5544 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5596 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5545 return false; 5597 return false;
5546 5598
5547 return __count_immobile_pages(zone, page, 0); 5599 return !has_unmovable_pages(zone, page, 0);
5548}
5549
5550int set_migratetype_isolate(struct page *page)
5551{
5552 struct zone *zone;
5553 unsigned long flags, pfn;
5554 struct memory_isolate_notify arg;
5555 int notifier_ret;
5556 int ret = -EBUSY;
5557
5558 zone = page_zone(page);
5559
5560 spin_lock_irqsave(&zone->lock, flags);
5561
5562 pfn = page_to_pfn(page);
5563 arg.start_pfn = pfn;
5564 arg.nr_pages = pageblock_nr_pages;
5565 arg.pages_found = 0;
5566
5567 /*
5568 * It may be possible to isolate a pageblock even if the
5569 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5570 * notifier chain is used by balloon drivers to return the
5571 * number of pages in a range that are held by the balloon
5572 * driver to shrink memory. If all the pages are accounted for
5573 * by balloons, are free, or on the LRU, isolation can continue.
5574 * Later, for example, when memory hotplug notifier runs, these
5575 * pages reported as "can be isolated" should be isolated(freed)
5576 * by the balloon driver through the memory notifier chain.
5577 */
5578 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5579 notifier_ret = notifier_to_errno(notifier_ret);
5580 if (notifier_ret)
5581 goto out;
5582 /*
5583 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5584 * We just check MOVABLE pages.
5585 */
5586 if (__count_immobile_pages(zone, page, arg.pages_found))
5587 ret = 0;
5588
5589 /*
5590 * immobile means "not-on-lru" paes. If immobile is larger than
5591 * removable-by-driver pages reported by notifier, we'll fail.
5592 */
5593
5594out:
5595 if (!ret) {
5596 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5597 move_freepages_block(zone, page, MIGRATE_ISOLATE);
5598 }
5599
5600 spin_unlock_irqrestore(&zone->lock, flags);
5601 if (!ret)
5602 drain_all_pages();
5603 return ret;
5604}
5605
5606void unset_migratetype_isolate(struct page *page, unsigned migratetype)
5607{
5608 struct zone *zone;
5609 unsigned long flags;
5610 zone = page_zone(page);
5611 spin_lock_irqsave(&zone->lock, flags);
5612 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
5613 goto out;
5614 set_pageblock_migratetype(page, migratetype);
5615 move_freepages_block(zone, page, migratetype);
5616out:
5617 spin_unlock_irqrestore(&zone->lock, flags);
5618} 5600}
5619 5601
5620#ifdef CONFIG_CMA 5602#ifdef CONFIG_CMA
@@ -5869,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
5869} 5851}
5870#endif 5852#endif
5871 5853
5854#ifdef CONFIG_MEMORY_HOTPLUG
5855static int __meminit __zone_pcp_update(void *data)
5856{
5857 struct zone *zone = data;
5858 int cpu;
5859 unsigned long batch = zone_batchsize(zone), flags;
5860
5861 for_each_possible_cpu(cpu) {
5862 struct per_cpu_pageset *pset;
5863 struct per_cpu_pages *pcp;
5864
5865 pset = per_cpu_ptr(zone->pageset, cpu);
5866 pcp = &pset->pcp;
5867
5868 local_irq_save(flags);
5869 if (pcp->count > 0)
5870 free_pcppages_bulk(zone, pcp->count, pcp);
5871 setup_pageset(pset, batch);
5872 local_irq_restore(flags);
5873 }
5874 return 0;
5875}
5876
5877void __meminit zone_pcp_update(struct zone *zone)
5878{
5879 stop_machine(__zone_pcp_update, zone, NULL);
5880}
5881#endif
5882
5872#ifdef CONFIG_MEMORY_HOTREMOVE 5883#ifdef CONFIG_MEMORY_HOTREMOVE
5884void zone_pcp_reset(struct zone *zone)
5885{
5886 unsigned long flags;
5887
5888 /* avoid races with drain_pages() */
5889 local_irq_save(flags);
5890 if (zone->pageset != &boot_pageset) {
5891 free_percpu(zone->pageset);
5892 zone->pageset = &boot_pageset;
5893 }
5894 local_irq_restore(flags);
5895}
5896
5873/* 5897/*
5874 * All pages in the range must be isolated before calling this. 5898 * All pages in the range must be isolated before calling this.
5875 */ 5899 */
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index eb750f851395..5ddad0c6daa6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -317,7 +317,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
317#endif 317#endif
318 318
319 319
320#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 320#ifdef CONFIG_MEMCG_SWAP
321 321
322static DEFINE_MUTEX(swap_cgroup_mutex); 322static DEFINE_MUTEX(swap_cgroup_mutex);
323struct swap_cgroup_ctrl { 323struct swap_cgroup_ctrl {
diff --git a/mm/page_io.c b/mm/page_io.c
index 34f02923744c..78eee32ee486 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -17,6 +17,7 @@
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/bio.h> 18#include <linux/bio.h>
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/buffer_head.h>
20#include <linux/writeback.h> 21#include <linux/writeback.h>
21#include <linux/frontswap.h> 22#include <linux/frontswap.h>
22#include <asm/pgtable.h> 23#include <asm/pgtable.h>
@@ -86,6 +87,98 @@ void end_swap_bio_read(struct bio *bio, int err)
86 bio_put(bio); 87 bio_put(bio);
87} 88}
88 89
90int generic_swapfile_activate(struct swap_info_struct *sis,
91 struct file *swap_file,
92 sector_t *span)
93{
94 struct address_space *mapping = swap_file->f_mapping;
95 struct inode *inode = mapping->host;
96 unsigned blocks_per_page;
97 unsigned long page_no;
98 unsigned blkbits;
99 sector_t probe_block;
100 sector_t last_block;
101 sector_t lowest_block = -1;
102 sector_t highest_block = 0;
103 int nr_extents = 0;
104 int ret;
105
106 blkbits = inode->i_blkbits;
107 blocks_per_page = PAGE_SIZE >> blkbits;
108
109 /*
110 * Map all the blocks into the extent list. This code doesn't try
111 * to be very smart.
112 */
113 probe_block = 0;
114 page_no = 0;
115 last_block = i_size_read(inode) >> blkbits;
116 while ((probe_block + blocks_per_page) <= last_block &&
117 page_no < sis->max) {
118 unsigned block_in_page;
119 sector_t first_block;
120
121 first_block = bmap(inode, probe_block);
122 if (first_block == 0)
123 goto bad_bmap;
124
125 /*
126 * It must be PAGE_SIZE aligned on-disk
127 */
128 if (first_block & (blocks_per_page - 1)) {
129 probe_block++;
130 goto reprobe;
131 }
132
133 for (block_in_page = 1; block_in_page < blocks_per_page;
134 block_in_page++) {
135 sector_t block;
136
137 block = bmap(inode, probe_block + block_in_page);
138 if (block == 0)
139 goto bad_bmap;
140 if (block != first_block + block_in_page) {
141 /* Discontiguity */
142 probe_block++;
143 goto reprobe;
144 }
145 }
146
147 first_block >>= (PAGE_SHIFT - blkbits);
148 if (page_no) { /* exclude the header page */
149 if (first_block < lowest_block)
150 lowest_block = first_block;
151 if (first_block > highest_block)
152 highest_block = first_block;
153 }
154
155 /*
156 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
157 */
158 ret = add_swap_extent(sis, page_no, 1, first_block);
159 if (ret < 0)
160 goto out;
161 nr_extents += ret;
162 page_no++;
163 probe_block += blocks_per_page;
164reprobe:
165 continue;
166 }
167 ret = nr_extents;
168 *span = 1 + highest_block - lowest_block;
169 if (page_no == 0)
170 page_no = 1; /* force Empty message */
171 sis->max = page_no;
172 sis->pages = page_no - 1;
173 sis->highest_bit = page_no - 1;
174out:
175 return ret;
176bad_bmap:
177 printk(KERN_ERR "swapon: swapfile has holes\n");
178 ret = -EINVAL;
179 goto out;
180}
181
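generic_swapfile_activate() packages the old bmap-based extent walk so that filesystems keeping swapfiles as ordinary block-mapped files can reuse it. Assuming the ->swap_activate address_space operation introduced elsewhere in this series, a filesystem with no special requirements could simply delegate to it (hypothetical example, not from the patch):

	static int examplefs_swap_activate(struct swap_info_struct *sis,
					   struct file *file, sector_t *span)
	{
		/* no special setup needed: map extents straight from the block map */
		return generic_swapfile_activate(sis, file, span);
	}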
89/* 182/*
90 * We may have stale swap cache pages in memory: notice 183 * We may have stale swap cache pages in memory: notice
91 * them here and get rid of the unnecessary final write. 184 * them here and get rid of the unnecessary final write.
@@ -94,6 +187,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
94{ 187{
95 struct bio *bio; 188 struct bio *bio;
96 int ret = 0, rw = WRITE; 189 int ret = 0, rw = WRITE;
190 struct swap_info_struct *sis = page_swap_info(page);
97 191
98 if (try_to_free_swap(page)) { 192 if (try_to_free_swap(page)) {
99 unlock_page(page); 193 unlock_page(page);
@@ -105,6 +199,33 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
105 end_page_writeback(page); 199 end_page_writeback(page);
106 goto out; 200 goto out;
107 } 201 }
202
203 if (sis->flags & SWP_FILE) {
204 struct kiocb kiocb;
205 struct file *swap_file = sis->swap_file;
206 struct address_space *mapping = swap_file->f_mapping;
207 struct iovec iov = {
208 .iov_base = kmap(page),
209 .iov_len = PAGE_SIZE,
210 };
211
212 init_sync_kiocb(&kiocb, swap_file);
213 kiocb.ki_pos = page_file_offset(page);
214 kiocb.ki_left = PAGE_SIZE;
215 kiocb.ki_nbytes = PAGE_SIZE;
216
217 unlock_page(page);
218 ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
219 &kiocb, &iov,
220 kiocb.ki_pos, 1);
221 kunmap(page);
222 if (ret == PAGE_SIZE) {
223 count_vm_event(PSWPOUT);
224 ret = 0;
225 }
226 return ret;
227 }
228
108 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); 229 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
109 if (bio == NULL) { 230 if (bio == NULL) {
110 set_page_dirty(page); 231 set_page_dirty(page);
@@ -126,6 +247,7 @@ int swap_readpage(struct page *page)
126{ 247{
127 struct bio *bio; 248 struct bio *bio;
128 int ret = 0; 249 int ret = 0;
250 struct swap_info_struct *sis = page_swap_info(page);
129 251
130 VM_BUG_ON(!PageLocked(page)); 252 VM_BUG_ON(!PageLocked(page));
131 VM_BUG_ON(PageUptodate(page)); 253 VM_BUG_ON(PageUptodate(page));
@@ -134,6 +256,17 @@ int swap_readpage(struct page *page)
134 unlock_page(page); 256 unlock_page(page);
135 goto out; 257 goto out;
136 } 258 }
259
260 if (sis->flags & SWP_FILE) {
261 struct file *swap_file = sis->swap_file;
262 struct address_space *mapping = swap_file->f_mapping;
263
264 ret = mapping->a_ops->readpage(swap_file, page);
265 if (!ret)
266 count_vm_event(PSWPIN);
267 return ret;
268 }
269
137 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); 270 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
138 if (bio == NULL) { 271 if (bio == NULL) {
139 unlock_page(page); 272 unlock_page(page);
@@ -145,3 +278,15 @@ int swap_readpage(struct page *page)
145out: 278out:
146 return ret; 279 return ret;
147} 280}
281
282int swap_set_page_dirty(struct page *page)
283{
284 struct swap_info_struct *sis = page_swap_info(page);
285
286 if (sis->flags & SWP_FILE) {
287 struct address_space *mapping = sis->swap_file->f_mapping;
288 return mapping->a_ops->set_page_dirty(page);
289 } else {
290 return __set_page_dirty_no_writeback(page);
291 }
292}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c9f04774f2b8..247d1f175739 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -5,8 +5,101 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
7#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
8#include <linux/memory.h>
8#include "internal.h" 9#include "internal.h"
9 10
11/* called while holding zone->lock */
12static void set_pageblock_isolate(struct page *page)
13{
14 if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
15 return;
16
17 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
18 page_zone(page)->nr_pageblock_isolate++;
19}
20
21/* called while holding zone->lock */
22static void restore_pageblock_isolate(struct page *page, int migratetype)
23{
24 struct zone *zone = page_zone(page);
25 if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
26 return;
27
28 BUG_ON(zone->nr_pageblock_isolate <= 0);
29 set_pageblock_migratetype(page, migratetype);
30 zone->nr_pageblock_isolate--;
31}
32
33int set_migratetype_isolate(struct page *page)
34{
35 struct zone *zone;
36 unsigned long flags, pfn;
37 struct memory_isolate_notify arg;
38 int notifier_ret;
39 int ret = -EBUSY;
40
41 zone = page_zone(page);
42
43 spin_lock_irqsave(&zone->lock, flags);
44
45 pfn = page_to_pfn(page);
46 arg.start_pfn = pfn;
47 arg.nr_pages = pageblock_nr_pages;
48 arg.pages_found = 0;
49
50 /*
51 * It may be possible to isolate a pageblock even if the
52 * migratetype is not MIGRATE_MOVABLE. The memory isolation
53 * notifier chain is used by balloon drivers to return the
54 * number of pages in a range that are held by the balloon
55 * driver to shrink memory. If all the pages are accounted for
56 * by balloons, are free, or on the LRU, isolation can continue.
57 * Later, for example, when memory hotplug notifier runs, these
58 * pages reported as "can be isolated" should be isolated(freed)
59 * by the balloon driver through the memory notifier chain.
60 */
61 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
62 notifier_ret = notifier_to_errno(notifier_ret);
63 if (notifier_ret)
64 goto out;
65 /*
66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
67 * We just check MOVABLE pages.
68 */
69 if (!has_unmovable_pages(zone, page, arg.pages_found))
70 ret = 0;
71
72 /*
73 * "Immobile" means not-on-LRU pages. If there are more immobile pages
74 * than removable-by-driver pages reported by the notifier, we'll fail.
75 */
76
77out:
78 if (!ret) {
79 set_pageblock_isolate(page);
80 move_freepages_block(zone, page, MIGRATE_ISOLATE);
81 }
82
83 spin_unlock_irqrestore(&zone->lock, flags);
84 if (!ret)
85 drain_all_pages();
86 return ret;
87}
88
89void unset_migratetype_isolate(struct page *page, unsigned migratetype)
90{
91 struct zone *zone;
92 unsigned long flags;
93 zone = page_zone(page);
94 spin_lock_irqsave(&zone->lock, flags);
95 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
96 goto out;
97 move_freepages_block(zone, page, migratetype);
98 restore_pageblock_isolate(page, migratetype);
99out:
100 spin_unlock_irqrestore(&zone->lock, flags);
101}
102
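The two static helpers above keep zone->nr_pageblock_isolate in step with the migratetype, which is what nr_zone_isolate_freepages() in page_alloc.c relies on. A condensed sketch of the intended round trip for a single pageblock; the real users operate on pfn ranges:

	static int example_take_pageblock_offline(struct page *page)
	{
		if (set_migratetype_isolate(page))
			return -EBUSY;		/* unmovable pages or notifier veto */

		/* ... migrate or offline the pages in this block ... */

		unset_migratetype_isolate(page, MIGRATE_MOVABLE);
		return 0;
	}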
10static inline struct page * 103static inline struct page *
11__first_valid_page(unsigned long pfn, unsigned long nr_pages) 104__first_valid_page(unsigned long pfn, unsigned long nr_pages)
12{ 105{
diff --git a/mm/shmem.c b/mm/shmem.c
index c15b998e5a86..d4e184e2a38e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -929,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
929 929
930 /* Create a pseudo vma that just contains the policy */ 930 /* Create a pseudo vma that just contains the policy */
931 pvma.vm_start = 0; 931 pvma.vm_start = 0;
932 pvma.vm_pgoff = index; 932 /* Bias interleave by inode number to distribute better across nodes */
933 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
933 pvma.vm_ops = NULL; 934 pvma.vm_ops = NULL;
934 pvma.vm_policy = spol; 935 pvma.vm_policy = spol;
935 return swapin_readahead(swap, gfp, &pvma, 0); 936 return swapin_readahead(swap, gfp, &pvma, 0);
@@ -942,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp,
942 943
943 /* Create a pseudo vma that just contains the policy */ 944 /* Create a pseudo vma that just contains the policy */
944 pvma.vm_start = 0; 945 pvma.vm_start = 0;
945 pvma.vm_pgoff = index; 946 /* Bias interleave by inode number to distribute better across nodes */
947 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
946 pvma.vm_ops = NULL; 948 pvma.vm_ops = NULL;
947 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 949 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
948 950
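The interleave node for a tmpfs page is derived, roughly, from the pseudo-vma's vm_pgoff modulo the number of allowed nodes (see offset_il_node() in mempolicy.c), so without the bias every file starts allocating on the same node. With invented inode numbers and a 4-node policy, the bias rotates each file's starting node:

	#include <stdio.h>

	int main(void)
	{
		/* Invented values: 4 allowed nodes, two tmpfs files. */
		unsigned long nr_nodes = 4;
		unsigned long ino_a = 100, ino_b = 101;
		unsigned long index;

		for (index = 0; index < 4; index++)
			printf("page %lu: file A -> node %lu, file B -> node %lu\n",
			       index,
			       (index + ino_a) % nr_nodes,	/* biased vm_pgoff */
			       (index + ino_b) % nr_nodes);
		return 0;
	}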
diff --git a/mm/slab.c b/mm/slab.c
index 1fcf3ac94b6c..f8b0d539b482 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -118,12 +118,16 @@
118#include <linux/memory.h> 118#include <linux/memory.h>
119#include <linux/prefetch.h> 119#include <linux/prefetch.h>
120 120
121#include <net/sock.h>
122
121#include <asm/cacheflush.h> 123#include <asm/cacheflush.h>
122#include <asm/tlbflush.h> 124#include <asm/tlbflush.h>
123#include <asm/page.h> 125#include <asm/page.h>
124 126
125#include <trace/events/kmem.h> 127#include <trace/events/kmem.h>
126 128
129#include "internal.h"
130
127/* 131/*
128 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 132 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
129 * 0 for faster, smaller code (especially in the critical paths). 133 * 0 for faster, smaller code (especially in the critical paths).
@@ -152,6 +156,12 @@
152#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 156#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
153#endif 157#endif
154 158
159/*
160 * true if a page was allocated from pfmemalloc reserves for network-based
161 * swap
162 */
163static bool pfmemalloc_active __read_mostly;
164
155/* Legal flag mask for kmem_cache_create(). */ 165/* Legal flag mask for kmem_cache_create(). */
156#if DEBUG 166#if DEBUG
157# define CREATE_MASK (SLAB_RED_ZONE | \ 167# define CREATE_MASK (SLAB_RED_ZONE | \
@@ -257,9 +267,30 @@ struct array_cache {
257 * Must have this definition in here for the proper 267 * Must have this definition in here for the proper
258 * alignment of array_cache. Also simplifies accessing 268 * alignment of array_cache. Also simplifies accessing
259 * the entries. 269 * the entries.
270 *
271 * Entries should not be dereferenced directly, as
272 * entries belonging to slabs marked pfmemalloc will
273 * have the low bit SLAB_OBJ_PFMEMALLOC set.
260 */ 274 */
261}; 275};
262 276
277#define SLAB_OBJ_PFMEMALLOC 1
278static inline bool is_obj_pfmemalloc(void *objp)
279{
280 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
281}
282
283static inline void set_obj_pfmemalloc(void **objp)
284{
285 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
286 return;
287}
288
289static inline void clear_obj_pfmemalloc(void **objp)
290{
291 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
292}
293
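The helpers above borrow the low bit of the object pointer as a flag, which works because slab objects are always aligned to more than one byte. A standalone userspace analogue of the same trick (names invented):

	#include <assert.h>
	#include <stdint.h>
	#include <stdlib.h>

	#define OBJ_TAG 1UL	/* plays the role of SLAB_OBJ_PFMEMALLOC */

	static int   is_tagged(void *p) { return (uintptr_t)p & OBJ_TAG; }
	static void *tag(void *p)       { return (void *)((uintptr_t)p | OBJ_TAG); }
	static void *untag(void *p)     { return (void *)((uintptr_t)p & ~OBJ_TAG); }

	int main(void)
	{
		void *obj = malloc(64);		/* malloc returns suitably aligned memory */
		void *stored = tag(obj);	/* remember "came from reserves" in bit 0 */

		assert(is_tagged(stored));
		assert(untag(stored) == obj);	/* original pointer recovered intact */

		free(obj);
		return 0;
	}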
263/* 294/*
264 * bootstrap: The caches do not work without cpuarrays anymore, but the 295 * bootstrap: The caches do not work without cpuarrays anymore, but the
265 * cpuarrays are allocated from the generic caches... 296 * cpuarrays are allocated from the generic caches...
@@ -900,6 +931,124 @@ static struct array_cache *alloc_arraycache(int node, int entries,
900 return nc; 931 return nc;
901} 932}
902 933
934static inline bool is_slab_pfmemalloc(struct slab *slabp)
935{
936 struct page *page = virt_to_page(slabp->s_mem);
937
938 return PageSlabPfmemalloc(page);
939}
940
941/* Clears pfmemalloc_active if no slabs have pfmemalloc set */
942static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
943 struct array_cache *ac)
944{
945 struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
946 struct slab *slabp;
947 unsigned long flags;
948
949 if (!pfmemalloc_active)
950 return;
951
952 spin_lock_irqsave(&l3->list_lock, flags);
953 list_for_each_entry(slabp, &l3->slabs_full, list)
954 if (is_slab_pfmemalloc(slabp))
955 goto out;
956
957 list_for_each_entry(slabp, &l3->slabs_partial, list)
958 if (is_slab_pfmemalloc(slabp))
959 goto out;
960
961 list_for_each_entry(slabp, &l3->slabs_free, list)
962 if (is_slab_pfmemalloc(slabp))
963 goto out;
964
965 pfmemalloc_active = false;
966out:
967 spin_unlock_irqrestore(&l3->list_lock, flags);
968}
969
970static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
971 gfp_t flags, bool force_refill)
972{
973 int i;
974 void *objp = ac->entry[--ac->avail];
975
976 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
977 if (unlikely(is_obj_pfmemalloc(objp))) {
978 struct kmem_list3 *l3;
979
980 if (gfp_pfmemalloc_allowed(flags)) {
981 clear_obj_pfmemalloc(&objp);
982 return objp;
983 }
984
985 /* The caller cannot use PFMEMALLOC objects, find another one */
986 for (i = 1; i < ac->avail; i++) {
987 /* If a !PFMEMALLOC object is found, swap them */
988 if (!is_obj_pfmemalloc(ac->entry[i])) {
989 objp = ac->entry[i];
990 ac->entry[i] = ac->entry[ac->avail];
991 ac->entry[ac->avail] = objp;
992 return objp;
993 }
994 }
995
996 /*
997 * If there are empty slabs on the slabs_free list and we are
998 * being forced to refill the cache, mark this one !pfmemalloc.
999 */
1000 l3 = cachep->nodelists[numa_mem_id()];
1001 if (!list_empty(&l3->slabs_free) && force_refill) {
1002 struct slab *slabp = virt_to_slab(objp);
1003 ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem));
1004 clear_obj_pfmemalloc(&objp);
1005 recheck_pfmemalloc_active(cachep, ac);
1006 return objp;
1007 }
1008
1009 /* No !PFMEMALLOC objects available */
1010 ac->avail++;
1011 objp = NULL;
1012 }
1013
1014 return objp;
1015}
1016
1017static inline void *ac_get_obj(struct kmem_cache *cachep,
1018 struct array_cache *ac, gfp_t flags, bool force_refill)
1019{
1020 void *objp;
1021
1022 if (unlikely(sk_memalloc_socks()))
1023 objp = __ac_get_obj(cachep, ac, flags, force_refill);
1024 else
1025 objp = ac->entry[--ac->avail];
1026
1027 return objp;
1028}
1029
1030static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1031 void *objp)
1032{
1033 if (unlikely(pfmemalloc_active)) {
1034 /* Some pfmemalloc slabs exist, check if this is one */
1035 struct page *page = virt_to_page(objp);
1036 if (PageSlabPfmemalloc(page))
1037 set_obj_pfmemalloc(&objp);
1038 }
1039
1040 return objp;
1041}
1042
1043static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1044 void *objp)
1045{
1046 if (unlikely(sk_memalloc_socks()))
1047 objp = __ac_put_obj(cachep, ac, objp);
1048
1049 ac->entry[ac->avail++] = objp;
1050}
1051
903/* 1052/*
904 * Transfer objects in one arraycache to another. 1053 * Transfer objects in one arraycache to another.
905 * Locking must be handled by the caller. 1054 * Locking must be handled by the caller.
@@ -1076,7 +1225,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1076 STATS_INC_ACOVERFLOW(cachep); 1225 STATS_INC_ACOVERFLOW(cachep);
1077 __drain_alien_cache(cachep, alien, nodeid); 1226 __drain_alien_cache(cachep, alien, nodeid);
1078 } 1227 }
1079 alien->entry[alien->avail++] = objp; 1228 ac_put_obj(cachep, alien, objp);
1080 spin_unlock(&alien->lock); 1229 spin_unlock(&alien->lock);
1081 } else { 1230 } else {
1082 spin_lock(&(cachep->nodelists[nodeid])->list_lock); 1231 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
@@ -1759,6 +1908,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1759 return NULL; 1908 return NULL;
1760 } 1909 }
1761 1910
1911 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1912 if (unlikely(page->pfmemalloc))
1913 pfmemalloc_active = true;
1914
1762 nr_pages = (1 << cachep->gfporder); 1915 nr_pages = (1 << cachep->gfporder);
1763 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1916 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1764 add_zone_page_state(page_zone(page), 1917 add_zone_page_state(page_zone(page),
@@ -1766,9 +1919,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1766 else 1919 else
1767 add_zone_page_state(page_zone(page), 1920 add_zone_page_state(page_zone(page),
1768 NR_SLAB_UNRECLAIMABLE, nr_pages); 1921 NR_SLAB_UNRECLAIMABLE, nr_pages);
1769 for (i = 0; i < nr_pages; i++) 1922 for (i = 0; i < nr_pages; i++) {
1770 __SetPageSlab(page + i); 1923 __SetPageSlab(page + i);
1771 1924
1925 if (page->pfmemalloc)
1926 SetPageSlabPfmemalloc(page + i);
1927 }
1928
1772 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1929 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1773 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1930 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1774 1931
@@ -1800,6 +1957,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1800 NR_SLAB_UNRECLAIMABLE, nr_freed); 1957 NR_SLAB_UNRECLAIMABLE, nr_freed);
1801 while (i--) { 1958 while (i--) {
1802 BUG_ON(!PageSlab(page)); 1959 BUG_ON(!PageSlab(page));
1960 __ClearPageSlabPfmemalloc(page);
1803 __ClearPageSlab(page); 1961 __ClearPageSlab(page);
1804 page++; 1962 page++;
1805 } 1963 }
@@ -3015,16 +3173,19 @@ bad:
3015#define check_slabp(x,y) do { } while(0) 3173#define check_slabp(x,y) do { } while(0)
3016#endif 3174#endif
3017 3175
3018static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 3176static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
3177 bool force_refill)
3019{ 3178{
3020 int batchcount; 3179 int batchcount;
3021 struct kmem_list3 *l3; 3180 struct kmem_list3 *l3;
3022 struct array_cache *ac; 3181 struct array_cache *ac;
3023 int node; 3182 int node;
3024 3183
3025retry:
3026 check_irq_off(); 3184 check_irq_off();
3027 node = numa_mem_id(); 3185 node = numa_mem_id();
3186 if (unlikely(force_refill))
3187 goto force_grow;
3188retry:
3028 ac = cpu_cache_get(cachep); 3189 ac = cpu_cache_get(cachep);
3029 batchcount = ac->batchcount; 3190 batchcount = ac->batchcount;
3030 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 3191 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3074,8 +3235,8 @@ retry:
3074 STATS_INC_ACTIVE(cachep); 3235 STATS_INC_ACTIVE(cachep);
3075 STATS_SET_HIGH(cachep); 3236 STATS_SET_HIGH(cachep);
3076 3237
3077 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, 3238 ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
3078 node); 3239 node));
3079 } 3240 }
3080 check_slabp(cachep, slabp); 3241 check_slabp(cachep, slabp);
3081 3242
@@ -3094,18 +3255,22 @@ alloc_done:
3094 3255
3095 if (unlikely(!ac->avail)) { 3256 if (unlikely(!ac->avail)) {
3096 int x; 3257 int x;
3258force_grow:
3097 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 3259 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3098 3260
3099 /* cache_grow can reenable interrupts, then ac could change. */ 3261 /* cache_grow can reenable interrupts, then ac could change. */
3100 ac = cpu_cache_get(cachep); 3262 ac = cpu_cache_get(cachep);
3101 if (!x && ac->avail == 0) /* no objects in sight? abort */ 3263
3264 /* no objects in sight? abort */
3265 if (!x && (ac->avail == 0 || force_refill))
3102 return NULL; 3266 return NULL;
3103 3267
3104 if (!ac->avail) /* objects refilled by interrupt? */ 3268 if (!ac->avail) /* objects refilled by interrupt? */
3105 goto retry; 3269 goto retry;
3106 } 3270 }
3107 ac->touched = 1; 3271 ac->touched = 1;
3108 return ac->entry[--ac->avail]; 3272
3273 return ac_get_obj(cachep, ac, flags, force_refill);
3109} 3274}
3110 3275
3111static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 3276static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -3187,23 +3352,35 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3187{ 3352{
3188 void *objp; 3353 void *objp;
3189 struct array_cache *ac; 3354 struct array_cache *ac;
3355 bool force_refill = false;
3190 3356
3191 check_irq_off(); 3357 check_irq_off();
3192 3358
3193 ac = cpu_cache_get(cachep); 3359 ac = cpu_cache_get(cachep);
3194 if (likely(ac->avail)) { 3360 if (likely(ac->avail)) {
3195 STATS_INC_ALLOCHIT(cachep);
3196 ac->touched = 1; 3361 ac->touched = 1;
3197 objp = ac->entry[--ac->avail]; 3362 objp = ac_get_obj(cachep, ac, flags, false);
3198 } else { 3363
3199 STATS_INC_ALLOCMISS(cachep);
3200 objp = cache_alloc_refill(cachep, flags);
3201 /* 3364 /*
3202 * the 'ac' may be updated by cache_alloc_refill(), 3365 * Allow for the possibility all avail objects are not allowed
3203 * and kmemleak_erase() requires its correct value. 3366 * by the current flags
3204 */ 3367 */
3205 ac = cpu_cache_get(cachep); 3368 if (objp) {
3369 STATS_INC_ALLOCHIT(cachep);
3370 goto out;
3371 }
3372 force_refill = true;
3206 } 3373 }
3374
3375 STATS_INC_ALLOCMISS(cachep);
3376 objp = cache_alloc_refill(cachep, flags, force_refill);
3377 /*
3378 * the 'ac' may be updated by cache_alloc_refill(),
3379 * and kmemleak_erase() requires its correct value.
3380 */
3381 ac = cpu_cache_get(cachep);
3382
3383out:
3207 /* 3384 /*
3208 * To avoid a false negative, if an object that is in one of the 3385 * To avoid a false negative, if an object that is in one of the
3209 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3386 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
@@ -3525,9 +3702,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3525 struct kmem_list3 *l3; 3702 struct kmem_list3 *l3;
3526 3703
3527 for (i = 0; i < nr_objects; i++) { 3704 for (i = 0; i < nr_objects; i++) {
3528 void *objp = objpp[i]; 3705 void *objp;
3529 struct slab *slabp; 3706 struct slab *slabp;
3530 3707
3708 clear_obj_pfmemalloc(&objpp[i]);
3709 objp = objpp[i];
3710
3531 slabp = virt_to_slab(objp); 3711 slabp = virt_to_slab(objp);
3532 l3 = cachep->nodelists[node]; 3712 l3 = cachep->nodelists[node];
3533 list_del(&slabp->list); 3713 list_del(&slabp->list);
@@ -3645,7 +3825,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3645 cache_flusharray(cachep, ac); 3825 cache_flusharray(cachep, ac);
3646 } 3826 }
3647 3827
3648 ac->entry[ac->avail++] = objp; 3828 ac_put_obj(cachep, ac, objp);
3649} 3829}
3650 3830
3651/** 3831/**
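
The slab.c hunks above track objects taken from PFMEMALLOC (emergency-reserve) slabs by tagging bit 0 of the object pointer in set_obj_pfmemalloc()/clear_obj_pfmemalloc(), which is safe because slab objects are at least word-aligned. A minimal userspace sketch of that pointer-tagging round trip, not part of the patch and using a stand-in flag value:

#include <assert.h>
#include <stdio.h>

/* Stand-in for SLAB_OBJ_PFMEMALLOC: bit 0 of the pointer is free
 * to carry a flag because slab objects are at least word-aligned. */
#define OBJ_PFMEMALLOC 0x1UL

static void set_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp | OBJ_PFMEMALLOC);
}

static void clear_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp & ~OBJ_PFMEMALLOC);
}

static int is_obj_pfmemalloc(void *objp)
{
	return (unsigned long)objp & OBJ_PFMEMALLOC;
}

int main(void)
{
	long storage;			/* stands in for a slab object */
	void *obj = &storage;

	set_obj_pfmemalloc(&obj);
	assert(is_obj_pfmemalloc(obj));
	clear_obj_pfmemalloc(&obj);
	assert(obj == (void *)&storage);	/* original pointer restored */
	printf("pointer-tag round trip ok\n");
	return 0;
}
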
diff --git a/mm/slub.c b/mm/slub.c
index e517d435e5dc..8f78e2577031 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -34,6 +34,8 @@
34 34
35#include <trace/events/kmem.h> 35#include <trace/events/kmem.h>
36 36
37#include "internal.h"
38
37/* 39/*
38 * Lock order: 40 * Lock order:
39 * 1. slab_mutex (Global Mutex) 41 * 1. slab_mutex (Global Mutex)
@@ -1354,6 +1356,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1354 inc_slabs_node(s, page_to_nid(page), page->objects); 1356 inc_slabs_node(s, page_to_nid(page), page->objects);
1355 page->slab = s; 1357 page->slab = s;
1356 __SetPageSlab(page); 1358 __SetPageSlab(page);
1359 if (page->pfmemalloc)
1360 SetPageSlabPfmemalloc(page);
1357 1361
1358 start = page_address(page); 1362 start = page_address(page);
1359 1363
@@ -1397,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1397 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1401 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1398 -pages); 1402 -pages);
1399 1403
1404 __ClearPageSlabPfmemalloc(page);
1400 __ClearPageSlab(page); 1405 __ClearPageSlab(page);
1401 reset_page_mapcount(page); 1406 reset_page_mapcount(page);
1402 if (current->reclaim_state) 1407 if (current->reclaim_state)
@@ -2126,6 +2131,14 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2126 return freelist; 2131 return freelist;
2127} 2132}
2128 2133
2134static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
2135{
2136 if (unlikely(PageSlabPfmemalloc(page)))
2137 return gfp_pfmemalloc_allowed(gfpflags);
2138
2139 return true;
2140}
2141
2129/* 2142/*
2130 * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist 2143 * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist
2131 * or deactivate the page. 2144 * or deactivate the page.
@@ -2206,6 +2219,18 @@ redo:
2206 goto new_slab; 2219 goto new_slab;
2207 } 2220 }
2208 2221
2222 /*
2223 * By rights, we should be searching for a slab page that was
2224 * PFMEMALLOC but right now, we are losing the pfmemalloc
2225 * information when the page leaves the per-cpu allocator
2226 */
2227 if (unlikely(!pfmemalloc_match(page, gfpflags))) {
2228 deactivate_slab(s, page, c->freelist);
2229 c->page = NULL;
2230 c->freelist = NULL;
2231 goto new_slab;
2232 }
2233
2209 /* must check again c->freelist in case of cpu migration or IRQ */ 2234 /* must check again c->freelist in case of cpu migration or IRQ */
2210 freelist = c->freelist; 2235 freelist = c->freelist;
2211 if (freelist) 2236 if (freelist)
@@ -2256,11 +2281,11 @@ new_slab:
2256 } 2281 }
2257 2282
2258 page = c->page; 2283 page = c->page;
2259 if (likely(!kmem_cache_debug(s))) 2284 if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2260 goto load_freelist; 2285 goto load_freelist;
2261 2286
2262 /* Only entered in the debug case */ 2287 /* Only entered in the debug case */
2263 if (!alloc_debug_processing(s, page, freelist, addr)) 2288 if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
2264 goto new_slab; /* Slab failed checks. Next slab needed */ 2289 goto new_slab; /* Slab failed checks. Next slab needed */
2265 2290
2266 deactivate_slab(s, page, get_freepointer(s, freelist)); 2291 deactivate_slab(s, page, get_freepointer(s, freelist));
@@ -2313,7 +2338,6 @@ redo:
2313 object = c->freelist; 2338 object = c->freelist;
2314 page = c->page; 2339 page = c->page;
2315 if (unlikely(!object || !node_match(page, node))) 2340 if (unlikely(!object || !node_match(page, node)))
2316
2317 object = __slab_alloc(s, gfpflags, node, addr, c); 2341 object = __slab_alloc(s, gfpflags, node, addr, c);
2318 2342
2319 else { 2343 else {
diff --git a/mm/sparse.c b/mm/sparse.c
index c7bb952400c8..fac95f2888f2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -65,21 +65,18 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
65 65
66 if (slab_is_available()) { 66 if (slab_is_available()) {
67 if (node_state(nid, N_HIGH_MEMORY)) 67 if (node_state(nid, N_HIGH_MEMORY))
68 section = kmalloc_node(array_size, GFP_KERNEL, nid); 68 section = kzalloc_node(array_size, GFP_KERNEL, nid);
69 else 69 else
70 section = kmalloc(array_size, GFP_KERNEL); 70 section = kzalloc(array_size, GFP_KERNEL);
71 } else 71 } else {
72 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 72 section = alloc_bootmem_node(NODE_DATA(nid), array_size);
73 73 }
74 if (section)
75 memset(section, 0, array_size);
76 74
77 return section; 75 return section;
78} 76}
79 77
80static int __meminit sparse_index_init(unsigned long section_nr, int nid) 78static int __meminit sparse_index_init(unsigned long section_nr, int nid)
81{ 79{
82 static DEFINE_SPINLOCK(index_init_lock);
83 unsigned long root = SECTION_NR_TO_ROOT(section_nr); 80 unsigned long root = SECTION_NR_TO_ROOT(section_nr);
84 struct mem_section *section; 81 struct mem_section *section;
85 int ret = 0; 82 int ret = 0;
@@ -90,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
90 section = sparse_index_alloc(nid); 87 section = sparse_index_alloc(nid);
91 if (!section) 88 if (!section)
92 return -ENOMEM; 89 return -ENOMEM;
93 /*
94 * This lock keeps two different sections from
95 * reallocating for the same index
96 */
97 spin_lock(&index_init_lock);
98
99 if (mem_section[root]) {
100 ret = -EEXIST;
101 goto out;
102 }
103 90
104 mem_section[root] = section; 91 mem_section[root] = section;
105out: 92
106 spin_unlock(&index_init_lock);
107 return ret; 93 return ret;
108} 94}
109#else /* !SPARSEMEM_EXTREME */ 95#else /* !SPARSEMEM_EXTREME */
@@ -132,6 +118,8 @@ int __section_nr(struct mem_section* ms)
132 break; 118 break;
133 } 119 }
134 120
121 VM_BUG_ON(root_nr == NR_SECTION_ROOTS);
122
135 return (root_nr * SECTIONS_PER_ROOT) + (ms - root); 123 return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
136} 124}
137 125
@@ -493,6 +481,9 @@ void __init sparse_init(void)
493 struct page **map_map; 481 struct page **map_map;
494#endif 482#endif
495 483
484 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
485 set_pageblock_order();
486
496 /* 487 /*
497 * map is using big page (aka 2M in x86 64 bit) 488 * map is using big page (aka 2M in x86 64 bit)
498 * usemap is less one page (aka 24 bytes) 489 * usemap is less one page (aka 24 bytes)
diff --git a/mm/swap.c b/mm/swap.c
index 4e7e2ec67078..77825883298f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -236,6 +236,58 @@ void put_pages_list(struct list_head *pages)
236} 236}
237EXPORT_SYMBOL(put_pages_list); 237EXPORT_SYMBOL(put_pages_list);
238 238
239/*
240 * get_kernel_pages() - pin kernel pages in memory
241 * @kiov: An array of struct kvec structures
242 * @nr_segs: number of segments to pin
243 * @write: pinning for read/write, currently ignored
244 * @pages: array that receives pointers to the pages pinned.
245 * Should be at least nr_segs long.
246 *
247 * Returns number of pages pinned. This may be fewer than the number
248 * requested. If nr_segs is 0 or negative, returns 0. If no pages
249 * were pinned, returns -errno. Each page returned must be released
250 * with a put_page() call when it is finished with.
251 */
252int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
253 struct page **pages)
254{
255 int seg;
256
257 for (seg = 0; seg < nr_segs; seg++) {
258 if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
259 return seg;
260
261 pages[seg] = kmap_to_page(kiov[seg].iov_base);
262 page_cache_get(pages[seg]);
263 }
264
265 return seg;
266}
267EXPORT_SYMBOL_GPL(get_kernel_pages);
268
269/*
270 * get_kernel_page() - pin a kernel page in memory
271 * @start: starting kernel address
272 * @write: pinning for read/write, currently ignored
273 * @pages: array that receives pointer to the page pinned.
274 * Must have space for one struct page pointer.
275 *
276 * Returns 1 if page is pinned. If the page was not pinned, returns
277 * -errno. The page returned must be released with a put_page() call
278 * when it is finished with.
279 */
280int get_kernel_page(unsigned long start, int write, struct page **pages)
281{
282 const struct kvec kiov = {
283 .iov_base = (void *)start,
284 .iov_len = PAGE_SIZE
285 };
286
287 return get_kernel_pages(&kiov, 1, write, pages);
288}
289EXPORT_SYMBOL_GPL(get_kernel_page);
290
239static void pagevec_lru_move_fn(struct pagevec *pvec, 291static void pagevec_lru_move_fn(struct pagevec *pvec,
240 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), 292 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
241 void *arg) 293 void *arg)
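
A hedged sketch of how a caller might use the get_kernel_page() helper introduced above to pin the page behind a kernel address for I/O. do_io_on_kernel_buffer() and its error handling are illustrative only, and the <linux/mm.h> declaration is assumed from this series; only get_kernel_page() and put_page() are real kernel APIs here.

#include <linux/errno.h>
#include <linux/mm.h>

/* Illustrative only: pin the page backing a kernel address, use it for
 * I/O, then drop the reference taken by get_kernel_page(). */
static int do_io_on_kernel_buffer(unsigned long addr)
{
	struct page *page;
	int ret;

	ret = get_kernel_page(addr, 0 /* write: currently ignored */, &page);
	if (ret < 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... submit I/O against 'page' here ... */

	put_page(page);		/* release the pin */
	return 0;
}
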
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 4c5ff7f284d9..0cb36fb1f61c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/blkdev.h>
17#include <linux/pagevec.h> 18#include <linux/pagevec.h>
18#include <linux/migrate.h> 19#include <linux/migrate.h>
19#include <linux/page_cgroup.h> 20#include <linux/page_cgroup.h>
@@ -26,7 +27,7 @@
26 */ 27 */
27static const struct address_space_operations swap_aops = { 28static const struct address_space_operations swap_aops = {
28 .writepage = swap_writepage, 29 .writepage = swap_writepage,
29 .set_page_dirty = __set_page_dirty_no_writeback, 30 .set_page_dirty = swap_set_page_dirty,
30 .migratepage = migrate_page, 31 .migratepage = migrate_page,
31}; 32};
32 33
@@ -376,6 +377,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
376 unsigned long offset = swp_offset(entry); 377 unsigned long offset = swp_offset(entry);
377 unsigned long start_offset, end_offset; 378 unsigned long start_offset, end_offset;
378 unsigned long mask = (1UL << page_cluster) - 1; 379 unsigned long mask = (1UL << page_cluster) - 1;
380 struct blk_plug plug;
379 381
380 /* Read a page_cluster sized and aligned cluster around offset. */ 382 /* Read a page_cluster sized and aligned cluster around offset. */
381 start_offset = offset & ~mask; 383 start_offset = offset & ~mask;
@@ -383,6 +385,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
383 if (!start_offset) /* First page is swap header. */ 385 if (!start_offset) /* First page is swap header. */
384 start_offset++; 386 start_offset++;
385 387
388 blk_start_plug(&plug);
386 for (offset = start_offset; offset <= end_offset ; offset++) { 389 for (offset = start_offset; offset <= end_offset ; offset++) {
387 /* Ok, do the async read-ahead now */ 390 /* Ok, do the async read-ahead now */
388 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 391 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
@@ -391,6 +394,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
391 continue; 394 continue;
392 page_cache_release(page); 395 page_cache_release(page);
393 } 396 }
397 blk_finish_plug(&plug);
398
394 lru_add_drain(); /* Push any new pages onto the LRU now */ 399 lru_add_drain(); /* Push any new pages onto the LRU now */
395 return read_swap_cache_async(entry, gfp_mask, vma, addr); 400 return read_swap_cache_async(entry, gfp_mask, vma, addr);
396} 401}
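
The swapin_readahead() change above brackets the read-ahead loop with a block plug so the queued reads can be merged before they are dispatched. A minimal sketch of that plugging pattern; submit_cluster_read() is a hypothetical stand-in for read_swap_cache_async():

#include <linux/blkdev.h>

/* Hypothetical helper standing in for read_swap_cache_async(). */
static void submit_cluster_read(unsigned long offset)
{
	/* would kick off one async swap-cache read here */
}

static void read_cluster_plugged(unsigned long start, unsigned long end)
{
	struct blk_plug plug;
	unsigned long offset;

	blk_start_plug(&plug);		/* start batching block requests */
	for (offset = start; offset <= end; offset++)
		submit_cluster_read(offset);
	blk_finish_plug(&plug);		/* flush the whole batch at once */
}
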
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 71373d03fcee..14e254c768fc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,6 +33,7 @@
33#include <linux/oom.h> 33#include <linux/oom.h>
34#include <linux/frontswap.h> 34#include <linux/frontswap.h>
35#include <linux/swapfile.h> 35#include <linux/swapfile.h>
36#include <linux/export.h>
36 37
37#include <asm/pgtable.h> 38#include <asm/pgtable.h>
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -548,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
548 549
549 /* free if no reference */ 550 /* free if no reference */
550 if (!usage) { 551 if (!usage) {
551 struct gendisk *disk = p->bdev->bd_disk;
552 if (offset < p->lowest_bit) 552 if (offset < p->lowest_bit)
553 p->lowest_bit = offset; 553 p->lowest_bit = offset;
554 if (offset > p->highest_bit) 554 if (offset > p->highest_bit)
@@ -559,9 +559,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
559 nr_swap_pages++; 559 nr_swap_pages++;
560 p->inuse_pages--; 560 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset); 561 frontswap_invalidate_page(p->type, offset);
562 if ((p->flags & SWP_BLKDEV) && 562 if (p->flags & SWP_BLKDEV) {
563 disk->fops->swap_slot_free_notify) 563 struct gendisk *disk = p->bdev->bd_disk;
564 disk->fops->swap_slot_free_notify(p->bdev, offset); 564 if (disk->fops->swap_slot_free_notify)
565 disk->fops->swap_slot_free_notify(p->bdev,
566 offset);
567 }
565 } 568 }
566 569
567 return usage; 570 return usage;
@@ -832,8 +835,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
832 835
833 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 836 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
834 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 837 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
835 if (ret > 0) 838 mem_cgroup_cancel_charge_swapin(memcg);
836 mem_cgroup_cancel_charge_swapin(memcg);
837 ret = 0; 839 ret = 0;
838 goto out; 840 goto out;
839 } 841 }
@@ -1328,6 +1330,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1328 list_del(&se->list); 1330 list_del(&se->list);
1329 kfree(se); 1331 kfree(se);
1330 } 1332 }
1333
1334 if (sis->flags & SWP_FILE) {
1335 struct file *swap_file = sis->swap_file;
1336 struct address_space *mapping = swap_file->f_mapping;
1337
1338 sis->flags &= ~SWP_FILE;
1339 mapping->a_ops->swap_deactivate(swap_file);
1340 }
1331} 1341}
1332 1342
1333/* 1343/*
@@ -1336,7 +1346,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1336 * 1346 *
1337 * This function rather assumes that it is called in ascending page order. 1347 * This function rather assumes that it is called in ascending page order.
1338 */ 1348 */
1339static int 1349int
1340add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 1350add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1341 unsigned long nr_pages, sector_t start_block) 1351 unsigned long nr_pages, sector_t start_block)
1342{ 1352{
@@ -1409,98 +1419,28 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1409 */ 1419 */
1410static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 1420static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1411{ 1421{
1412 struct inode *inode; 1422 struct file *swap_file = sis->swap_file;
1413 unsigned blocks_per_page; 1423 struct address_space *mapping = swap_file->f_mapping;
1414 unsigned long page_no; 1424 struct inode *inode = mapping->host;
1415 unsigned blkbits;
1416 sector_t probe_block;
1417 sector_t last_block;
1418 sector_t lowest_block = -1;
1419 sector_t highest_block = 0;
1420 int nr_extents = 0;
1421 int ret; 1425 int ret;
1422 1426
1423 inode = sis->swap_file->f_mapping->host;
1424 if (S_ISBLK(inode->i_mode)) { 1427 if (S_ISBLK(inode->i_mode)) {
1425 ret = add_swap_extent(sis, 0, sis->max, 0); 1428 ret = add_swap_extent(sis, 0, sis->max, 0);
1426 *span = sis->pages; 1429 *span = sis->pages;
1427 goto out; 1430 return ret;
1428 } 1431 }
1429 1432
1430 blkbits = inode->i_blkbits; 1433 if (mapping->a_ops->swap_activate) {
1431 blocks_per_page = PAGE_SIZE >> blkbits; 1434 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
1432 1435 if (!ret) {
1433 /* 1436 sis->flags |= SWP_FILE;
1434 * Map all the blocks into the extent list. This code doesn't try 1437 ret = add_swap_extent(sis, 0, sis->max, 0);
1435 * to be very smart. 1438 *span = sis->pages;
1436 */
1437 probe_block = 0;
1438 page_no = 0;
1439 last_block = i_size_read(inode) >> blkbits;
1440 while ((probe_block + blocks_per_page) <= last_block &&
1441 page_no < sis->max) {
1442 unsigned block_in_page;
1443 sector_t first_block;
1444
1445 first_block = bmap(inode, probe_block);
1446 if (first_block == 0)
1447 goto bad_bmap;
1448
1449 /*
1450 * It must be PAGE_SIZE aligned on-disk
1451 */
1452 if (first_block & (blocks_per_page - 1)) {
1453 probe_block++;
1454 goto reprobe;
1455 }
1456
1457 for (block_in_page = 1; block_in_page < blocks_per_page;
1458 block_in_page++) {
1459 sector_t block;
1460
1461 block = bmap(inode, probe_block + block_in_page);
1462 if (block == 0)
1463 goto bad_bmap;
1464 if (block != first_block + block_in_page) {
1465 /* Discontiguity */
1466 probe_block++;
1467 goto reprobe;
1468 }
1469 }
1470
1471 first_block >>= (PAGE_SHIFT - blkbits);
1472 if (page_no) { /* exclude the header page */
1473 if (first_block < lowest_block)
1474 lowest_block = first_block;
1475 if (first_block > highest_block)
1476 highest_block = first_block;
1477 } 1439 }
1440 return ret;
1441 }
1478 1442
1479 /* 1443 return generic_swapfile_activate(sis, swap_file, span);
1480 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
1481 */
1482 ret = add_swap_extent(sis, page_no, 1, first_block);
1483 if (ret < 0)
1484 goto out;
1485 nr_extents += ret;
1486 page_no++;
1487 probe_block += blocks_per_page;
1488reprobe:
1489 continue;
1490 }
1491 ret = nr_extents;
1492 *span = 1 + highest_block - lowest_block;
1493 if (page_no == 0)
1494 page_no = 1; /* force Empty message */
1495 sis->max = page_no;
1496 sis->pages = page_no - 1;
1497 sis->highest_bit = page_no - 1;
1498out:
1499 return ret;
1500bad_bmap:
1501 printk(KERN_ERR "swapon: swapfile has holes\n");
1502 ret = -EINVAL;
1503 goto out;
1504} 1444}
1505 1445
1506static void enable_swap_info(struct swap_info_struct *p, int prio, 1446static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -2285,6 +2225,31 @@ int swapcache_prepare(swp_entry_t entry)
2285 return __swap_duplicate(entry, SWAP_HAS_CACHE); 2225 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2286} 2226}
2287 2227
2228struct swap_info_struct *page_swap_info(struct page *page)
2229{
2230 swp_entry_t swap = { .val = page_private(page) };
2231 BUG_ON(!PageSwapCache(page));
2232 return swap_info[swp_type(swap)];
2233}
2234
2235/*
2236 * out-of-line __page_file_ methods to avoid include hell.
2237 */
2238struct address_space *__page_file_mapping(struct page *page)
2239{
2240 VM_BUG_ON(!PageSwapCache(page));
2241 return page_swap_info(page)->swap_file->f_mapping;
2242}
2243EXPORT_SYMBOL_GPL(__page_file_mapping);
2244
2245pgoff_t __page_file_index(struct page *page)
2246{
2247 swp_entry_t swap = { .val = page_private(page) };
2248 VM_BUG_ON(!PageSwapCache(page));
2249 return swp_offset(swap);
2250}
2251EXPORT_SYMBOL_GPL(__page_file_index);
2252
2288/* 2253/*
2289 * add_swap_count_continuation - called when a swap count is duplicated 2254 * add_swap_count_continuation - called when a swap count is duplicated
2290 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 2255 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
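
With the rewritten setup_swap_extents() above, a swap file on a filesystem that provides the new swap_activate/swap_deactivate address_space operations bypasses the old bmap() walk entirely. A hedged sketch of how a filesystem might wire up those hooks; the example_* functions are hypothetical stubs, and only the two operation names and their signatures come from this series.

#include <linux/fs.h>
#include <linux/swap.h>

static int example_swap_activate(struct swap_info_struct *sis,
				 struct file *swap_file, sector_t *span)
{
	/* Map the file's blocks (e.g. via add_swap_extent()), set *span,
	 * and return 0 so swapfile.c marks the area SWP_FILE. */
	return 0;
}

static void example_swap_deactivate(struct file *swap_file)
{
	/* Undo whatever example_swap_activate() set up. */
}

static const struct address_space_operations example_swap_aops = {
	.swap_activate	 = example_swap_activate,
	.swap_deactivate = example_swap_deactivate,
};
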
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e03f4c7307a5..2bb90b1d241c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -413,11 +413,11 @@ nocache:
413 if (addr + size - 1 < addr) 413 if (addr + size - 1 < addr)
414 goto overflow; 414 goto overflow;
415 415
416 n = rb_next(&first->rb_node); 416 if (list_is_last(&first->list, &vmap_area_list))
417 if (n)
418 first = rb_entry(n, struct vmap_area, rb_node);
419 else
420 goto found; 417 goto found;
418
419 first = list_entry(first->list.next,
420 struct vmap_area, list);
421 } 421 }
422 422
423found: 423found:
@@ -904,6 +904,14 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
904 904
905 BUG_ON(size & ~PAGE_MASK); 905 BUG_ON(size & ~PAGE_MASK);
906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
907 if (WARN_ON(size == 0)) {
908 /*
909 * Allocating 0 bytes isn't what the caller wants, since
910 * get_order(0) returns a funny result. Just warn and terminate
911 * early.
912 */
913 return NULL;
914 }
907 order = get_order(size); 915 order = get_order(size);
908 916
909again: 917again:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 347b3ff2a478..8d01243d9560 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */
133static LIST_HEAD(shrinker_list); 133static LIST_HEAD(shrinker_list);
134static DECLARE_RWSEM(shrinker_rwsem); 134static DECLARE_RWSEM(shrinker_rwsem);
135 135
136#ifdef CONFIG_CGROUP_MEM_RES_CTLR 136#ifdef CONFIG_MEMCG
137static bool global_reclaim(struct scan_control *sc) 137static bool global_reclaim(struct scan_control *sc)
138{ 138{
139 return !sc->target_mem_cgroup; 139 return !sc->target_mem_cgroup;
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
687 687
688 cond_resched(); 688 cond_resched();
689 689
690 mem_cgroup_uncharge_start();
690 while (!list_empty(page_list)) { 691 while (!list_empty(page_list)) {
691 enum page_references references; 692 enum page_references references;
692 struct address_space *mapping; 693 struct address_space *mapping;
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list,
720 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 721 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
721 722
722 if (PageWriteback(page)) { 723 if (PageWriteback(page)) {
723 nr_writeback++; 724 /*
724 unlock_page(page); 725 * memcg doesn't have any dirty pages throttling so we
725 goto keep; 726 * could easily OOM just because too many pages are in
727 * writeback and there is nothing else to reclaim.
728 *
729 * Check __GFP_IO, certainly because a loop driver
730 * thread might enter reclaim, and deadlock if it waits
731 * on a page for which it is needed to do the write
732 * (loop masks off __GFP_IO|__GFP_FS for this reason);
733 * but more thought would probably show more reasons.
734 *
735 * Don't require __GFP_FS, since we're not going into
736 * the FS, just waiting on its writeback completion.
737 * Worryingly, ext4 gfs2 and xfs allocate pages with
738 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
739 * testing may_enter_fs here is liable to OOM on them.
740 */
741 if (global_reclaim(sc) ||
742 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
743 /*
744 * This is slightly racy - end_page_writeback()
745 * might have just cleared PageReclaim, then
746 * setting PageReclaim here ends up interpreted
747 * as PageReadahead - but that does not matter
748 * enough to care. What we do want is for this
749 * page to have PageReclaim set next time memcg
750 * reclaim reaches the tests above, so it will
751 * then wait_on_page_writeback() to avoid OOM;
752 * and it's also appropriate in global reclaim.
753 */
754 SetPageReclaim(page);
755 nr_writeback++;
756 goto keep_locked;
757 }
758 wait_on_page_writeback(page);
726 } 759 }
727 760
728 references = page_check_references(page, sc); 761 references = page_check_references(page, sc);
@@ -921,6 +954,7 @@ keep:
921 954
922 list_splice(&ret_pages, page_list); 955 list_splice(&ret_pages, page_list);
923 count_vm_events(PGACTIVATE, pgactivate); 956 count_vm_events(PGACTIVATE, pgactivate);
957 mem_cgroup_uncharge_end();
924 *ret_nr_dirty += nr_dirty; 958 *ret_nr_dirty += nr_dirty;
925 *ret_nr_writeback += nr_writeback; 959 *ret_nr_writeback += nr_writeback;
926 return nr_reclaimed; 960 return nr_reclaimed;
@@ -2112,6 +2146,83 @@ out:
2112 return 0; 2146 return 0;
2113} 2147}
2114 2148
2149static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2150{
2151 struct zone *zone;
2152 unsigned long pfmemalloc_reserve = 0;
2153 unsigned long free_pages = 0;
2154 int i;
2155 bool wmark_ok;
2156
2157 for (i = 0; i <= ZONE_NORMAL; i++) {
2158 zone = &pgdat->node_zones[i];
2159 pfmemalloc_reserve += min_wmark_pages(zone);
2160 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2161 }
2162
2163 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2164
2165 /* kswapd must be awake if processes are being throttled */
2166 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2167 pgdat->classzone_idx = min(pgdat->classzone_idx,
2168 (enum zone_type)ZONE_NORMAL);
2169 wake_up_interruptible(&pgdat->kswapd_wait);
2170 }
2171
2172 return wmark_ok;
2173}
2174
2175/*
2176 * Throttle direct reclaimers if backing storage is backed by the network
2177 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2178 * depleted. kswapd will continue to make progress and wake the processes
2179 * when the low watermark is reached
2180 */
2181static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2182 nodemask_t *nodemask)
2183{
2184 struct zone *zone;
2185 int high_zoneidx = gfp_zone(gfp_mask);
2186 pg_data_t *pgdat;
2187
2188 /*
2189 * Kernel threads should not be throttled as they may be indirectly
2190 * responsible for cleaning pages necessary for reclaim to make forward
2191 * progress. kjournald for example may enter direct reclaim while
2192 * committing a transaction where throttling it could force other
2193 * processes to block on log_wait_commit().
2194 */
2195 if (current->flags & PF_KTHREAD)
2196 return;
2197
2198 /* Check if the pfmemalloc reserves are ok */
2199 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2200 pgdat = zone->zone_pgdat;
2201 if (pfmemalloc_watermark_ok(pgdat))
2202 return;
2203
2204 /* Account for the throttling */
2205 count_vm_event(PGSCAN_DIRECT_THROTTLE);
2206
2207 /*
2208 * If the caller cannot enter the filesystem, it's possible that it
2209 * is due to the caller holding an FS lock or performing a journal
2210 * transaction in the case of a filesystem like ext[3|4]. In this case,
2211 * it is not safe to block on pfmemalloc_wait as kswapd could be
2212 * blocked waiting on the same lock. Instead, throttle for up to a
2213 * second before continuing.
2214 */
2215 if (!(gfp_mask & __GFP_FS)) {
2216 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2217 pfmemalloc_watermark_ok(pgdat), HZ);
2218 return;
2219 }
2220
2221 /* Throttle until kswapd wakes the process */
2222 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2223 pfmemalloc_watermark_ok(pgdat));
2224}
2225
2115unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2226unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2116 gfp_t gfp_mask, nodemask_t *nodemask) 2227 gfp_t gfp_mask, nodemask_t *nodemask)
2117{ 2228{
@@ -2131,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2131 .gfp_mask = sc.gfp_mask, 2242 .gfp_mask = sc.gfp_mask,
2132 }; 2243 };
2133 2244
2245 throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
2246
2247 /*
2248 * Do not enter reclaim if fatal signal is pending. 1 is returned so
2249 * that the page allocator does not consider triggering OOM
2250 */
2251 if (fatal_signal_pending(current))
2252 return 1;
2253
2134 trace_mm_vmscan_direct_reclaim_begin(order, 2254 trace_mm_vmscan_direct_reclaim_begin(order,
2135 sc.may_writepage, 2255 sc.may_writepage,
2136 gfp_mask); 2256 gfp_mask);
@@ -2142,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2142 return nr_reclaimed; 2262 return nr_reclaimed;
2143} 2263}
2144 2264
2145#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2265#ifdef CONFIG_MEMCG
2146 2266
2147unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, 2267unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2148 gfp_t gfp_mask, bool noswap, 2268 gfp_t gfp_mask, bool noswap,
@@ -2275,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2275 return balanced_pages >= (present_pages >> 2); 2395 return balanced_pages >= (present_pages >> 2);
2276} 2396}
2277 2397
2278/* is kswapd sleeping prematurely? */ 2398/*
2279static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, 2399 * Prepare kswapd for sleeping. This verifies that there are no processes
2400 * waiting in throttle_direct_reclaim() and that watermarks have been met.
2401 *
2402 * Returns true if kswapd is ready to sleep
2403 */
2404static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2280 int classzone_idx) 2405 int classzone_idx)
2281{ 2406{
2282 int i; 2407 int i;
@@ -2285,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2285 2410
2286 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2411 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2287 if (remaining) 2412 if (remaining)
2288 return true; 2413 return false;
2414
2415 /*
2416 * There is a potential race between when kswapd checks its watermarks
2417 * and a process gets throttled. There is also a potential race if
2418 * processes get throttled, kswapd wakes, and a large process exits, thereby
2419 * balancing the zones, which causes kswapd to miss a wakeup. If kswapd
2420 * is going to sleep, no process should be sleeping on pfmemalloc_wait
2421 * so wake them now if necessary. If necessary, processes will wake
2422 * kswapd and get throttled again
2423 */
2424 if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2425 wake_up(&pgdat->pfmemalloc_wait);
2426 return false;
2427 }
2289 2428
2290 /* Check the watermark levels */ 2429 /* Check the watermark levels */
2291 for (i = 0; i <= classzone_idx; i++) { 2430 for (i = 0; i <= classzone_idx; i++) {
@@ -2318,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2318 * must be balanced 2457 * must be balanced
2319 */ 2458 */
2320 if (order) 2459 if (order)
2321 return !pgdat_balanced(pgdat, balanced, classzone_idx); 2460 return pgdat_balanced(pgdat, balanced, classzone_idx);
2322 else 2461 else
2323 return !all_zones_ok; 2462 return all_zones_ok;
2324} 2463}
2325 2464
2326/* 2465/*
@@ -2546,6 +2685,16 @@ loop_again:
2546 } 2685 }
2547 2686
2548 } 2687 }
2688
2689 /*
2690 * If the low watermark is met there is no need for processes
2690 * to be throttled on pfmemalloc_wait as they should be able
2691 * to safely make forward progress. Wake them.
2693 */
2694 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
2695 pfmemalloc_watermark_ok(pgdat))
2696 wake_up(&pgdat->pfmemalloc_wait);
2697
2549 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2698 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2550 break; /* kswapd: all done */ 2699 break; /* kswapd: all done */
2551 /* 2700 /*
@@ -2647,7 +2796,7 @@ out:
2647 } 2796 }
2648 2797
2649 /* 2798 /*
2650 * Return the order we were reclaiming at so sleeping_prematurely() 2799 * Return the order we were reclaiming at so prepare_kswapd_sleep()
2651 * makes a decision on the order we were last reclaiming at. However, 2800 * makes a decision on the order we were last reclaiming at. However,
2652 * if another caller entered the allocator slow path while kswapd 2801 * if another caller entered the allocator slow path while kswapd
2653 * was awake, order will remain at the higher level 2802 * was awake, order will remain at the higher level
@@ -2667,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2667 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2816 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2668 2817
2669 /* Try to sleep for a short interval */ 2818 /* Try to sleep for a short interval */
2670 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2819 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2671 remaining = schedule_timeout(HZ/10); 2820 remaining = schedule_timeout(HZ/10);
2672 finish_wait(&pgdat->kswapd_wait, &wait); 2821 finish_wait(&pgdat->kswapd_wait, &wait);
2673 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2822 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2677,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2677 * After a short sleep, check if it was a premature sleep. If not, then 2826 * After a short sleep, check if it was a premature sleep. If not, then
2678 * go fully to sleep until explicitly woken up. 2827 * go fully to sleep until explicitly woken up.
2679 */ 2828 */
2680 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2829 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2681 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 2830 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2682 2831
2683 /* 2832 /*
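
pfmemalloc_watermark_ok() above throttles direct reclaimers once the free pages in the zones up to ZONE_NORMAL drop below half of their summed min watermarks. A standalone sketch of just that arithmetic, with made-up numbers:

#include <stdbool.h>
#include <stdio.h>

struct zone_sample {
	unsigned long min_wmark;	/* min watermark, in pages */
	unsigned long free_pages;	/* NR_FREE_PAGES, in pages */
};

/* Same test as pfmemalloc_watermark_ok(): free pages must exceed half
 * of the summed min watermarks of the lower zones. */
static bool watermark_ok(const struct zone_sample *zones, int nr)
{
	unsigned long reserve = 0, nr_free = 0;
	int i;

	for (i = 0; i < nr; i++) {	/* zones up to ZONE_NORMAL */
		reserve += zones[i].min_wmark;
		nr_free += zones[i].free_pages;
	}
	return nr_free > reserve / 2;
}

int main(void)
{
	struct zone_sample zones[] = { { 1024, 700 }, { 2048, 400 } };

	printf("throttle direct reclaim: %s\n",
	       watermark_ok(zones, 2) ? "no" : "yes");
	return 0;
}
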
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1bbbbd9776ad..df7a6748231d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -745,6 +745,7 @@ const char * const vmstat_text[] = {
745 TEXTS_FOR_ZONES("pgsteal_direct") 745 TEXTS_FOR_ZONES("pgsteal_direct")
746 TEXTS_FOR_ZONES("pgscan_kswapd") 746 TEXTS_FOR_ZONES("pgscan_kswapd")
747 TEXTS_FOR_ZONES("pgscan_direct") 747 TEXTS_FOR_ZONES("pgscan_direct")
748 "pgscan_direct_throttle",
748 749
749#ifdef CONFIG_NUMA 750#ifdef CONFIG_NUMA
750 "zone_reclaim_failed", 751 "zone_reclaim_failed",