Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig           |   2
-rw-r--r-- | mm/bounce.c          |   2
-rw-r--r-- | mm/compaction.c      |   7
-rw-r--r-- | mm/filemap.c         |  11
-rw-r--r-- | mm/huge_memory.c     |  10
-rw-r--r-- | mm/hugetlb.c         |  17
-rw-r--r-- | mm/hwpoison-inject.c |   5
-rw-r--r-- | mm/madvise.c         |   5
-rw-r--r-- | mm/memcontrol.c      | 703
-rw-r--r-- | mm/memory-failure.c  |   8
-rw-r--r-- | mm/memory.c          |  20
-rw-r--r-- | mm/migrate.c         |   4
-rw-r--r-- | mm/mlock.c           |   9
-rw-r--r-- | mm/mprotect.c        |   7
-rw-r--r-- | mm/mremap.c          |   5
-rw-r--r-- | mm/oom_kill.c        |   2
-rw-r--r-- | mm/page-writeback.c  |  10
-rw-r--r-- | mm/page_alloc.c      |   4
-rw-r--r-- | mm/slab_common.c     |   2
-rw-r--r-- | mm/swapfile.c        |   4
-rw-r--r-- | mm/vmscan.c          |  88
-rw-r--r-- | mm/zswap.c           |   4
22 files changed, 589 insertions, 340 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 026771a9b097..394838f489eb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -183,7 +183,7 @@ config MEMORY_HOTPLUG_SPARSE | |||
183 | config MEMORY_HOTREMOVE | 183 | config MEMORY_HOTREMOVE |
184 | bool "Allow for memory hot remove" | 184 | bool "Allow for memory hot remove" |
185 | select MEMORY_ISOLATION | 185 | select MEMORY_ISOLATION |
186 | select HAVE_BOOTMEM_INFO_NODE if X86_64 | 186 | select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64) |
187 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE | 187 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE |
188 | depends on MIGRATION | 188 | depends on MIGRATION |
189 | 189 | ||
diff --git a/mm/bounce.c b/mm/bounce.c
index c9f0a4339a7d..5a7d58fb883b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -204,6 +204,8 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, | |||
204 | struct bio_vec *to, *from; | 204 | struct bio_vec *to, *from; |
205 | unsigned i; | 205 | unsigned i; |
206 | 206 | ||
207 | if (force) | ||
208 | goto bounce; | ||
207 | bio_for_each_segment(from, *bio_orig, i) | 209 | bio_for_each_segment(from, *bio_orig, i) |
208 | if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) | 210 | if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) |
209 | goto bounce; | 211 | goto bounce; |
diff --git a/mm/compaction.c b/mm/compaction.c
index c43789388cd8..b5326b141a25 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -677,6 +677,13 @@ static void isolate_freepages(struct zone *zone, | |||
677 | pfn -= pageblock_nr_pages) { | 677 | pfn -= pageblock_nr_pages) { |
678 | unsigned long isolated; | 678 | unsigned long isolated; |
679 | 679 | ||
680 | /* | ||
681 | * This can iterate a massively long zone without finding any | ||
682 | * suitable migration targets, so periodically check if we need | ||
683 | * to schedule. | ||
684 | */ | ||
685 | cond_resched(); | ||
686 | |||
680 | if (!pfn_valid(pfn)) | 687 | if (!pfn_valid(pfn)) |
681 | continue; | 688 | continue; |
682 | 689 | ||
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6aec4a2d2e..ae4846ff4849 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1616 | struct inode *inode = mapping->host; | 1616 | struct inode *inode = mapping->host; |
1617 | pgoff_t offset = vmf->pgoff; | 1617 | pgoff_t offset = vmf->pgoff; |
1618 | struct page *page; | 1618 | struct page *page; |
1619 | bool memcg_oom; | ||
1620 | pgoff_t size; | 1619 | pgoff_t size; |
1621 | int ret = 0; | 1620 | int ret = 0; |
1622 | 1621 | ||
@@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1625 | return VM_FAULT_SIGBUS; | 1624 | return VM_FAULT_SIGBUS; |
1626 | 1625 | ||
1627 | /* | 1626 | /* |
1628 | * Do we have something in the page cache already? Either | 1627 | * Do we have something in the page cache already? |
1629 | * way, try readahead, but disable the memcg OOM killer for it | ||
1630 | * as readahead is optional and no errors are propagated up | ||
1631 | * the fault stack. The OOM killer is enabled while trying to | ||
1632 | * instantiate the faulting page individually below. | ||
1633 | */ | 1628 | */ |
1634 | page = find_get_page(mapping, offset); | 1629 | page = find_get_page(mapping, offset); |
1635 | if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { | 1630 | if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { |
@@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1637 | * We found the page, so try async readahead before | 1632 | * We found the page, so try async readahead before |
1638 | * waiting for the lock. | 1633 | * waiting for the lock. |
1639 | */ | 1634 | */ |
1640 | memcg_oom = mem_cgroup_toggle_oom(false); | ||
1641 | do_async_mmap_readahead(vma, ra, file, page, offset); | 1635 | do_async_mmap_readahead(vma, ra, file, page, offset); |
1642 | mem_cgroup_toggle_oom(memcg_oom); | ||
1643 | } else if (!page) { | 1636 | } else if (!page) { |
1644 | /* No page in the page cache at all */ | 1637 | /* No page in the page cache at all */ |
1645 | memcg_oom = mem_cgroup_toggle_oom(false); | ||
1646 | do_sync_mmap_readahead(vma, ra, file, offset); | 1638 | do_sync_mmap_readahead(vma, ra, file, offset); |
1647 | mem_cgroup_toggle_oom(memcg_oom); | ||
1648 | count_vm_event(PGMAJFAULT); | 1639 | count_vm_event(PGMAJFAULT); |
1649 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); | 1640 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); |
1650 | ret = VM_FAULT_MAJOR; | 1641 | ret = VM_FAULT_MAJOR; |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7489884682d8..610e3df2768a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2697,6 +2697,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, | |||
2697 | 2697 | ||
2698 | mmun_start = haddr; | 2698 | mmun_start = haddr; |
2699 | mmun_end = haddr + HPAGE_PMD_SIZE; | 2699 | mmun_end = haddr + HPAGE_PMD_SIZE; |
2700 | again: | ||
2700 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2701 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2701 | spin_lock(&mm->page_table_lock); | 2702 | spin_lock(&mm->page_table_lock); |
2702 | if (unlikely(!pmd_trans_huge(*pmd))) { | 2703 | if (unlikely(!pmd_trans_huge(*pmd))) { |
@@ -2719,7 +2720,14 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, | |||
2719 | split_huge_page(page); | 2720 | split_huge_page(page); |
2720 | 2721 | ||
2721 | put_page(page); | 2722 | put_page(page); |
2722 | BUG_ON(pmd_trans_huge(*pmd)); | 2723 | |
2724 | /* | ||
2725 | * We don't always have down_write of mmap_sem here: a racing | ||
2726 | * do_huge_pmd_wp_page() might have copied-on-write to another | ||
2727 | * huge page before our split_huge_page() got the anon_vma lock. | ||
2728 | */ | ||
2729 | if (unlikely(pmd_trans_huge(*pmd))) | ||
2730 | goto again; | ||
2723 | } | 2731 | } |
2724 | 2732 | ||
2725 | void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, | 2733 | void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b49579c7f2a5..0b7656e804d1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -653,6 +653,7 @@ static void free_huge_page(struct page *page) | |||
653 | BUG_ON(page_count(page)); | 653 | BUG_ON(page_count(page)); |
654 | BUG_ON(page_mapcount(page)); | 654 | BUG_ON(page_mapcount(page)); |
655 | restore_reserve = PagePrivate(page); | 655 | restore_reserve = PagePrivate(page); |
656 | ClearPagePrivate(page); | ||
656 | 657 | ||
657 | spin_lock(&hugetlb_lock); | 658 | spin_lock(&hugetlb_lock); |
658 | hugetlb_cgroup_uncharge_page(hstate_index(h), | 659 | hugetlb_cgroup_uncharge_page(hstate_index(h), |
@@ -695,8 +696,22 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
695 | /* we rely on prep_new_huge_page to set the destructor */ | 696 | /* we rely on prep_new_huge_page to set the destructor */ |
696 | set_compound_order(page, order); | 697 | set_compound_order(page, order); |
697 | __SetPageHead(page); | 698 | __SetPageHead(page); |
699 | __ClearPageReserved(page); | ||
698 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | 700 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { |
699 | __SetPageTail(p); | 701 | __SetPageTail(p); |
702 | /* | ||
703 | * For gigantic hugepages allocated through bootmem at | ||
704 | * boot, it's safer to be consistent with the not-gigantic | ||
705 | * hugepages and clear the PG_reserved bit from all tail pages | ||
706 | * too. Otherwise drivers using get_user_pages() to access tail | ||
707 | * pages may get the reference counting wrong if they see | ||
708 | * PG_reserved set on a tail page (despite the head page not | ||
709 | * having PG_reserved set). Enforcing this consistency between | ||
710 | * head and tail pages allows drivers to optimize away a check | ||
711 | * on the head page when they need know if put_page() is needed | ||
712 | * after get_user_pages(). | ||
713 | */ | ||
714 | __ClearPageReserved(p); | ||
700 | set_page_count(p, 0); | 715 | set_page_count(p, 0); |
701 | p->first_page = page; | 716 | p->first_page = page; |
702 | } | 717 | } |
@@ -1329,9 +1344,9 @@ static void __init gather_bootmem_prealloc(void) | |||
1329 | #else | 1344 | #else |
1330 | page = virt_to_page(m); | 1345 | page = virt_to_page(m); |
1331 | #endif | 1346 | #endif |
1332 | __ClearPageReserved(page); | ||
1333 | WARN_ON(page_count(page) != 1); | 1347 | WARN_ON(page_count(page) != 1); |
1334 | prep_compound_huge_page(page, h->order); | 1348 | prep_compound_huge_page(page, h->order); |
1349 | WARN_ON(PageReserved(page)); | ||
1335 | prep_new_huge_page(h, page, page_to_nid(page)); | 1350 | prep_new_huge_page(h, page, page_to_nid(page)); |
1336 | /* | 1351 | /* |
1337 | * If we had gigantic hugepages allocated at boot time, we need | 1352 | * If we had gigantic hugepages allocated at boot time, we need |
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index afc2daa91c60..4c84678371eb 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -20,8 +20,6 @@ static int hwpoison_inject(void *data, u64 val) | |||
20 | if (!capable(CAP_SYS_ADMIN)) | 20 | if (!capable(CAP_SYS_ADMIN)) |
21 | return -EPERM; | 21 | return -EPERM; |
22 | 22 | ||
23 | if (!hwpoison_filter_enable) | ||
24 | goto inject; | ||
25 | if (!pfn_valid(pfn)) | 23 | if (!pfn_valid(pfn)) |
26 | return -ENXIO; | 24 | return -ENXIO; |
27 | 25 | ||
@@ -33,6 +31,9 @@ static int hwpoison_inject(void *data, u64 val) | |||
33 | if (!get_page_unless_zero(hpage)) | 31 | if (!get_page_unless_zero(hpage)) |
34 | return 0; | 32 | return 0; |
35 | 33 | ||
34 | if (!hwpoison_filter_enable) | ||
35 | goto inject; | ||
36 | |||
36 | if (!PageLRU(p) && !PageHuge(p)) | 37 | if (!PageLRU(p) && !PageHuge(p)) |
37 | shake_page(p, 0); | 38 | shake_page(p, 0); |
38 | /* | 39 | /* |
diff --git a/mm/madvise.c b/mm/madvise.c
index 6975bc812542..539eeb96b323 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -343,10 +343,11 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
343 | */ | 343 | */ |
344 | static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) | 344 | static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) |
345 | { | 345 | { |
346 | struct page *p; | ||
346 | if (!capable(CAP_SYS_ADMIN)) | 347 | if (!capable(CAP_SYS_ADMIN)) |
347 | return -EPERM; | 348 | return -EPERM; |
348 | for (; start < end; start += PAGE_SIZE) { | 349 | for (; start < end; start += PAGE_SIZE << |
349 | struct page *p; | 350 | compound_order(compound_head(p))) { |
350 | int ret; | 351 | int ret; |
351 | 352 | ||
352 | ret = get_user_pages_fast(start, 1, 0, &p); | 353 | ret = get_user_pages_fast(start, 1, 0, &p); |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce13029..34d3ca9572d6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/limits.h> | 39 | #include <linux/limits.h> |
40 | #include <linux/export.h> | 40 | #include <linux/export.h> |
41 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
42 | #include <linux/rbtree.h> | ||
42 | #include <linux/slab.h> | 43 | #include <linux/slab.h> |
43 | #include <linux/swap.h> | 44 | #include <linux/swap.h> |
44 | #include <linux/swapops.h> | 45 | #include <linux/swapops.h> |
@@ -160,6 +161,10 @@ struct mem_cgroup_per_zone { | |||
160 | 161 | ||
161 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | 162 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; |
162 | 163 | ||
164 | struct rb_node tree_node; /* RB tree node */ | ||
165 | unsigned long long usage_in_excess;/* Set to the value by which */ | ||
166 | /* the soft limit is exceeded*/ | ||
167 | bool on_tree; | ||
163 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ | 168 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ |
164 | /* use container_of */ | 169 | /* use container_of */ |
165 | }; | 170 | }; |
@@ -168,6 +173,26 @@ struct mem_cgroup_per_node { | |||
168 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | 173 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; |
169 | }; | 174 | }; |
170 | 175 | ||
176 | /* | ||
177 | * Cgroups above their limits are maintained in a RB-Tree, independent of | ||
178 | * their hierarchy representation | ||
179 | */ | ||
180 | |||
181 | struct mem_cgroup_tree_per_zone { | ||
182 | struct rb_root rb_root; | ||
183 | spinlock_t lock; | ||
184 | }; | ||
185 | |||
186 | struct mem_cgroup_tree_per_node { | ||
187 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; | ||
188 | }; | ||
189 | |||
190 | struct mem_cgroup_tree { | ||
191 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; | ||
192 | }; | ||
193 | |||
194 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | ||
195 | |||
171 | struct mem_cgroup_threshold { | 196 | struct mem_cgroup_threshold { |
172 | struct eventfd_ctx *eventfd; | 197 | struct eventfd_ctx *eventfd; |
173 | u64 threshold; | 198 | u64 threshold; |
@@ -303,22 +328,6 @@ struct mem_cgroup { | |||
303 | atomic_t numainfo_events; | 328 | atomic_t numainfo_events; |
304 | atomic_t numainfo_updating; | 329 | atomic_t numainfo_updating; |
305 | #endif | 330 | #endif |
306 | /* | ||
307 | * Protects soft_contributed transitions. | ||
308 | * See mem_cgroup_update_soft_limit | ||
309 | */ | ||
310 | spinlock_t soft_lock; | ||
311 | |||
312 | /* | ||
313 | * If true then this group has increased parents' children_in_excess | ||
314 | * when it got over the soft limit. | ||
315 | * When a group falls bellow the soft limit, parents' children_in_excess | ||
316 | * is decreased and soft_contributed changed to false. | ||
317 | */ | ||
318 | bool soft_contributed; | ||
319 | |||
320 | /* Number of children that are in soft limit excess */ | ||
321 | atomic_t children_in_excess; | ||
322 | 331 | ||
323 | struct mem_cgroup_per_node *nodeinfo[0]; | 332 | struct mem_cgroup_per_node *nodeinfo[0]; |
324 | /* WARNING: nodeinfo must be the last member here */ | 333 | /* WARNING: nodeinfo must be the last member here */ |
@@ -422,6 +431,7 @@ static bool move_file(void) | |||
422 | * limit reclaim to prevent infinite loops, if they ever occur. | 431 | * limit reclaim to prevent infinite loops, if they ever occur. |
423 | */ | 432 | */ |
424 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 | 433 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 |
434 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 | ||
425 | 435 | ||
426 | enum charge_type { | 436 | enum charge_type { |
427 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 437 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
@@ -648,6 +658,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) | |||
648 | return mem_cgroup_zoneinfo(memcg, nid, zid); | 658 | return mem_cgroup_zoneinfo(memcg, nid, zid); |
649 | } | 659 | } |
650 | 660 | ||
661 | static struct mem_cgroup_tree_per_zone * | ||
662 | soft_limit_tree_node_zone(int nid, int zid) | ||
663 | { | ||
664 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
665 | } | ||
666 | |||
667 | static struct mem_cgroup_tree_per_zone * | ||
668 | soft_limit_tree_from_page(struct page *page) | ||
669 | { | ||
670 | int nid = page_to_nid(page); | ||
671 | int zid = page_zonenum(page); | ||
672 | |||
673 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
674 | } | ||
675 | |||
676 | static void | ||
677 | __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, | ||
678 | struct mem_cgroup_per_zone *mz, | ||
679 | struct mem_cgroup_tree_per_zone *mctz, | ||
680 | unsigned long long new_usage_in_excess) | ||
681 | { | ||
682 | struct rb_node **p = &mctz->rb_root.rb_node; | ||
683 | struct rb_node *parent = NULL; | ||
684 | struct mem_cgroup_per_zone *mz_node; | ||
685 | |||
686 | if (mz->on_tree) | ||
687 | return; | ||
688 | |||
689 | mz->usage_in_excess = new_usage_in_excess; | ||
690 | if (!mz->usage_in_excess) | ||
691 | return; | ||
692 | while (*p) { | ||
693 | parent = *p; | ||
694 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, | ||
695 | tree_node); | ||
696 | if (mz->usage_in_excess < mz_node->usage_in_excess) | ||
697 | p = &(*p)->rb_left; | ||
698 | /* | ||
699 | * We can't avoid mem cgroups that are over their soft | ||
700 | * limit by the same amount | ||
701 | */ | ||
702 | else if (mz->usage_in_excess >= mz_node->usage_in_excess) | ||
703 | p = &(*p)->rb_right; | ||
704 | } | ||
705 | rb_link_node(&mz->tree_node, parent, p); | ||
706 | rb_insert_color(&mz->tree_node, &mctz->rb_root); | ||
707 | mz->on_tree = true; | ||
708 | } | ||
709 | |||
710 | static void | ||
711 | __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, | ||
712 | struct mem_cgroup_per_zone *mz, | ||
713 | struct mem_cgroup_tree_per_zone *mctz) | ||
714 | { | ||
715 | if (!mz->on_tree) | ||
716 | return; | ||
717 | rb_erase(&mz->tree_node, &mctz->rb_root); | ||
718 | mz->on_tree = false; | ||
719 | } | ||
720 | |||
721 | static void | ||
722 | mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, | ||
723 | struct mem_cgroup_per_zone *mz, | ||
724 | struct mem_cgroup_tree_per_zone *mctz) | ||
725 | { | ||
726 | spin_lock(&mctz->lock); | ||
727 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
728 | spin_unlock(&mctz->lock); | ||
729 | } | ||
730 | |||
731 | |||
732 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) | ||
733 | { | ||
734 | unsigned long long excess; | ||
735 | struct mem_cgroup_per_zone *mz; | ||
736 | struct mem_cgroup_tree_per_zone *mctz; | ||
737 | int nid = page_to_nid(page); | ||
738 | int zid = page_zonenum(page); | ||
739 | mctz = soft_limit_tree_from_page(page); | ||
740 | |||
741 | /* | ||
742 | * Necessary to update all ancestors when hierarchy is used. | ||
743 | * because their event counter is not touched. | ||
744 | */ | ||
745 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { | ||
746 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); | ||
747 | excess = res_counter_soft_limit_excess(&memcg->res); | ||
748 | /* | ||
749 | * We have to update the tree if mz is on RB-tree or | ||
750 | * mem is over its softlimit. | ||
751 | */ | ||
752 | if (excess || mz->on_tree) { | ||
753 | spin_lock(&mctz->lock); | ||
754 | /* if on-tree, remove it */ | ||
755 | if (mz->on_tree) | ||
756 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
757 | /* | ||
758 | * Insert again. mz->usage_in_excess will be updated. | ||
759 | * If excess is 0, no tree ops. | ||
760 | */ | ||
761 | __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); | ||
762 | spin_unlock(&mctz->lock); | ||
763 | } | ||
764 | } | ||
765 | } | ||
766 | |||
767 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) | ||
768 | { | ||
769 | int node, zone; | ||
770 | struct mem_cgroup_per_zone *mz; | ||
771 | struct mem_cgroup_tree_per_zone *mctz; | ||
772 | |||
773 | for_each_node(node) { | ||
774 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
775 | mz = mem_cgroup_zoneinfo(memcg, node, zone); | ||
776 | mctz = soft_limit_tree_node_zone(node, zone); | ||
777 | mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
778 | } | ||
779 | } | ||
780 | } | ||
781 | |||
782 | static struct mem_cgroup_per_zone * | ||
783 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
784 | { | ||
785 | struct rb_node *rightmost = NULL; | ||
786 | struct mem_cgroup_per_zone *mz; | ||
787 | |||
788 | retry: | ||
789 | mz = NULL; | ||
790 | rightmost = rb_last(&mctz->rb_root); | ||
791 | if (!rightmost) | ||
792 | goto done; /* Nothing to reclaim from */ | ||
793 | |||
794 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); | ||
795 | /* | ||
796 | * Remove the node now but someone else can add it back, | ||
797 | * we will to add it back at the end of reclaim to its correct | ||
798 | * position in the tree. | ||
799 | */ | ||
800 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); | ||
801 | if (!res_counter_soft_limit_excess(&mz->memcg->res) || | ||
802 | !css_tryget(&mz->memcg->css)) | ||
803 | goto retry; | ||
804 | done: | ||
805 | return mz; | ||
806 | } | ||
807 | |||
808 | static struct mem_cgroup_per_zone * | ||
809 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
810 | { | ||
811 | struct mem_cgroup_per_zone *mz; | ||
812 | |||
813 | spin_lock(&mctz->lock); | ||
814 | mz = __mem_cgroup_largest_soft_limit_node(mctz); | ||
815 | spin_unlock(&mctz->lock); | ||
816 | return mz; | ||
817 | } | ||
818 | |||
651 | /* | 819 | /* |
652 | * Implementation Note: reading percpu statistics for memcg. | 820 | * Implementation Note: reading percpu statistics for memcg. |
653 | * | 821 | * |
@@ -698,6 +866,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | |||
698 | unsigned long val = 0; | 866 | unsigned long val = 0; |
699 | int cpu; | 867 | int cpu; |
700 | 868 | ||
869 | get_online_cpus(); | ||
701 | for_each_online_cpu(cpu) | 870 | for_each_online_cpu(cpu) |
702 | val += per_cpu(memcg->stat->events[idx], cpu); | 871 | val += per_cpu(memcg->stat->events[idx], cpu); |
703 | #ifdef CONFIG_HOTPLUG_CPU | 872 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -705,6 +874,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | |||
705 | val += memcg->nocpu_base.events[idx]; | 874 | val += memcg->nocpu_base.events[idx]; |
706 | spin_unlock(&memcg->pcp_counter_lock); | 875 | spin_unlock(&memcg->pcp_counter_lock); |
707 | #endif | 876 | #endif |
877 | put_online_cpus(); | ||
708 | return val; | 878 | return val; |
709 | } | 879 | } |
710 | 880 | ||
@@ -822,48 +992,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, | |||
822 | } | 992 | } |
823 | 993 | ||
824 | /* | 994 | /* |
825 | * Called from rate-limited memcg_check_events when enough | ||
826 | * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure | ||
827 | * that all the parents up the hierarchy will be notified that this group | ||
828 | * is in excess or that it is not in excess anymore. mmecg->soft_contributed | ||
829 | * makes the transition a single action whenever the state flips from one to | ||
830 | * the other. | ||
831 | */ | ||
832 | static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) | ||
833 | { | ||
834 | unsigned long long excess = res_counter_soft_limit_excess(&memcg->res); | ||
835 | struct mem_cgroup *parent = memcg; | ||
836 | int delta = 0; | ||
837 | |||
838 | spin_lock(&memcg->soft_lock); | ||
839 | if (excess) { | ||
840 | if (!memcg->soft_contributed) { | ||
841 | delta = 1; | ||
842 | memcg->soft_contributed = true; | ||
843 | } | ||
844 | } else { | ||
845 | if (memcg->soft_contributed) { | ||
846 | delta = -1; | ||
847 | memcg->soft_contributed = false; | ||
848 | } | ||
849 | } | ||
850 | |||
851 | /* | ||
852 | * Necessary to update all ancestors when hierarchy is used | ||
853 | * because their event counter is not touched. | ||
854 | * We track children even outside the hierarchy for the root | ||
855 | * cgroup because tree walk starting at root should visit | ||
856 | * all cgroups and we want to prevent from pointless tree | ||
857 | * walk if no children is below the limit. | ||
858 | */ | ||
859 | while (delta && (parent = parent_mem_cgroup(parent))) | ||
860 | atomic_add(delta, &parent->children_in_excess); | ||
861 | if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) | ||
862 | atomic_add(delta, &root_mem_cgroup->children_in_excess); | ||
863 | spin_unlock(&memcg->soft_lock); | ||
864 | } | ||
865 | |||
866 | /* | ||
867 | * Check events in order. | 995 | * Check events in order. |
868 | * | 996 | * |
869 | */ | 997 | */ |
@@ -886,7 +1014,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) | |||
886 | 1014 | ||
887 | mem_cgroup_threshold(memcg); | 1015 | mem_cgroup_threshold(memcg); |
888 | if (unlikely(do_softlimit)) | 1016 | if (unlikely(do_softlimit)) |
889 | mem_cgroup_update_soft_limit(memcg); | 1017 | mem_cgroup_update_tree(memcg, page); |
890 | #if MAX_NUMNODES > 1 | 1018 | #if MAX_NUMNODES > 1 |
891 | if (unlikely(do_numainfo)) | 1019 | if (unlikely(do_numainfo)) |
892 | atomic_inc(&memcg->numainfo_events); | 1020 | atomic_inc(&memcg->numainfo_events); |
@@ -929,15 +1057,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
929 | return memcg; | 1057 | return memcg; |
930 | } | 1058 | } |
931 | 1059 | ||
932 | static enum mem_cgroup_filter_t | ||
933 | mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, | ||
934 | mem_cgroup_iter_filter cond) | ||
935 | { | ||
936 | if (!cond) | ||
937 | return VISIT; | ||
938 | return cond(memcg, root); | ||
939 | } | ||
940 | |||
941 | /* | 1060 | /* |
942 | * Returns a next (in a pre-order walk) alive memcg (with elevated css | 1061 | * Returns a next (in a pre-order walk) alive memcg (with elevated css |
943 | * ref. count) or NULL if the whole root's subtree has been visited. | 1062 | * ref. count) or NULL if the whole root's subtree has been visited. |
@@ -945,7 +1064,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, | |||
945 | * helper function to be used by mem_cgroup_iter | 1064 | * helper function to be used by mem_cgroup_iter |
946 | */ | 1065 | */ |
947 | static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, | 1066 | static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, |
948 | struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) | 1067 | struct mem_cgroup *last_visited) |
949 | { | 1068 | { |
950 | struct cgroup_subsys_state *prev_css, *next_css; | 1069 | struct cgroup_subsys_state *prev_css, *next_css; |
951 | 1070 | ||
@@ -963,31 +1082,11 @@ skip_node: | |||
963 | if (next_css) { | 1082 | if (next_css) { |
964 | struct mem_cgroup *mem = mem_cgroup_from_css(next_css); | 1083 | struct mem_cgroup *mem = mem_cgroup_from_css(next_css); |
965 | 1084 | ||
966 | switch (mem_cgroup_filter(mem, root, cond)) { | 1085 | if (css_tryget(&mem->css)) |
967 | case SKIP: | 1086 | return mem; |
1087 | else { | ||
968 | prev_css = next_css; | 1088 | prev_css = next_css; |
969 | goto skip_node; | 1089 | goto skip_node; |
970 | case SKIP_TREE: | ||
971 | if (mem == root) | ||
972 | return NULL; | ||
973 | /* | ||
974 | * css_rightmost_descendant is not an optimal way to | ||
975 | * skip through a subtree (especially for imbalanced | ||
976 | * trees leaning to right) but that's what we have right | ||
977 | * now. More effective solution would be traversing | ||
978 | * right-up for first non-NULL without calling | ||
979 | * css_next_descendant_pre afterwards. | ||
980 | */ | ||
981 | prev_css = css_rightmost_descendant(next_css); | ||
982 | goto skip_node; | ||
983 | case VISIT: | ||
984 | if (css_tryget(&mem->css)) | ||
985 | return mem; | ||
986 | else { | ||
987 | prev_css = next_css; | ||
988 | goto skip_node; | ||
989 | } | ||
990 | break; | ||
991 | } | 1090 | } |
992 | } | 1091 | } |
993 | 1092 | ||
@@ -1051,7 +1150,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | |||
1051 | * @root: hierarchy root | 1150 | * @root: hierarchy root |
1052 | * @prev: previously returned memcg, NULL on first invocation | 1151 | * @prev: previously returned memcg, NULL on first invocation |
1053 | * @reclaim: cookie for shared reclaim walks, NULL for full walks | 1152 | * @reclaim: cookie for shared reclaim walks, NULL for full walks |
1054 | * @cond: filter for visited nodes, NULL for no filter | ||
1055 | * | 1153 | * |
1056 | * Returns references to children of the hierarchy below @root, or | 1154 | * Returns references to children of the hierarchy below @root, or |
1057 | * @root itself, or %NULL after a full round-trip. | 1155 | * @root itself, or %NULL after a full round-trip. |
@@ -1064,18 +1162,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | |||
1064 | * divide up the memcgs in the hierarchy among all concurrent | 1162 | * divide up the memcgs in the hierarchy among all concurrent |
1065 | * reclaimers operating on the same zone and priority. | 1163 | * reclaimers operating on the same zone and priority. |
1066 | */ | 1164 | */ |
1067 | struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, | 1165 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, |
1068 | struct mem_cgroup *prev, | 1166 | struct mem_cgroup *prev, |
1069 | struct mem_cgroup_reclaim_cookie *reclaim, | 1167 | struct mem_cgroup_reclaim_cookie *reclaim) |
1070 | mem_cgroup_iter_filter cond) | ||
1071 | { | 1168 | { |
1072 | struct mem_cgroup *memcg = NULL; | 1169 | struct mem_cgroup *memcg = NULL; |
1073 | struct mem_cgroup *last_visited = NULL; | 1170 | struct mem_cgroup *last_visited = NULL; |
1074 | 1171 | ||
1075 | if (mem_cgroup_disabled()) { | 1172 | if (mem_cgroup_disabled()) |
1076 | /* first call must return non-NULL, second return NULL */ | 1173 | return NULL; |
1077 | return (struct mem_cgroup *)(unsigned long)!prev; | ||
1078 | } | ||
1079 | 1174 | ||
1080 | if (!root) | 1175 | if (!root) |
1081 | root = root_mem_cgroup; | 1176 | root = root_mem_cgroup; |
@@ -1086,9 +1181,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, | |||
1086 | if (!root->use_hierarchy && root != root_mem_cgroup) { | 1181 | if (!root->use_hierarchy && root != root_mem_cgroup) { |
1087 | if (prev) | 1182 | if (prev) |
1088 | goto out_css_put; | 1183 | goto out_css_put; |
1089 | if (mem_cgroup_filter(root, root, cond) == VISIT) | 1184 | return root; |
1090 | return root; | ||
1091 | return NULL; | ||
1092 | } | 1185 | } |
1093 | 1186 | ||
1094 | rcu_read_lock(); | 1187 | rcu_read_lock(); |
@@ -1111,7 +1204,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, | |||
1111 | last_visited = mem_cgroup_iter_load(iter, root, &seq); | 1204 | last_visited = mem_cgroup_iter_load(iter, root, &seq); |
1112 | } | 1205 | } |
1113 | 1206 | ||
1114 | memcg = __mem_cgroup_iter_next(root, last_visited, cond); | 1207 | memcg = __mem_cgroup_iter_next(root, last_visited); |
1115 | 1208 | ||
1116 | if (reclaim) { | 1209 | if (reclaim) { |
1117 | mem_cgroup_iter_update(iter, last_visited, memcg, seq); | 1210 | mem_cgroup_iter_update(iter, last_visited, memcg, seq); |
@@ -1122,11 +1215,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, | |||
1122 | reclaim->generation = iter->generation; | 1215 | reclaim->generation = iter->generation; |
1123 | } | 1216 | } |
1124 | 1217 | ||
1125 | /* | 1218 | if (prev && !memcg) |
1126 | * We have finished the whole tree walk or no group has been | ||
1127 | * visited because filter told us to skip the root node. | ||
1128 | */ | ||
1129 | if (!memcg && (prev || (cond && !last_visited))) | ||
1130 | goto out_unlock; | 1219 | goto out_unlock; |
1131 | } | 1220 | } |
1132 | out_unlock: | 1221 | out_unlock: |
@@ -1767,7 +1856,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | |||
1767 | return total; | 1856 | return total; |
1768 | } | 1857 | } |
1769 | 1858 | ||
1770 | #if MAX_NUMNODES > 1 | ||
1771 | /** | 1859 | /** |
1772 | * test_mem_cgroup_node_reclaimable | 1860 | * test_mem_cgroup_node_reclaimable |
1773 | * @memcg: the target memcg | 1861 | * @memcg: the target memcg |
@@ -1790,6 +1878,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, | |||
1790 | return false; | 1878 | return false; |
1791 | 1879 | ||
1792 | } | 1880 | } |
1881 | #if MAX_NUMNODES > 1 | ||
1793 | 1882 | ||
1794 | /* | 1883 | /* |
1795 | * Always updating the nodemask is not very good - even if we have an empty | 1884 | * Always updating the nodemask is not very good - even if we have an empty |
@@ -1857,50 +1946,104 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | |||
1857 | return node; | 1946 | return node; |
1858 | } | 1947 | } |
1859 | 1948 | ||
1949 | /* | ||
1950 | * Check all nodes whether it contains reclaimable pages or not. | ||
1951 | * For quick scan, we make use of scan_nodes. This will allow us to skip | ||
1952 | * unused nodes. But scan_nodes is lazily updated and may not contain | ||
1953 | * enough new information. We need to do double check. | ||
1954 | */ | ||
1955 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | ||
1956 | { | ||
1957 | int nid; | ||
1958 | |||
1959 | /* | ||
1960 | * quick check...making use of scan_node. | ||
1961 | * We can skip unused nodes. | ||
1962 | */ | ||
1963 | if (!nodes_empty(memcg->scan_nodes)) { | ||
1964 | for (nid = first_node(memcg->scan_nodes); | ||
1965 | nid < MAX_NUMNODES; | ||
1966 | nid = next_node(nid, memcg->scan_nodes)) { | ||
1967 | |||
1968 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
1969 | return true; | ||
1970 | } | ||
1971 | } | ||
1972 | /* | ||
1973 | * Check rest of nodes. | ||
1974 | */ | ||
1975 | for_each_node_state(nid, N_MEMORY) { | ||
1976 | if (node_isset(nid, memcg->scan_nodes)) | ||
1977 | continue; | ||
1978 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
1979 | return true; | ||
1980 | } | ||
1981 | return false; | ||
1982 | } | ||
1983 | |||
1860 | #else | 1984 | #else |
1861 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | 1985 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) |
1862 | { | 1986 | { |
1863 | return 0; | 1987 | return 0; |
1864 | } | 1988 | } |
1865 | 1989 | ||
1866 | #endif | 1990 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) |
1867 | |||
1868 | /* | ||
1869 | * A group is eligible for the soft limit reclaim under the given root | ||
1870 | * hierarchy if | ||
1871 | * a) it is over its soft limit | ||
1872 | * b) any parent up the hierarchy is over its soft limit | ||
1873 | * | ||
1874 | * If the given group doesn't have any children over the limit then it | ||
1875 | * doesn't make any sense to iterate its subtree. | ||
1876 | */ | ||
1877 | enum mem_cgroup_filter_t | ||
1878 | mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, | ||
1879 | struct mem_cgroup *root) | ||
1880 | { | 1991 | { |
1881 | struct mem_cgroup *parent; | 1992 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); |
1882 | 1993 | } | |
1883 | if (!memcg) | 1994 | #endif |
1884 | memcg = root_mem_cgroup; | ||
1885 | parent = memcg; | ||
1886 | |||
1887 | if (res_counter_soft_limit_excess(&memcg->res)) | ||
1888 | return VISIT; | ||
1889 | 1995 | ||
1890 | /* | 1996 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, |
1891 | * If any parent up to the root in the hierarchy is over its soft limit | 1997 | struct zone *zone, |
1892 | * then we have to obey and reclaim from this group as well. | 1998 | gfp_t gfp_mask, |
1893 | */ | 1999 | unsigned long *total_scanned) |
1894 | while ((parent = parent_mem_cgroup(parent))) { | 2000 | { |
1895 | if (res_counter_soft_limit_excess(&parent->res)) | 2001 | struct mem_cgroup *victim = NULL; |
1896 | return VISIT; | 2002 | int total = 0; |
1897 | if (parent == root) | 2003 | int loop = 0; |
2004 | unsigned long excess; | ||
2005 | unsigned long nr_scanned; | ||
2006 | struct mem_cgroup_reclaim_cookie reclaim = { | ||
2007 | .zone = zone, | ||
2008 | .priority = 0, | ||
2009 | }; | ||
2010 | |||
2011 | excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; | ||
2012 | |||
2013 | while (1) { | ||
2014 | victim = mem_cgroup_iter(root_memcg, victim, &reclaim); | ||
2015 | if (!victim) { | ||
2016 | loop++; | ||
2017 | if (loop >= 2) { | ||
2018 | /* | ||
2019 | * If we have not been able to reclaim | ||
2020 | * anything, it might because there are | ||
2021 | * no reclaimable pages under this hierarchy | ||
2022 | */ | ||
2023 | if (!total) | ||
2024 | break; | ||
2025 | /* | ||
2026 | * We want to do more targeted reclaim. | ||
2027 | * excess >> 2 is not to excessive so as to | ||
2028 | * reclaim too much, nor too less that we keep | ||
2029 | * coming back to reclaim from this cgroup | ||
2030 | */ | ||
2031 | if (total >= (excess >> 2) || | ||
2032 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) | ||
2033 | break; | ||
2034 | } | ||
2035 | continue; | ||
2036 | } | ||
2037 | if (!mem_cgroup_reclaimable(victim, false)) | ||
2038 | continue; | ||
2039 | total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, | ||
2040 | zone, &nr_scanned); | ||
2041 | *total_scanned += nr_scanned; | ||
2042 | if (!res_counter_soft_limit_excess(&root_memcg->res)) | ||
1898 | break; | 2043 | break; |
1899 | } | 2044 | } |
1900 | 2045 | mem_cgroup_iter_break(root_memcg, victim); | |
1901 | if (!atomic_read(&memcg->children_in_excess)) | 2046 | return total; |
1902 | return SKIP_TREE; | ||
1903 | return SKIP; | ||
1904 | } | 2047 | } |
1905 | 2048 | ||
1906 | static DEFINE_SPINLOCK(memcg_oom_lock); | 2049 | static DEFINE_SPINLOCK(memcg_oom_lock); |
@@ -2018,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) | |||
2018 | memcg_wakeup_oom(memcg); | 2161 | memcg_wakeup_oom(memcg); |
2019 | } | 2162 | } |
2020 | 2163 | ||
2021 | /* | ||
2022 | * try to call OOM killer | ||
2023 | */ | ||
2024 | static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) | 2164 | static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) |
2025 | { | 2165 | { |
2026 | bool locked; | ||
2027 | int wakeups; | ||
2028 | |||
2029 | if (!current->memcg_oom.may_oom) | 2166 | if (!current->memcg_oom.may_oom) |
2030 | return; | 2167 | return; |
2031 | |||
2032 | current->memcg_oom.in_memcg_oom = 1; | ||
2033 | |||
2034 | /* | 2168 | /* |
2035 | * As with any blocking lock, a contender needs to start | 2169 | * We are in the middle of the charge context here, so we |
2036 | * listening for wakeups before attempting the trylock, | 2170 | * don't want to block when potentially sitting on a callstack |
2037 | * otherwise it can miss the wakeup from the unlock and sleep | 2171 | * that holds all kinds of filesystem and mm locks. |
2038 | * indefinitely. This is just open-coded because our locking | 2172 | * |
2039 | * is so particular to memcg hierarchies. | 2173 | * Also, the caller may handle a failed allocation gracefully |
2174 | * (like optional page cache readahead) and so an OOM killer | ||
2175 | * invocation might not even be necessary. | ||
2176 | * | ||
2177 | * That's why we don't do anything here except remember the | ||
2178 | * OOM context and then deal with it at the end of the page | ||
2179 | * fault when the stack is unwound, the locks are released, | ||
2180 | * and when we know whether the fault was overall successful. | ||
2040 | */ | 2181 | */ |
2041 | wakeups = atomic_read(&memcg->oom_wakeups); | 2182 | css_get(&memcg->css); |
2042 | mem_cgroup_mark_under_oom(memcg); | 2183 | current->memcg_oom.memcg = memcg; |
2043 | 2184 | current->memcg_oom.gfp_mask = mask; | |
2044 | locked = mem_cgroup_oom_trylock(memcg); | 2185 | current->memcg_oom.order = order; |
2045 | |||
2046 | if (locked) | ||
2047 | mem_cgroup_oom_notify(memcg); | ||
2048 | |||
2049 | if (locked && !memcg->oom_kill_disable) { | ||
2050 | mem_cgroup_unmark_under_oom(memcg); | ||
2051 | mem_cgroup_out_of_memory(memcg, mask, order); | ||
2052 | mem_cgroup_oom_unlock(memcg); | ||
2053 | /* | ||
2054 | * There is no guarantee that an OOM-lock contender | ||
2055 | * sees the wakeups triggered by the OOM kill | ||
2056 | * uncharges. Wake any sleepers explicitely. | ||
2057 | */ | ||
2058 | memcg_oom_recover(memcg); | ||
2059 | } else { | ||
2060 | /* | ||
2061 | * A system call can just return -ENOMEM, but if this | ||
2062 | * is a page fault and somebody else is handling the | ||
2063 | * OOM already, we need to sleep on the OOM waitqueue | ||
2064 | * for this memcg until the situation is resolved. | ||
2065 | * Which can take some time because it might be | ||
2066 | * handled by a userspace task. | ||
2067 | * | ||
2068 | * However, this is the charge context, which means | ||
2069 | * that we may sit on a large call stack and hold | ||
2070 | * various filesystem locks, the mmap_sem etc. and we | ||
2071 | * don't want the OOM handler to deadlock on them | ||
2072 | * while we sit here and wait. Store the current OOM | ||
2073 | * context in the task_struct, then return -ENOMEM. | ||
2074 | * At the end of the page fault handler, with the | ||
2075 | * stack unwound, pagefault_out_of_memory() will check | ||
2076 | * back with us by calling | ||
2077 | * mem_cgroup_oom_synchronize(), possibly putting the | ||
2078 | * task to sleep. | ||
2079 | */ | ||
2080 | current->memcg_oom.oom_locked = locked; | ||
2081 | current->memcg_oom.wakeups = wakeups; | ||
2082 | css_get(&memcg->css); | ||
2083 | current->memcg_oom.wait_on_memcg = memcg; | ||
2084 | } | ||
2085 | } | 2186 | } |
2086 | 2187 | ||
2087 | /** | 2188 | /** |
2088 | * mem_cgroup_oom_synchronize - complete memcg OOM handling | 2189 | * mem_cgroup_oom_synchronize - complete memcg OOM handling |
2190 | * @handle: actually kill/wait or just clean up the OOM state | ||
2089 | * | 2191 | * |
2090 | * This has to be called at the end of a page fault if the the memcg | 2192 | * This has to be called at the end of a page fault if the memcg OOM |
2091 | * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. | 2193 | * handler was enabled. |
2092 | * | 2194 | * |
2093 | * Memcg supports userspace OOM handling, so failed allocations must | 2195 | * Memcg supports userspace OOM handling where failed allocations must |
2094 | * sleep on a waitqueue until the userspace task resolves the | 2196 | * sleep on a waitqueue until the userspace task resolves the |
2095 | * situation. Sleeping directly in the charge context with all kinds | 2197 | * situation. Sleeping directly in the charge context with all kinds |
2096 | * of locks held is not a good idea, instead we remember an OOM state | 2198 | * of locks held is not a good idea, instead we remember an OOM state |
2097 | * in the task and mem_cgroup_oom_synchronize() has to be called at | 2199 | * in the task and mem_cgroup_oom_synchronize() has to be called at |
2098 | * the end of the page fault to put the task to sleep and clean up the | 2200 | * the end of the page fault to complete the OOM handling. |
2099 | * OOM state. | ||
2100 | * | 2201 | * |
2101 | * Returns %true if an ongoing memcg OOM situation was detected and | 2202 | * Returns %true if an ongoing memcg OOM situation was detected and |
2102 | * finalized, %false otherwise. | 2203 | * completed, %false otherwise. |
2103 | */ | 2204 | */ |
2104 | bool mem_cgroup_oom_synchronize(void) | 2205 | bool mem_cgroup_oom_synchronize(bool handle) |
2105 | { | 2206 | { |
2207 | struct mem_cgroup *memcg = current->memcg_oom.memcg; | ||
2106 | struct oom_wait_info owait; | 2208 | struct oom_wait_info owait; |
2107 | struct mem_cgroup *memcg; | 2209 | bool locked; |
2108 | 2210 | ||
2109 | /* OOM is global, do not handle */ | 2211 | /* OOM is global, do not handle */ |
2110 | if (!current->memcg_oom.in_memcg_oom) | ||
2111 | return false; | ||
2112 | |||
2113 | /* | ||
2114 | * We invoked the OOM killer but there is a chance that a kill | ||
2115 | * did not free up any charges. Everybody else might already | ||
2116 | * be sleeping, so restart the fault and keep the rampage | ||
2117 | * going until some charges are released. | ||
2118 | */ | ||
2119 | memcg = current->memcg_oom.wait_on_memcg; | ||
2120 | if (!memcg) | 2212 | if (!memcg) |
2121 | goto out; | 2213 | return false; |
2122 | 2214 | ||
2123 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | 2215 | if (!handle) |
2124 | goto out_memcg; | 2216 | goto cleanup; |
2125 | 2217 | ||
2126 | owait.memcg = memcg; | 2218 | owait.memcg = memcg; |
2127 | owait.wait.flags = 0; | 2219 | owait.wait.flags = 0; |
@@ -2130,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void) | |||
2130 | INIT_LIST_HEAD(&owait.wait.task_list); | 2222 | INIT_LIST_HEAD(&owait.wait.task_list); |
2131 | 2223 | ||
2132 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | 2224 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
2133 | /* Only sleep if we didn't miss any wakeups since OOM */ | 2225 | mem_cgroup_mark_under_oom(memcg); |
2134 | if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) | 2226 | |
2227 | locked = mem_cgroup_oom_trylock(memcg); | ||
2228 | |||
2229 | if (locked) | ||
2230 | mem_cgroup_oom_notify(memcg); | ||
2231 | |||
2232 | if (locked && !memcg->oom_kill_disable) { | ||
2233 | mem_cgroup_unmark_under_oom(memcg); | ||
2234 | finish_wait(&memcg_oom_waitq, &owait.wait); | ||
2235 | mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, | ||
2236 | current->memcg_oom.order); | ||
2237 | } else { | ||
2135 | schedule(); | 2238 | schedule(); |
2136 | finish_wait(&memcg_oom_waitq, &owait.wait); | 2239 | mem_cgroup_unmark_under_oom(memcg); |
2137 | out_memcg: | 2240 | finish_wait(&memcg_oom_waitq, &owait.wait); |
2138 | mem_cgroup_unmark_under_oom(memcg); | 2241 | } |
2139 | if (current->memcg_oom.oom_locked) { | 2242 | |
2243 | if (locked) { | ||
2140 | mem_cgroup_oom_unlock(memcg); | 2244 | mem_cgroup_oom_unlock(memcg); |
2141 | /* | 2245 | /* |
2142 | * There is no guarantee that an OOM-lock contender | 2246 | * There is no guarantee that an OOM-lock contender |
@@ -2145,10 +2249,9 @@ out_memcg: | |||
2145 | */ | 2249 | */ |
2146 | memcg_oom_recover(memcg); | 2250 | memcg_oom_recover(memcg); |
2147 | } | 2251 | } |
2252 | cleanup: | ||
2253 | current->memcg_oom.memcg = NULL; | ||
2148 | css_put(&memcg->css); | 2254 | css_put(&memcg->css); |
2149 | current->memcg_oom.wait_on_memcg = NULL; | ||
2150 | out: | ||
2151 | current->memcg_oom.in_memcg_oom = 0; | ||
2152 | return true; | 2255 | return true; |
2153 | } | 2256 | } |
2154 | 2257 | ||
@@ -2562,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2562 | || fatal_signal_pending(current))) | 2665 | || fatal_signal_pending(current))) |
2563 | goto bypass; | 2666 | goto bypass; |
2564 | 2667 | ||
2668 | if (unlikely(task_in_memcg_oom(current))) | ||
2669 | goto bypass; | ||
2670 | |||
2565 | /* | 2671 | /* |
2566 | * We always charge the cgroup the mm_struct belongs to. | 2672 | * We always charge the cgroup the mm_struct belongs to. |
2567 | * The mm_struct's mem_cgroup changes on task migration if the | 2673 | * The mm_struct's mem_cgroup changes on task migration if the |
@@ -2660,6 +2766,8 @@ done: | |||
2660 | return 0; | 2766 | return 0; |
2661 | nomem: | 2767 | nomem: |
2662 | *ptr = NULL; | 2768 | *ptr = NULL; |
2769 | if (gfp_mask & __GFP_NOFAIL) | ||
2770 | return 0; | ||
2663 | return -ENOMEM; | 2771 | return -ENOMEM; |
2664 | bypass: | 2772 | bypass: |
2665 | *ptr = root_mem_cgroup; | 2773 | *ptr = root_mem_cgroup; |
@@ -2812,7 +2920,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2812 | unlock_page_cgroup(pc); | 2920 | unlock_page_cgroup(pc); |
2813 | 2921 | ||
2814 | /* | 2922 | /* |
2815 | * "charge_statistics" updated event counter. | 2923 | * "charge_statistics" updated event counter. Then, check it. |
2924 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
2925 | * if they exceeds softlimit. | ||
2816 | */ | 2926 | */ |
2817 | memcg_check_events(memcg, page); | 2927 | memcg_check_events(memcg, page); |
2818 | } | 2928 | } |
@@ -4647,6 +4757,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
4647 | return ret; | 4757 | return ret; |
4648 | } | 4758 | } |
4649 | 4759 | ||
4760 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | ||
4761 | gfp_t gfp_mask, | ||
4762 | unsigned long *total_scanned) | ||
4763 | { | ||
4764 | unsigned long nr_reclaimed = 0; | ||
4765 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | ||
4766 | unsigned long reclaimed; | ||
4767 | int loop = 0; | ||
4768 | struct mem_cgroup_tree_per_zone *mctz; | ||
4769 | unsigned long long excess; | ||
4770 | unsigned long nr_scanned; | ||
4771 | |||
4772 | if (order > 0) | ||
4773 | return 0; | ||
4774 | |||
4775 | mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); | ||
4776 | /* | ||
4777 | * This loop can run a while, specially if mem_cgroup's continuously | ||
4778 | * keep exceeding their soft limit and putting the system under | ||
4779 | * pressure | ||
4780 | */ | ||
4781 | do { | ||
4782 | if (next_mz) | ||
4783 | mz = next_mz; | ||
4784 | else | ||
4785 | mz = mem_cgroup_largest_soft_limit_node(mctz); | ||
4786 | if (!mz) | ||
4787 | break; | ||
4788 | |||
4789 | nr_scanned = 0; | ||
4790 | reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, | ||
4791 | gfp_mask, &nr_scanned); | ||
4792 | nr_reclaimed += reclaimed; | ||
4793 | *total_scanned += nr_scanned; | ||
4794 | spin_lock(&mctz->lock); | ||
4795 | |||
4796 | /* | ||
4797 | * If we failed to reclaim anything from this memory cgroup | ||
4798 | * it is time to move on to the next cgroup | ||
4799 | */ | ||
4800 | next_mz = NULL; | ||
4801 | if (!reclaimed) { | ||
4802 | do { | ||
4803 | /* | ||
4804 | * Loop until we find yet another one. | ||
4805 | * | ||
4806 | * By the time we get the soft_limit lock | ||
4807 | * again, someone might have added the | ||
4808 | * group back on the RB tree. Iterate to | ||
4809 | * make sure we get a different mem. | ||
4810 | * mem_cgroup_largest_soft_limit_node returns | ||
4811 | * NULL if no other cgroup is present on | ||
4812 | * the tree | ||
4813 | */ | ||
4814 | next_mz = | ||
4815 | __mem_cgroup_largest_soft_limit_node(mctz); | ||
4816 | if (next_mz == mz) | ||
4817 | css_put(&next_mz->memcg->css); | ||
4818 | else /* next_mz == NULL or other memcg */ | ||
4819 | break; | ||
4820 | } while (1); | ||
4821 | } | ||
4822 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); | ||
4823 | excess = res_counter_soft_limit_excess(&mz->memcg->res); | ||
4824 | /* | ||
4825 | * One school of thought says that we should not add | ||
4826 | * back the node to the tree if reclaim returns 0. | ||
4827 | * But our reclaim could return 0, simply because due | ||
4828 | * to priority we are exposing a smaller subset of | ||
4829 | * memory to reclaim from. Consider this as a longer | ||
4830 | * term TODO. | ||
4831 | */ | ||
4832 | /* If excess == 0, no tree ops */ | ||
4833 | __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); | ||
4834 | spin_unlock(&mctz->lock); | ||
4835 | css_put(&mz->memcg->css); | ||
4836 | loop++; | ||
4837 | /* | ||
4838 | * Could not reclaim anything and there are no more | ||
4839 | * mem cgroups to try or we seem to be looping without | ||
4840 | * reclaiming anything. | ||
4841 | */ | ||
4842 | if (!nr_reclaimed && | ||
4843 | (next_mz == NULL || | ||
4844 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) | ||
4845 | break; | ||
4846 | } while (!nr_reclaimed); | ||
4847 | if (next_mz) | ||
4848 | css_put(&next_mz->memcg->css); | ||
4849 | return nr_reclaimed; | ||
4850 | } | ||
4851 | |||
4650 | /** | 4852 | /** |
4651 | * mem_cgroup_force_empty_list - clears LRU of a group | 4853 | * mem_cgroup_force_empty_list - clears LRU of a group |
4652 | * @memcg: group to clear | 4854 | * @memcg: group to clear |
@@ -5911,6 +6113,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
5911 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 6113 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
5912 | mz = &pn->zoneinfo[zone]; | 6114 | mz = &pn->zoneinfo[zone]; |
5913 | lruvec_init(&mz->lruvec); | 6115 | lruvec_init(&mz->lruvec); |
6116 | mz->usage_in_excess = 0; | ||
6117 | mz->on_tree = false; | ||
5914 | mz->memcg = memcg; | 6118 | mz->memcg = memcg; |
5915 | } | 6119 | } |
5916 | memcg->nodeinfo[node] = pn; | 6120 | memcg->nodeinfo[node] = pn; |
@@ -5966,6 +6170,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
5966 | int node; | 6170 | int node; |
5967 | size_t size = memcg_size(); | 6171 | size_t size = memcg_size(); |
5968 | 6172 | ||
6173 | mem_cgroup_remove_from_trees(memcg); | ||
5969 | free_css_id(&mem_cgroup_subsys, &memcg->css); | 6174 | free_css_id(&mem_cgroup_subsys, &memcg->css); |
5970 | 6175 | ||
5971 | for_each_node(node) | 6176 | for_each_node(node) |
@@ -6002,6 +6207,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
6002 | } | 6207 | } |
6003 | EXPORT_SYMBOL(parent_mem_cgroup); | 6208 | EXPORT_SYMBOL(parent_mem_cgroup); |
6004 | 6209 | ||
6210 | static void __init mem_cgroup_soft_limit_tree_init(void) | ||
6211 | { | ||
6212 | struct mem_cgroup_tree_per_node *rtpn; | ||
6213 | struct mem_cgroup_tree_per_zone *rtpz; | ||
6214 | int tmp, node, zone; | ||
6215 | |||
6216 | for_each_node(node) { | ||
6217 | tmp = node; | ||
6218 | if (!node_state(node, N_NORMAL_MEMORY)) | ||
6219 | tmp = -1; | ||
6220 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | ||
6221 | BUG_ON(!rtpn); | ||
6222 | |||
6223 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
6224 | |||
6225 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
6226 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
6227 | rtpz->rb_root = RB_ROOT; | ||
6228 | spin_lock_init(&rtpz->lock); | ||
6229 | } | ||
6230 | } | ||
6231 | } | ||
6232 | |||
6005 | static struct cgroup_subsys_state * __ref | 6233 | static struct cgroup_subsys_state * __ref |
6006 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | 6234 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) |
6007 | { | 6235 | { |
@@ -6031,7 +6259,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
6031 | mutex_init(&memcg->thresholds_lock); | 6259 | mutex_init(&memcg->thresholds_lock); |
6032 | spin_lock_init(&memcg->move_lock); | 6260 | spin_lock_init(&memcg->move_lock); |
6033 | vmpressure_init(&memcg->vmpressure); | 6261 | vmpressure_init(&memcg->vmpressure); |
6034 | spin_lock_init(&memcg->soft_lock); | ||
6035 | 6262 | ||
6036 | return &memcg->css; | 6263 | return &memcg->css; |
6037 | 6264 | ||
@@ -6109,13 +6336,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
6109 | 6336 | ||
6110 | mem_cgroup_invalidate_reclaim_iterators(memcg); | 6337 | mem_cgroup_invalidate_reclaim_iterators(memcg); |
6111 | mem_cgroup_reparent_charges(memcg); | 6338 | mem_cgroup_reparent_charges(memcg); |
6112 | if (memcg->soft_contributed) { | ||
6113 | while ((memcg = parent_mem_cgroup(memcg))) | ||
6114 | atomic_dec(&memcg->children_in_excess); | ||
6115 | |||
6116 | if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) | ||
6117 | atomic_dec(&root_mem_cgroup->children_in_excess); | ||
6118 | } | ||
6119 | mem_cgroup_destroy_all_caches(memcg); | 6339 | mem_cgroup_destroy_all_caches(memcg); |
6120 | vmpressure_cleanup(&memcg->vmpressure); | 6340 | vmpressure_cleanup(&memcg->vmpressure); |
6121 | } | 6341 | } |
@@ -6790,6 +7010,7 @@ static int __init mem_cgroup_init(void) | |||
6790 | { | 7010 | { |
6791 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | 7011 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); |
6792 | enable_swap_cgroup(); | 7012 | enable_swap_cgroup(); |
7013 | mem_cgroup_soft_limit_tree_init(); | ||
6793 | memcg_stock_init(); | 7014 | memcg_stock_init(); |
6794 | return 0; | 7015 | return 0; |
6795 | } | 7016 | } |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 947ed5413279..bf3351b5115e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1114,8 +1114,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1114 | * shake_page could have turned it free. | 1114 | * shake_page could have turned it free. |
1115 | */ | 1115 | */ |
1116 | if (is_free_buddy_page(p)) { | 1116 | if (is_free_buddy_page(p)) { |
1117 | action_result(pfn, "free buddy, 2nd try", | 1117 | if (flags & MF_COUNT_INCREASED) |
1118 | DELAYED); | 1118 | action_result(pfn, "free buddy", DELAYED); |
1119 | else | ||
1120 | action_result(pfn, "free buddy, 2nd try", DELAYED); | ||
1119 | return 0; | 1121 | return 0; |
1120 | } | 1122 | } |
1121 | action_result(pfn, "non LRU", IGNORED); | 1123 | action_result(pfn, "non LRU", IGNORED); |
@@ -1349,7 +1351,7 @@ int unpoison_memory(unsigned long pfn) | |||
1349 | * worked by memory_failure() and the page lock is not held yet. | 1351 | * worked by memory_failure() and the page lock is not held yet. |
1350 | * In such case, we yield to memory_failure() and make unpoison fail. | 1352 | * In such case, we yield to memory_failure() and make unpoison fail. |
1351 | */ | 1353 | */ |
1352 | if (PageTransHuge(page)) { | 1354 | if (!PageHuge(page) && PageTransHuge(page)) { |
1353 | pr_info("MCE: Memory failure is now running on %#lx\n", pfn); | 1355 | pr_info("MCE: Memory failure is now running on %#lx\n", pfn); |
1354 | return 0; | 1356 | return 0; |
1355 | } | 1357 | } |
diff --git a/mm/memory.c b/mm/memory.c index ca0003947115..1311f26497e6 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -837,6 +837,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
837 | */ | 837 | */ |
838 | make_migration_entry_read(&entry); | 838 | make_migration_entry_read(&entry); |
839 | pte = swp_entry_to_pte(entry); | 839 | pte = swp_entry_to_pte(entry); |
840 | if (pte_swp_soft_dirty(*src_pte)) | ||
841 | pte = pte_swp_mksoft_dirty(pte); | ||
840 | set_pte_at(src_mm, addr, src_pte, pte); | 842 | set_pte_at(src_mm, addr, src_pte, pte); |
841 | } | 843 | } |
842 | } | 844 | } |
@@ -3863,15 +3865,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3863 | * space. Kernel faults are handled more gracefully. | 3865 | * space. Kernel faults are handled more gracefully. |
3864 | */ | 3866 | */ |
3865 | if (flags & FAULT_FLAG_USER) | 3867 | if (flags & FAULT_FLAG_USER) |
3866 | mem_cgroup_enable_oom(); | 3868 | mem_cgroup_oom_enable(); |
3867 | 3869 | ||
3868 | ret = __handle_mm_fault(mm, vma, address, flags); | 3870 | ret = __handle_mm_fault(mm, vma, address, flags); |
3869 | 3871 | ||
3870 | if (flags & FAULT_FLAG_USER) | 3872 | if (flags & FAULT_FLAG_USER) { |
3871 | mem_cgroup_disable_oom(); | 3873 | mem_cgroup_oom_disable(); |
3872 | 3874 | /* | |
3873 | if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) | 3875 | * The task may have entered a memcg OOM situation but |
3874 | mem_cgroup_oom_synchronize(); | 3876 | * if the allocation error was handled gracefully (no |
3877 | * VM_FAULT_OOM), there is no need to kill anything. | ||
3878 | * Just clean up the OOM state peacefully. | ||
3879 | */ | ||
3880 | if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) | ||
3881 | mem_cgroup_oom_synchronize(false); | ||
3882 | } | ||
3875 | 3883 | ||
3876 | return ret; | 3884 | return ret; |
3877 | } | 3885 | } |
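Editor's note: the handle_mm_fault() hunk above brackets userspace faults with mem_cgroup_oom_enable()/mem_cgroup_oom_disable() and calls mem_cgroup_oom_synchronize(false) only when the fault recovered without VM_FAULT_OOM. A minimal sketch of that bracket shape follows; every name in it is an invented stand-in for the kernel helpers named in the hunk.

/* Sketch of the fault-path bracket: per-task OOM handling is enabled only
 * around the fault, and the pending OOM state is cleaned up peacefully when
 * the fault did not end in VM_FAULT_OOM. Not kernel code. */
#include <stdio.h>
#include <stdbool.h>

#define VM_FAULT_OOM_SK 0x0001

struct sk_task {
        bool oom_enabled;
        bool oom_pending;       /* set if a charge failed during the fault */
};

static void oom_enable(struct sk_task *t)  { t->oom_enabled = true; }
static void oom_disable(struct sk_task *t) { t->oom_enabled = false; }
static bool task_in_oom(struct sk_task *t) { return t->oom_pending; }

/* handle_kill=false: just clear the pending state, nobody needs to die. */
static void oom_synchronize(struct sk_task *t, bool handle_kill)
{
        (void)handle_kill;
        t->oom_pending = false;
}

/* Pretend fault: records an OOM situation but recovers gracefully. */
static int do_fault(struct sk_task *t)
{
        if (t->oom_enabled)
                t->oom_pending = true;
        return 0;               /* no VM_FAULT_OOM bit set */
}

int main(void)
{
        struct sk_task task = { false, false };
        int ret;

        oom_enable(&task);
        ret = do_fault(&task);
        oom_disable(&task);
        if (task_in_oom(&task) && !(ret & VM_FAULT_OOM_SK))
                oom_synchronize(&task, false);

        printf("ret=%d oom_pending=%d\n", ret, task.oom_pending);
        return 0;
}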
diff --git a/mm/migrate.c b/mm/migrate.c index 9c8d5f59d30b..7a7325ee1d08 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -107,7 +107,7 @@ void putback_movable_pages(struct list_head *l) | |||
107 | list_del(&page->lru); | 107 | list_del(&page->lru); |
108 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 108 | dec_zone_page_state(page, NR_ISOLATED_ANON + |
109 | page_is_file_cache(page)); | 109 | page_is_file_cache(page)); |
110 | if (unlikely(balloon_page_movable(page))) | 110 | if (unlikely(isolated_balloon_page(page))) |
111 | balloon_page_putback(page); | 111 | balloon_page_putback(page); |
112 | else | 112 | else |
113 | putback_lru_page(page); | 113 | putback_lru_page(page); |
@@ -161,6 +161,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
161 | 161 | ||
162 | get_page(new); | 162 | get_page(new); |
163 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 163 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
164 | if (pte_swp_soft_dirty(*ptep)) | ||
165 | pte = pte_mksoft_dirty(pte); | ||
164 | if (is_write_migration_entry(entry)) | 166 | if (is_write_migration_entry(entry)) |
165 | pte = pte_mkwrite(pte); | 167 | pte = pte_mkwrite(pte); |
166 | #ifdef CONFIG_HUGETLB_PAGE | 168 | #ifdef CONFIG_HUGETLB_PAGE |
diff --git a/mm/mlock.c b/mm/mlock.c index d63802663242..d480cd6fc475 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -379,10 +379,14 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, | |||
379 | 379 | ||
380 | /* | 380 | /* |
381 | * Initialize pte walk starting at the already pinned page where we | 381 | * Initialize pte walk starting at the already pinned page where we |
382 | * are sure that there is a pte. | 382 | * are sure that there is a pte, as it was pinned under the same |
383 | * mmap_sem write op. | ||
383 | */ | 384 | */ |
384 | pte = get_locked_pte(vma->vm_mm, start, &ptl); | 385 | pte = get_locked_pte(vma->vm_mm, start, &ptl); |
385 | end = min(end, pmd_addr_end(start, end)); | 386 | /* Make sure we do not cross the page table boundary */ |
387 | end = pgd_addr_end(start, end); | ||
388 | end = pud_addr_end(start, end); | ||
389 | end = pmd_addr_end(start, end); | ||
386 | 390 | ||
387 | /* The page next to the pinned page is the first we will try to get */ | 391 | /* The page next to the pinned page is the first we will try to get */ |
388 | start += PAGE_SIZE; | 392 | start += PAGE_SIZE; |
@@ -736,6 +740,7 @@ static int do_mlockall(int flags) | |||
736 | 740 | ||
737 | /* Ignore errors */ | 741 | /* Ignore errors */ |
738 | mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); | 742 | mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); |
743 | cond_resched(); | ||
739 | } | 744 | } |
740 | out: | 745 | out: |
741 | return 0; | 746 | return 0; |
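Editor's note: in the __munlock_pagevec_fill() hunk above, the single pmd_addr_end() clamp becomes a pgd/pud/pmd chain so the pte walk can never cross a page-table boundary of any level. Each *_addr_end() helper returns the next boundary of its level after start, or end if that comes first. Below is a hedged worked example with assumed x86_64-style sizes (2 MiB PMD, 1 GiB PUD, 512 GiB PGD) and a simplified boundary formula that ignores address wrap-around; it is not the kernel macros themselves.

/* Sketch of the *_addr_end clamping, with assumed boundary sizes. */
#include <stdio.h>

typedef unsigned long long u64;

#define PMD_SIZE_SK   (2ULL << 20)       /* 2 MiB, assumed   */
#define PUD_SIZE_SK   (1ULL << 30)       /* 1 GiB, assumed   */
#define PGDIR_SIZE_SK (512ULL << 30)     /* 512 GiB, assumed */

/* Next boundary of the given size after addr, clamped to end. */
static u64 addr_end(u64 addr, u64 end, u64 size)
{
        u64 boundary = (addr + size) & ~(size - 1);

        return boundary < end ? boundary : end;
}

int main(void)
{
        u64 start = 0x7f00001ff000ULL;          /* arbitrary example address */
        u64 end   = start + (16ULL << 20);      /* 16 MiB range */

        /* Clamp through every level, exactly as the hunk above does. */
        end = addr_end(start, end, PGDIR_SIZE_SK);
        end = addr_end(start, end, PUD_SIZE_SK);
        end = addr_end(start, end, PMD_SIZE_SK);

        printf("walk limited to [%#llx, %#llx), %llu KiB\n",
               start, end, (end - start) >> 10);
        return 0;
}

With the example address sitting just below a 2 MiB boundary, the walk is clamped to a single 4 KiB step, which is exactly the protection the extra clamps provide.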
diff --git a/mm/mprotect.c b/mm/mprotect.c index 94722a4d6b43..a3af058f68e4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -94,13 +94,16 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
94 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 94 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
95 | 95 | ||
96 | if (is_write_migration_entry(entry)) { | 96 | if (is_write_migration_entry(entry)) { |
97 | pte_t newpte; | ||
97 | /* | 98 | /* |
98 | * A protection check is difficult so | 99 | * A protection check is difficult so |
99 | * just be safe and disable write | 100 | * just be safe and disable write |
100 | */ | 101 | */ |
101 | make_migration_entry_read(&entry); | 102 | make_migration_entry_read(&entry); |
102 | set_pte_at(mm, addr, pte, | 103 | newpte = swp_entry_to_pte(entry); |
103 | swp_entry_to_pte(entry)); | 104 | if (pte_swp_soft_dirty(oldpte)) |
105 | newpte = pte_swp_mksoft_dirty(newpte); | ||
106 | set_pte_at(mm, addr, pte, newpte); | ||
104 | } | 107 | } |
105 | pages++; | 108 | pages++; |
106 | } | 109 | } |
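Editor's note: the memory.c, migrate.c and mprotect.c hunks above all apply the same rule: when a pte is rebuilt from a swap/migration entry, the soft-dirty bit of the old entry must be carried over to the new one, otherwise soft-dirty tracking silently loses a write. A minimal sketch of that carry-the-bit pattern follows; the bit layout and names are invented for the sketch, only the preservation logic mirrors the hunks.

/* Sketch: rebuild an entry, then copy the tracking bit from the old one. */
#include <stdio.h>

#define SK_PRESENT    (1u << 0)
#define SK_WRITE      (1u << 1)
#define SK_SOFT_DIRTY (1u << 2)   /* stand-in for the soft-dirty bit */

/* Build a brand-new entry (e.g. from a migration entry) ... */
static unsigned int make_new_entry(unsigned int pfn_bits)
{
        return pfn_bits | SK_PRESENT;
}

/* ... and only then propagate the tracking bit from the old entry. */
static unsigned int rewrite_entry(unsigned int old, unsigned int pfn_bits)
{
        unsigned int new = make_new_entry(pfn_bits);

        if (old & SK_SOFT_DIRTY)
                new |= SK_SOFT_DIRTY;
        return new;
}

int main(void)
{
        unsigned int old = SK_PRESENT | SK_WRITE | SK_SOFT_DIRTY;
        unsigned int new = rewrite_entry(old, 0x1000);

        printf("old=%#x new=%#x soft-dirty kept: %s\n", old, new,
               (new & SK_SOFT_DIRTY) ? "yes" : "no");
        return 0;
}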
diff --git a/mm/mremap.c b/mm/mremap.c index 91b13d6a16d4..0843feb66f3d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -25,7 +25,6 @@ | |||
25 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
26 | #include <asm/cacheflush.h> | 26 | #include <asm/cacheflush.h> |
27 | #include <asm/tlbflush.h> | 27 | #include <asm/tlbflush.h> |
28 | #include <asm/pgalloc.h> | ||
29 | 28 | ||
30 | #include "internal.h" | 29 | #include "internal.h" |
31 | 30 | ||
@@ -63,10 +62,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | |||
63 | return NULL; | 62 | return NULL; |
64 | 63 | ||
65 | pmd = pmd_alloc(mm, pud, addr); | 64 | pmd = pmd_alloc(mm, pud, addr); |
66 | if (!pmd) { | 65 | if (!pmd) |
67 | pud_free(mm, pud); | ||
68 | return NULL; | 66 | return NULL; |
69 | } | ||
70 | 67 | ||
71 | VM_BUG_ON(pmd_trans_huge(*pmd)); | 68 | VM_BUG_ON(pmd_trans_huge(*pmd)); |
72 | 69 | ||
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 314e9d274381..6738c47f1f72 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -680,7 +680,7 @@ void pagefault_out_of_memory(void) | |||
680 | { | 680 | { |
681 | struct zonelist *zonelist; | 681 | struct zonelist *zonelist; |
682 | 682 | ||
683 | if (mem_cgroup_oom_synchronize()) | 683 | if (mem_cgroup_oom_synchronize(true)) |
684 | return; | 684 | return; |
685 | 685 | ||
686 | zonelist = node_zonelist(first_online_node, GFP_KERNEL); | 686 | zonelist = node_zonelist(first_online_node, GFP_KERNEL); |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f5236f804aa6..63807583d8e8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1210,11 +1210,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty, | |||
1210 | return 1; | 1210 | return 1; |
1211 | } | 1211 | } |
1212 | 1212 | ||
1213 | static long bdi_max_pause(struct backing_dev_info *bdi, | 1213 | static unsigned long bdi_max_pause(struct backing_dev_info *bdi, |
1214 | unsigned long bdi_dirty) | 1214 | unsigned long bdi_dirty) |
1215 | { | 1215 | { |
1216 | long bw = bdi->avg_write_bandwidth; | 1216 | unsigned long bw = bdi->avg_write_bandwidth; |
1217 | long t; | 1217 | unsigned long t; |
1218 | 1218 | ||
1219 | /* | 1219 | /* |
1220 | * Limit pause time for small memory systems. If sleeping for too long | 1220 | * Limit pause time for small memory systems. If sleeping for too long |
@@ -1226,7 +1226,7 @@ static long bdi_max_pause(struct backing_dev_info *bdi, | |||
1226 | t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); | 1226 | t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); |
1227 | t++; | 1227 | t++; |
1228 | 1228 | ||
1229 | return min_t(long, t, MAX_PAUSE); | 1229 | return min_t(unsigned long, t, MAX_PAUSE); |
1230 | } | 1230 | } |
1231 | 1231 | ||
1232 | static long bdi_min_pause(struct backing_dev_info *bdi, | 1232 | static long bdi_min_pause(struct backing_dev_info *bdi, |
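Editor's note: the bdi_max_pause() hunk above switches the calculation to unsigned arithmetic so a very large bdi_dirty cannot wrap into a negative value before the min_t() clamp. A worked example of the same expression follows, assuming HZ=100 and a HZ/5-jiffy cap; both constants and all names are assumptions for the sketch, not kernel definitions.

/* Worked example of the bdi_max_pause() expression above. Not kernel code. */
#include <stdio.h>

#define HZ_SK        100UL
#define MAX_PAUSE_SK (HZ_SK / 5)          /* assumed 20-jiffy cap */

static unsigned long roundup_pow_of_two_sk(unsigned long n)
{
        unsigned long p = 1;

        while (p < n)
                p <<= 1;
        return p;
}

static unsigned long max_pause_sk(unsigned long bw, unsigned long bdi_dirty)
{
        unsigned long t;

        /* Same shape as the hunk: keep everything unsigned so a huge
         * bdi_dirty cannot end up as a negative pause. */
        t = bdi_dirty / (1 + bw / roundup_pow_of_two_sk(1 + HZ_SK / 8));
        t++;

        return t < MAX_PAUSE_SK ? t : MAX_PAUSE_SK;
}

int main(void)
{
        /* Small dirty set against a fast device: short pause. */
        printf("pause = %lu jiffies\n", max_pause_sk(25600, 128));
        /* Huge dirty backlog: clamped to the cap instead of going negative. */
        printf("pause = %lu jiffies\n", max_pause_sk(4, 1UL << 30));
        return 0;
}

The first call pauses a single jiffy; the second would be enormous, but the unsigned clamp pins it to the cap.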
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0ee638f76ebe..dd886fac451a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -6366,10 +6366,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
6366 | list_del(&page->lru); | 6366 | list_del(&page->lru); |
6367 | rmv_page_order(page); | 6367 | rmv_page_order(page); |
6368 | zone->free_area[order].nr_free--; | 6368 | zone->free_area[order].nr_free--; |
6369 | #ifdef CONFIG_HIGHMEM | ||
6370 | if (PageHighMem(page)) | ||
6371 | totalhigh_pages -= 1 << order; | ||
6372 | #endif | ||
6373 | for (i = 0; i < (1 << order); i++) | 6369 | for (i = 0; i < (1 << order); i++) |
6374 | SetPageReserved((page+i)); | 6370 | SetPageReserved((page+i)); |
6375 | pfn += (1 << order); | 6371 | pfn += (1 << order); |
diff --git a/mm/slab_common.c b/mm/slab_common.c index a3443278ce3a..e2e98af703ea 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -56,6 +56,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, | |||
56 | continue; | 56 | continue; |
57 | } | 57 | } |
58 | 58 | ||
59 | #if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) | ||
59 | /* | 60 | /* |
60 | * For simplicity, we won't check this in the list of memcg | 61 | * For simplicity, we won't check this in the list of memcg |
61 | * caches. We have control over memcg naming, and if there | 62 | * caches. We have control over memcg naming, and if there |
@@ -69,6 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, | |||
69 | s = NULL; | 70 | s = NULL; |
70 | return -EINVAL; | 71 | return -EINVAL; |
71 | } | 72 | } |
73 | #endif | ||
72 | } | 74 | } |
73 | 75 | ||
74 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ | 76 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 3963fc24fcc1..de7c904e52e5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -1824,6 +1824,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1824 | struct filename *pathname; | 1824 | struct filename *pathname; |
1825 | int i, type, prev; | 1825 | int i, type, prev; |
1826 | int err; | 1826 | int err; |
1827 | unsigned int old_block_size; | ||
1827 | 1828 | ||
1828 | if (!capable(CAP_SYS_ADMIN)) | 1829 | if (!capable(CAP_SYS_ADMIN)) |
1829 | return -EPERM; | 1830 | return -EPERM; |
@@ -1914,6 +1915,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1914 | } | 1915 | } |
1915 | 1916 | ||
1916 | swap_file = p->swap_file; | 1917 | swap_file = p->swap_file; |
1918 | old_block_size = p->old_block_size; | ||
1917 | p->swap_file = NULL; | 1919 | p->swap_file = NULL; |
1918 | p->max = 0; | 1920 | p->max = 0; |
1919 | swap_map = p->swap_map; | 1921 | swap_map = p->swap_map; |
@@ -1938,7 +1940,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1938 | inode = mapping->host; | 1940 | inode = mapping->host; |
1939 | if (S_ISBLK(inode->i_mode)) { | 1941 | if (S_ISBLK(inode->i_mode)) { |
1940 | struct block_device *bdev = I_BDEV(inode); | 1942 | struct block_device *bdev = I_BDEV(inode); |
1941 | set_blocksize(bdev, p->old_block_size); | 1943 | set_blocksize(bdev, old_block_size); |
1942 | blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); | 1944 | blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
1943 | } else { | 1945 | } else { |
1944 | mutex_lock(&inode->i_mutex); | 1946 | mutex_lock(&inode->i_mutex); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ed1b775bdc9..eea668d9cff6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -48,6 +48,7 @@ | |||
48 | #include <asm/div64.h> | 48 | #include <asm/div64.h> |
49 | 49 | ||
50 | #include <linux/swapops.h> | 50 | #include <linux/swapops.h> |
51 | #include <linux/balloon_compaction.h> | ||
51 | 52 | ||
52 | #include "internal.h" | 53 | #include "internal.h" |
53 | 54 | ||
@@ -139,23 +140,11 @@ static bool global_reclaim(struct scan_control *sc) | |||
139 | { | 140 | { |
140 | return !sc->target_mem_cgroup; | 141 | return !sc->target_mem_cgroup; |
141 | } | 142 | } |
142 | |||
143 | static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) | ||
144 | { | ||
145 | struct mem_cgroup *root = sc->target_mem_cgroup; | ||
146 | return !mem_cgroup_disabled() && | ||
147 | mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE; | ||
148 | } | ||
149 | #else | 143 | #else |
150 | static bool global_reclaim(struct scan_control *sc) | 144 | static bool global_reclaim(struct scan_control *sc) |
151 | { | 145 | { |
152 | return true; | 146 | return true; |
153 | } | 147 | } |
154 | |||
155 | static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) | ||
156 | { | ||
157 | return false; | ||
158 | } | ||
159 | #endif | 148 | #endif |
160 | 149 | ||
161 | unsigned long zone_reclaimable_pages(struct zone *zone) | 150 | unsigned long zone_reclaimable_pages(struct zone *zone) |
@@ -222,6 +211,7 @@ void unregister_shrinker(struct shrinker *shrinker) | |||
222 | down_write(&shrinker_rwsem); | 211 | down_write(&shrinker_rwsem); |
223 | list_del(&shrinker->list); | 212 | list_del(&shrinker->list); |
224 | up_write(&shrinker_rwsem); | 213 | up_write(&shrinker_rwsem); |
214 | kfree(shrinker->nr_deferred); | ||
225 | } | 215 | } |
226 | EXPORT_SYMBOL(unregister_shrinker); | 216 | EXPORT_SYMBOL(unregister_shrinker); |
227 | 217 | ||
@@ -1125,7 +1115,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
1125 | LIST_HEAD(clean_pages); | 1115 | LIST_HEAD(clean_pages); |
1126 | 1116 | ||
1127 | list_for_each_entry_safe(page, next, page_list, lru) { | 1117 | list_for_each_entry_safe(page, next, page_list, lru) { |
1128 | if (page_is_file_cache(page) && !PageDirty(page)) { | 1118 | if (page_is_file_cache(page) && !PageDirty(page) && |
1119 | !isolated_balloon_page(page)) { | ||
1129 | ClearPageActive(page); | 1120 | ClearPageActive(page); |
1130 | list_move(&page->lru, &clean_pages); | 1121 | list_move(&page->lru, &clean_pages); |
1131 | } | 1122 | } |
@@ -2176,11 +2167,9 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
2176 | } | 2167 | } |
2177 | } | 2168 | } |
2178 | 2169 | ||
2179 | static int | 2170 | static void shrink_zone(struct zone *zone, struct scan_control *sc) |
2180 | __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) | ||
2181 | { | 2171 | { |
2182 | unsigned long nr_reclaimed, nr_scanned; | 2172 | unsigned long nr_reclaimed, nr_scanned; |
2183 | int groups_scanned = 0; | ||
2184 | 2173 | ||
2185 | do { | 2174 | do { |
2186 | struct mem_cgroup *root = sc->target_mem_cgroup; | 2175 | struct mem_cgroup *root = sc->target_mem_cgroup; |
@@ -2188,17 +2177,15 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) | |||
2188 | .zone = zone, | 2177 | .zone = zone, |
2189 | .priority = sc->priority, | 2178 | .priority = sc->priority, |
2190 | }; | 2179 | }; |
2191 | struct mem_cgroup *memcg = NULL; | 2180 | struct mem_cgroup *memcg; |
2192 | mem_cgroup_iter_filter filter = (soft_reclaim) ? | ||
2193 | mem_cgroup_soft_reclaim_eligible : NULL; | ||
2194 | 2181 | ||
2195 | nr_reclaimed = sc->nr_reclaimed; | 2182 | nr_reclaimed = sc->nr_reclaimed; |
2196 | nr_scanned = sc->nr_scanned; | 2183 | nr_scanned = sc->nr_scanned; |
2197 | 2184 | ||
2198 | while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { | 2185 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
2186 | do { | ||
2199 | struct lruvec *lruvec; | 2187 | struct lruvec *lruvec; |
2200 | 2188 | ||
2201 | groups_scanned++; | ||
2202 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2189 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2203 | 2190 | ||
2204 | shrink_lruvec(lruvec, sc); | 2191 | shrink_lruvec(lruvec, sc); |
@@ -2218,7 +2205,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) | |||
2218 | mem_cgroup_iter_break(root, memcg); | 2205 | mem_cgroup_iter_break(root, memcg); |
2219 | break; | 2206 | break; |
2220 | } | 2207 | } |
2221 | } | 2208 | memcg = mem_cgroup_iter(root, memcg, &reclaim); |
2209 | } while (memcg); | ||
2222 | 2210 | ||
2223 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, | 2211 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, |
2224 | sc->nr_scanned - nr_scanned, | 2212 | sc->nr_scanned - nr_scanned, |
@@ -2226,37 +2214,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) | |||
2226 | 2214 | ||
2227 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, | 2215 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, |
2228 | sc->nr_scanned - nr_scanned, sc)); | 2216 | sc->nr_scanned - nr_scanned, sc)); |
2229 | |||
2230 | return groups_scanned; | ||
2231 | } | ||
2232 | |||
2233 | |||
2234 | static void shrink_zone(struct zone *zone, struct scan_control *sc) | ||
2235 | { | ||
2236 | bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); | ||
2237 | unsigned long nr_scanned = sc->nr_scanned; | ||
2238 | int scanned_groups; | ||
2239 | |||
2240 | scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim); | ||
2241 | /* | ||
2242 | * memcg iterator might race with other reclaimer or start from | ||
2243 | * a incomplete tree walk so the tree walk in __shrink_zone | ||
2244 | * might have missed groups that are above the soft limit. Try | ||
2245 | * another loop to catch up with others. Do it just once to | ||
2246 | * prevent from reclaim latencies when other reclaimers always | ||
2247 | * preempt this one. | ||
2248 | */ | ||
2249 | if (do_soft_reclaim && !scanned_groups) | ||
2250 | __shrink_zone(zone, sc, do_soft_reclaim); | ||
2251 | |||
2252 | /* | ||
2253 | * No group is over the soft limit or those that are do not have | ||
2254 | * pages in the zone we are reclaiming so we have to reclaim everybody | ||
2255 | */ | ||
2256 | if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) { | ||
2257 | __shrink_zone(zone, sc, false); | ||
2258 | return; | ||
2259 | } | ||
2260 | } | 2217 | } |
2261 | 2218 | ||
2262 | /* Returns true if compaction should go ahead for a high-order request */ | 2219 | /* Returns true if compaction should go ahead for a high-order request */ |
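Editor's note: the rewritten shrink_zone() above drops the filtered iterator and walks the memcg hierarchy with explicit mem_cgroup_iter(root, prev, ...) calls in a do/while, so the starting group (prev == NULL) is always visited before the loop condition is tested. A sketch of that iteration shape over a flat stand-in hierarchy; the array and function names are invented, not the memcg iterator API.

/* Sketch of the do/while iterator pattern used by shrink_zone() above. */
#include <stdio.h>
#include <stddef.h>

static const char *groups[] = { "root", "A", "A/a1", "B" };
#define NGROUPS (sizeof(groups) / sizeof(groups[0]))

/* Return the group after prev, or the first one when prev is NULL. */
static const char *group_iter(const char *prev)
{
        size_t i;

        if (!prev)
                return groups[0];
        for (i = 0; i + 1 < NGROUPS; i++)
                if (groups[i] == prev)
                        return groups[i + 1];
        return NULL;
}

int main(void)
{
        const char *memcg = group_iter(NULL);

        do {
                printf("shrink lruvec of %s\n", memcg);   /* shrink_lruvec() */
                memcg = group_iter(memcg);
        } while (memcg);

        return 0;
}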
@@ -2320,6 +2277,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2320 | { | 2277 | { |
2321 | struct zoneref *z; | 2278 | struct zoneref *z; |
2322 | struct zone *zone; | 2279 | struct zone *zone; |
2280 | unsigned long nr_soft_reclaimed; | ||
2281 | unsigned long nr_soft_scanned; | ||
2323 | bool aborted_reclaim = false; | 2282 | bool aborted_reclaim = false; |
2324 | 2283 | ||
2325 | /* | 2284 | /* |
@@ -2359,6 +2318,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2359 | continue; | 2318 | continue; |
2360 | } | 2319 | } |
2361 | } | 2320 | } |
2321 | /* | ||
2322 | * This steals pages from memory cgroups over softlimit | ||
2323 | * and returns the number of reclaimed pages and | ||
2324 | * scanned pages. This works for global memory pressure | ||
2325 | * and balancing, not for a memcg's limit. | ||
2326 | */ | ||
2327 | nr_soft_scanned = 0; | ||
2328 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
2329 | sc->order, sc->gfp_mask, | ||
2330 | &nr_soft_scanned); | ||
2331 | sc->nr_reclaimed += nr_soft_reclaimed; | ||
2332 | sc->nr_scanned += nr_soft_scanned; | ||
2362 | /* need some check for avoid more shrink_zone() */ | 2333 | /* need some check for avoid more shrink_zone() */ |
2363 | } | 2334 | } |
2364 | 2335 | ||
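Editor's note: the shrink_zones() hunk above runs soft-limit reclaim first and folds its reclaimed and scanned counts into the same scan_control totals that the regular pass updates (balance_pgdat, below, adds only the reclaimed count). A sketch of that accounting, with the reclaim functions reduced to trivial stand-ins returning made-up numbers.

/* Sketch of the soft-limit accounting added above. Not kernel code. */
#include <stdio.h>

struct sk_scan_control {
        unsigned long nr_reclaimed;
        unsigned long nr_scanned;
};

/* Stand-in for the soft-limit pass: reports pages scanned via *scanned. */
static unsigned long soft_limit_reclaim_sk(unsigned long *scanned)
{
        *scanned = 32;          /* pretend we scanned 32 pages ... */
        return 8;               /* ... and reclaimed 8 of them     */
}

/* Stand-in for the regular zone shrink. */
static void shrink_zone_sk(struct sk_scan_control *sc)
{
        sc->nr_scanned += 128;
        sc->nr_reclaimed += 16;
}

int main(void)
{
        struct sk_scan_control sc = { 0, 0 };
        unsigned long soft_scanned = 0;

        sc.nr_reclaimed += soft_limit_reclaim_sk(&soft_scanned);
        sc.nr_scanned += soft_scanned;
        shrink_zone_sk(&sc);

        printf("reclaimed=%lu scanned=%lu\n", sc.nr_reclaimed, sc.nr_scanned);
        return 0;
}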
@@ -2952,6 +2923,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2952 | { | 2923 | { |
2953 | int i; | 2924 | int i; |
2954 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2925 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
2926 | unsigned long nr_soft_reclaimed; | ||
2927 | unsigned long nr_soft_scanned; | ||
2955 | struct scan_control sc = { | 2928 | struct scan_control sc = { |
2956 | .gfp_mask = GFP_KERNEL, | 2929 | .gfp_mask = GFP_KERNEL, |
2957 | .priority = DEF_PRIORITY, | 2930 | .priority = DEF_PRIORITY, |
@@ -3066,6 +3039,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3066 | 3039 | ||
3067 | sc.nr_scanned = 0; | 3040 | sc.nr_scanned = 0; |
3068 | 3041 | ||
3042 | nr_soft_scanned = 0; | ||
3043 | /* | ||
3044 | * Call soft limit reclaim before calling shrink_zone. | ||
3045 | */ | ||
3046 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
3047 | order, sc.gfp_mask, | ||
3048 | &nr_soft_scanned); | ||
3049 | sc.nr_reclaimed += nr_soft_reclaimed; | ||
3050 | |||
3069 | /* | 3051 | /* |
3070 | * There should be no need to raise the scanning | 3052 | * There should be no need to raise the scanning |
3071 | * priority if enough pages are already being scanned | 3053 | * priority if enough pages are already being scanned |
diff --git a/mm/zswap.c b/mm/zswap.c index 841e35f1db22..d93510c6aa2d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
@@ -804,6 +804,10 @@ static void zswap_frontswap_invalidate_area(unsigned type) | |||
804 | } | 804 | } |
805 | tree->rbroot = RB_ROOT; | 805 | tree->rbroot = RB_ROOT; |
806 | spin_unlock(&tree->lock); | 806 | spin_unlock(&tree->lock); |
807 | |||
808 | zbud_destroy_pool(tree->pool); | ||
809 | kfree(tree); | ||
810 | zswap_trees[type] = NULL; | ||
807 | } | 811 | } |
808 | 812 | ||
809 | static struct zbud_ops zswap_zbud_ops = { | 813 | static struct zbud_ops zswap_zbud_ops = { |