author     Linus Torvalds <torvalds@linux-foundation.org>  2013-09-12 18:44:27 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-09-12 18:44:27 -0400
commit     ac4de9543aca59f2b763746647577302fbedd57e (patch)
tree       40407750569ee030de56233c41c9a97f7e89cf67 /mm
parent     26935fb06ee88f1188789807687c03041f3c70d9 (diff)
parent     de32a8177f64bc62e1b19c685dd391af664ab13f (diff)
Merge branch 'akpm' (patches from Andrew Morton)
Merge more patches from Andrew Morton:
"The rest of MM. Plus one misc cleanup"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (35 commits)
mm/Kconfig: add MMU dependency for MIGRATION.
kernel: replace strict_strto*() with kstrto*()
mm, thp: count thp_fault_fallback anytime thp fault fails
thp: consolidate code between handle_mm_fault() and do_huge_pmd_anonymous_page()
thp: do_huge_pmd_anonymous_page() cleanup
thp: move maybe_pmd_mkwrite() out of mk_huge_pmd()
mm: cleanup add_to_page_cache_locked()
thp: account anon transparent huge pages into NR_ANON_PAGES
truncate: drop 'oldsize' truncate_pagecache() parameter
mm: make lru_add_drain_all() selective
memcg: document cgroup dirty/writeback memory statistics
memcg: add per cgroup writeback pages accounting
memcg: check for proper lock held in mem_cgroup_update_page_stat
memcg: remove MEMCG_NR_FILE_MAPPED
memcg: reduce function dereference
memcg: avoid overflow caused by PAGE_ALIGN
memcg: rename RESOURCE_MAX to RES_COUNTER_MAX
memcg: correct RESOURCE_MAX to ULLONG_MAX
mm: memcg: do not trap chargers with full callstack on OOM
mm: memcg: rework and document OOM waiting and wakeup
...
Diffstat (limited to 'mm')

-rw-r--r--  mm/Kconfig           |   4
-rw-r--r--  mm/filemap.c         |  59
-rw-r--r--  mm/huge_memory.c     | 129
-rw-r--r--  mm/memcontrol.c      | 871
-rw-r--r--  mm/memory.c          |  52
-rw-r--r--  mm/oom_kill.c        |   7
-rw-r--r--  mm/page-writeback.c  |  15
-rw-r--r--  mm/rmap.c            |  22
-rw-r--r--  mm/swap.c            |  44
-rw-r--r--  mm/truncate.c        |   9
-rw-r--r--  mm/vmscan.c          |  83

11 files changed, 593 insertions, 702 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 6cdd27043303..026771a9b097 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -245,7 +245,7 @@ config COMPACTION | |||
245 | config MIGRATION | 245 | config MIGRATION |
246 | bool "Page migration" | 246 | bool "Page migration" |
247 | def_bool y | 247 | def_bool y |
248 | depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA | 248 | depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU |
249 | help | 249 | help |
250 | Allows the migration of the physical location of pages of processes | 250 | Allows the migration of the physical location of pages of processes |
251 | while the virtual addresses are not changed. This is useful in | 251 | while the virtual addresses are not changed. This is useful in |
@@ -480,7 +480,7 @@ config FRONTSWAP | |||
480 | 480 | ||
481 | config CMA | 481 | config CMA |
482 | bool "Contiguous Memory Allocator" | 482 | bool "Contiguous Memory Allocator" |
483 | depends on HAVE_MEMBLOCK | 483 | depends on HAVE_MEMBLOCK && MMU |
484 | select MIGRATION | 484 | select MIGRATION |
485 | select MEMORY_ISOLATION | 485 | select MEMORY_ISOLATION |
486 | help | 486 | help |
diff --git a/mm/filemap.c b/mm/filemap.c
index e607728db4a8..1e6aec4a2d2e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -467,32 +467,34 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
467 | error = mem_cgroup_cache_charge(page, current->mm, | 467 | error = mem_cgroup_cache_charge(page, current->mm, |
468 | gfp_mask & GFP_RECLAIM_MASK); | 468 | gfp_mask & GFP_RECLAIM_MASK); |
469 | if (error) | 469 | if (error) |
470 | goto out; | 470 | return error; |
471 | 471 | ||
472 | error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); | 472 | error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); |
473 | if (error == 0) { | 473 | if (error) { |
474 | page_cache_get(page); | ||
475 | page->mapping = mapping; | ||
476 | page->index = offset; | ||
477 | |||
478 | spin_lock_irq(&mapping->tree_lock); | ||
479 | error = radix_tree_insert(&mapping->page_tree, offset, page); | ||
480 | if (likely(!error)) { | ||
481 | mapping->nrpages++; | ||
482 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
483 | spin_unlock_irq(&mapping->tree_lock); | ||
484 | trace_mm_filemap_add_to_page_cache(page); | ||
485 | } else { | ||
486 | page->mapping = NULL; | ||
487 | /* Leave page->index set: truncation relies upon it */ | ||
488 | spin_unlock_irq(&mapping->tree_lock); | ||
489 | mem_cgroup_uncharge_cache_page(page); | ||
490 | page_cache_release(page); | ||
491 | } | ||
492 | radix_tree_preload_end(); | ||
493 | } else | ||
494 | mem_cgroup_uncharge_cache_page(page); | 474 | mem_cgroup_uncharge_cache_page(page); |
495 | out: | 475 | return error; |
476 | } | ||
477 | |||
478 | page_cache_get(page); | ||
479 | page->mapping = mapping; | ||
480 | page->index = offset; | ||
481 | |||
482 | spin_lock_irq(&mapping->tree_lock); | ||
483 | error = radix_tree_insert(&mapping->page_tree, offset, page); | ||
484 | radix_tree_preload_end(); | ||
485 | if (unlikely(error)) | ||
486 | goto err_insert; | ||
487 | mapping->nrpages++; | ||
488 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
489 | spin_unlock_irq(&mapping->tree_lock); | ||
490 | trace_mm_filemap_add_to_page_cache(page); | ||
491 | return 0; | ||
492 | err_insert: | ||
493 | page->mapping = NULL; | ||
494 | /* Leave page->index set: truncation relies upon it */ | ||
495 | spin_unlock_irq(&mapping->tree_lock); | ||
496 | mem_cgroup_uncharge_cache_page(page); | ||
497 | page_cache_release(page); | ||
496 | return error; | 498 | return error; |
497 | } | 499 | } |
498 | EXPORT_SYMBOL(add_to_page_cache_locked); | 500 | EXPORT_SYMBOL(add_to_page_cache_locked); |
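For readability, here is the reworked sequence as it stands after this patch, condensed from the new side of the hunk above (a sketch of the function body only; the prologue and declarations are elided):

```c
	/* Charge the page to the memcg first; nothing to undo yet on failure. */
	error = mem_cgroup_cache_charge(page, current->mm,
					gfp_mask & GFP_RECLAIM_MASK);
	if (error)
		return error;

	/* A preload failure only needs the charge rolled back. */
	error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
	if (error) {
		mem_cgroup_uncharge_cache_page(page);
		return error;
	}

	page_cache_get(page);
	page->mapping = mapping;
	page->index = offset;

	spin_lock_irq(&mapping->tree_lock);
	error = radix_tree_insert(&mapping->page_tree, offset, page);
	radix_tree_preload_end();
	if (unlikely(error))
		goto err_insert;
	mapping->nrpages++;
	__inc_zone_page_state(page, NR_FILE_PAGES);
	spin_unlock_irq(&mapping->tree_lock);
	trace_mm_filemap_add_to_page_cache(page);
	return 0;

err_insert:
	page->mapping = NULL;
	/* Leave page->index set: truncation relies upon it */
	spin_unlock_irq(&mapping->tree_lock);
	mem_cgroup_uncharge_cache_page(page);
	page_cache_release(page);
	return error;
```

Each failure now unwinds exactly what has been set up so far and returns directly, instead of funneling through nested if/else blocks.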
@@ -1614,6 +1616,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1614 | struct inode *inode = mapping->host; | 1616 | struct inode *inode = mapping->host; |
1615 | pgoff_t offset = vmf->pgoff; | 1617 | pgoff_t offset = vmf->pgoff; |
1616 | struct page *page; | 1618 | struct page *page; |
1619 | bool memcg_oom; | ||
1617 | pgoff_t size; | 1620 | pgoff_t size; |
1618 | int ret = 0; | 1621 | int ret = 0; |
1619 | 1622 | ||
@@ -1622,7 +1625,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1622 | return VM_FAULT_SIGBUS; | 1625 | return VM_FAULT_SIGBUS; |
1623 | 1626 | ||
1624 | /* | 1627 | /* |
1625 | * Do we have something in the page cache already? | 1628 | * Do we have something in the page cache already? Either |
1629 | * way, try readahead, but disable the memcg OOM killer for it | ||
1630 | * as readahead is optional and no errors are propagated up | ||
1631 | * the fault stack. The OOM killer is enabled while trying to | ||
1632 | * instantiate the faulting page individually below. | ||
1626 | */ | 1633 | */ |
1627 | page = find_get_page(mapping, offset); | 1634 | page = find_get_page(mapping, offset); |
1628 | if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { | 1635 | if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { |
@@ -1630,10 +1637,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1630 | * We found the page, so try async readahead before | 1637 | * We found the page, so try async readahead before |
1631 | * waiting for the lock. | 1638 | * waiting for the lock. |
1632 | */ | 1639 | */ |
1640 | memcg_oom = mem_cgroup_toggle_oom(false); | ||
1633 | do_async_mmap_readahead(vma, ra, file, page, offset); | 1641 | do_async_mmap_readahead(vma, ra, file, page, offset); |
1642 | mem_cgroup_toggle_oom(memcg_oom); | ||
1634 | } else if (!page) { | 1643 | } else if (!page) { |
1635 | /* No page in the page cache at all */ | 1644 | /* No page in the page cache at all */ |
1645 | memcg_oom = mem_cgroup_toggle_oom(false); | ||
1636 | do_sync_mmap_readahead(vma, ra, file, offset); | 1646 | do_sync_mmap_readahead(vma, ra, file, offset); |
1647 | mem_cgroup_toggle_oom(memcg_oom); | ||
1637 | count_vm_event(PGMAJFAULT); | 1648 | count_vm_event(PGMAJFAULT); |
1638 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); | 1649 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); |
1639 | ret = VM_FAULT_MAJOR; | 1650 | ret = VM_FAULT_MAJOR; |
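The two readahead hunks above follow the same save/disable/restore pattern: readahead allocations are speculative, so a memcg limit hit there should fail quietly rather than trigger the OOM killer, which stays enabled for the page the fault actually needs. A minimal sketch of the pattern (mem_cgroup_toggle_oom() itself is defined in the non-mm/ part of this series and is not shown in this excerpt):

```c
	bool memcg_oom;

	memcg_oom = mem_cgroup_toggle_oom(false);	/* disable, remember old state */
	do_sync_mmap_readahead(vma, ra, file, offset);	/* optional work, may fail quietly */
	mem_cgroup_toggle_oom(memcg_oom);		/* restore for the real fault */
```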
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d66010e0049d..7489884682d8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -695,11 +695,10 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | |||
695 | return pmd; | 695 | return pmd; |
696 | } | 696 | } |
697 | 697 | ||
698 | static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) | 698 | static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) |
699 | { | 699 | { |
700 | pmd_t entry; | 700 | pmd_t entry; |
701 | entry = mk_pmd(page, vma->vm_page_prot); | 701 | entry = mk_pmd(page, prot); |
702 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
703 | entry = pmd_mkhuge(entry); | 702 | entry = pmd_mkhuge(entry); |
704 | return entry; | 703 | return entry; |
705 | } | 704 | } |
@@ -732,7 +731,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
732 | pte_free(mm, pgtable); | 731 | pte_free(mm, pgtable); |
733 | } else { | 732 | } else { |
734 | pmd_t entry; | 733 | pmd_t entry; |
735 | entry = mk_huge_pmd(page, vma); | 734 | entry = mk_huge_pmd(page, vma->vm_page_prot); |
735 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
736 | page_add_new_anon_rmap(page, vma, haddr); | 736 | page_add_new_anon_rmap(page, vma, haddr); |
737 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 737 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
738 | set_pmd_at(mm, haddr, pmd, entry); | 738 | set_pmd_at(mm, haddr, pmd, entry); |
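mk_huge_pmd() now only derives the huge PMD from a page and a protection; marking it dirty and (possibly) writable is left to the caller, since that decision is not identical at every call site. A condensed view of the new helper plus the caller-side idiom used in the hunks above and below (a sketch, not a full function):

```c
static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
{
	pmd_t entry;
	entry = mk_pmd(page, prot);
	entry = pmd_mkhuge(entry);
	return entry;
}

	/* Call sites that install a writable anonymous mapping then do: */
	entry = mk_huge_pmd(page, vma->vm_page_prot);
	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
```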
@@ -788,77 +788,57 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
788 | { | 788 | { |
789 | struct page *page; | 789 | struct page *page; |
790 | unsigned long haddr = address & HPAGE_PMD_MASK; | 790 | unsigned long haddr = address & HPAGE_PMD_MASK; |
791 | pte_t *pte; | ||
792 | 791 | ||
793 | if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { | 792 | if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) |
794 | if (unlikely(anon_vma_prepare(vma))) | 793 | return VM_FAULT_FALLBACK; |
795 | return VM_FAULT_OOM; | 794 | if (unlikely(anon_vma_prepare(vma))) |
796 | if (unlikely(khugepaged_enter(vma))) | 795 | return VM_FAULT_OOM; |
796 | if (unlikely(khugepaged_enter(vma))) | ||
797 | return VM_FAULT_OOM; | ||
798 | if (!(flags & FAULT_FLAG_WRITE) && | ||
799 | transparent_hugepage_use_zero_page()) { | ||
800 | pgtable_t pgtable; | ||
801 | struct page *zero_page; | ||
802 | bool set; | ||
803 | pgtable = pte_alloc_one(mm, haddr); | ||
804 | if (unlikely(!pgtable)) | ||
797 | return VM_FAULT_OOM; | 805 | return VM_FAULT_OOM; |
798 | if (!(flags & FAULT_FLAG_WRITE) && | 806 | zero_page = get_huge_zero_page(); |
799 | transparent_hugepage_use_zero_page()) { | 807 | if (unlikely(!zero_page)) { |
800 | pgtable_t pgtable; | 808 | pte_free(mm, pgtable); |
801 | struct page *zero_page; | ||
802 | bool set; | ||
803 | pgtable = pte_alloc_one(mm, haddr); | ||
804 | if (unlikely(!pgtable)) | ||
805 | return VM_FAULT_OOM; | ||
806 | zero_page = get_huge_zero_page(); | ||
807 | if (unlikely(!zero_page)) { | ||
808 | pte_free(mm, pgtable); | ||
809 | count_vm_event(THP_FAULT_FALLBACK); | ||
810 | goto out; | ||
811 | } | ||
812 | spin_lock(&mm->page_table_lock); | ||
813 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, | ||
814 | zero_page); | ||
815 | spin_unlock(&mm->page_table_lock); | ||
816 | if (!set) { | ||
817 | pte_free(mm, pgtable); | ||
818 | put_huge_zero_page(); | ||
819 | } | ||
820 | return 0; | ||
821 | } | ||
822 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | ||
823 | vma, haddr, numa_node_id(), 0); | ||
824 | if (unlikely(!page)) { | ||
825 | count_vm_event(THP_FAULT_FALLBACK); | 809 | count_vm_event(THP_FAULT_FALLBACK); |
826 | goto out; | 810 | return VM_FAULT_FALLBACK; |
827 | } | ||
828 | count_vm_event(THP_FAULT_ALLOC); | ||
829 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { | ||
830 | put_page(page); | ||
831 | goto out; | ||
832 | } | 811 | } |
833 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, | 812 | spin_lock(&mm->page_table_lock); |
834 | page))) { | 813 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, |
835 | mem_cgroup_uncharge_page(page); | 814 | zero_page); |
836 | put_page(page); | 815 | spin_unlock(&mm->page_table_lock); |
837 | goto out; | 816 | if (!set) { |
817 | pte_free(mm, pgtable); | ||
818 | put_huge_zero_page(); | ||
838 | } | 819 | } |
839 | |||
840 | return 0; | 820 | return 0; |
841 | } | 821 | } |
842 | out: | 822 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
843 | /* | 823 | vma, haddr, numa_node_id(), 0); |
844 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 824 | if (unlikely(!page)) { |
845 | * run pte_offset_map on the pmd, if an huge pmd could | 825 | count_vm_event(THP_FAULT_FALLBACK); |
846 | * materialize from under us from a different thread. | 826 | return VM_FAULT_FALLBACK; |
847 | */ | 827 | } |
848 | if (unlikely(pmd_none(*pmd)) && | 828 | if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { |
849 | unlikely(__pte_alloc(mm, vma, pmd, address))) | 829 | put_page(page); |
850 | return VM_FAULT_OOM; | 830 | count_vm_event(THP_FAULT_FALLBACK); |
851 | /* if an huge pmd materialized from under us just retry later */ | 831 | return VM_FAULT_FALLBACK; |
852 | if (unlikely(pmd_trans_huge(*pmd))) | 832 | } |
853 | return 0; | 833 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { |
854 | /* | 834 | mem_cgroup_uncharge_page(page); |
855 | * A regular pmd is established and it can't morph into a huge pmd | 835 | put_page(page); |
856 | * from under us anymore at this point because we hold the mmap_sem | 836 | count_vm_event(THP_FAULT_FALLBACK); |
857 | * read mode and khugepaged takes it in write mode. So now it's | 837 | return VM_FAULT_FALLBACK; |
858 | * safe to run pte_offset_map(). | 838 | } |
859 | */ | 839 | |
860 | pte = pte_offset_map(pmd, address); | 840 | count_vm_event(THP_FAULT_ALLOC); |
861 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | 841 | return 0; |
862 | } | 842 | } |
863 | 843 | ||
864 | int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 844 | int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
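After this hunk, do_huge_pmd_anonymous_page() no longer falls back to the PTE path itself (the old out: label and handle_pte_fault() tail are gone); it returns the new VM_FAULT_FALLBACK bit and lets the caller retry with regular pages. The caller-side change is in mm/memory.c (see the diffstat) and is not part of this excerpt; a rough sketch of the expected pattern, offered as an assumption:

```c
	/* Hypothetical caller-side sketch; the real hunk lives in mm/memory.c. */
	if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
		int ret = VM_FAULT_FALLBACK;

		if (!vma->vm_ops)
			ret = do_huge_pmd_anonymous_page(mm, vma, address,
							 pmd, flags);
		if (!(ret & VM_FAULT_FALLBACK))
			return ret;
		/* otherwise fall through and handle the fault with ptes */
	}
```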
@@ -1170,7 +1150,6 @@ alloc: | |||
1170 | new_page = NULL; | 1150 | new_page = NULL; |
1171 | 1151 | ||
1172 | if (unlikely(!new_page)) { | 1152 | if (unlikely(!new_page)) { |
1173 | count_vm_event(THP_FAULT_FALLBACK); | ||
1174 | if (is_huge_zero_pmd(orig_pmd)) { | 1153 | if (is_huge_zero_pmd(orig_pmd)) { |
1175 | ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, | 1154 | ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, |
1176 | address, pmd, orig_pmd, haddr); | 1155 | address, pmd, orig_pmd, haddr); |
@@ -1181,9 +1160,9 @@ alloc: | |||
1181 | split_huge_page(page); | 1160 | split_huge_page(page); |
1182 | put_page(page); | 1161 | put_page(page); |
1183 | } | 1162 | } |
1163 | count_vm_event(THP_FAULT_FALLBACK); | ||
1184 | goto out; | 1164 | goto out; |
1185 | } | 1165 | } |
1186 | count_vm_event(THP_FAULT_ALLOC); | ||
1187 | 1166 | ||
1188 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 1167 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { |
1189 | put_page(new_page); | 1168 | put_page(new_page); |
@@ -1191,10 +1170,13 @@ alloc: | |||
1191 | split_huge_page(page); | 1170 | split_huge_page(page); |
1192 | put_page(page); | 1171 | put_page(page); |
1193 | } | 1172 | } |
1173 | count_vm_event(THP_FAULT_FALLBACK); | ||
1194 | ret |= VM_FAULT_OOM; | 1174 | ret |= VM_FAULT_OOM; |
1195 | goto out; | 1175 | goto out; |
1196 | } | 1176 | } |
1197 | 1177 | ||
1178 | count_vm_event(THP_FAULT_ALLOC); | ||
1179 | |||
1198 | if (is_huge_zero_pmd(orig_pmd)) | 1180 | if (is_huge_zero_pmd(orig_pmd)) |
1199 | clear_huge_page(new_page, haddr, HPAGE_PMD_NR); | 1181 | clear_huge_page(new_page, haddr, HPAGE_PMD_NR); |
1200 | else | 1182 | else |
@@ -1215,7 +1197,8 @@ alloc: | |||
1215 | goto out_mn; | 1197 | goto out_mn; |
1216 | } else { | 1198 | } else { |
1217 | pmd_t entry; | 1199 | pmd_t entry; |
1218 | entry = mk_huge_pmd(new_page, vma); | 1200 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); |
1201 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
1219 | pmdp_clear_flush(vma, haddr, pmd); | 1202 | pmdp_clear_flush(vma, haddr, pmd); |
1220 | page_add_new_anon_rmap(new_page, vma, haddr); | 1203 | page_add_new_anon_rmap(new_page, vma, haddr); |
1221 | set_pmd_at(mm, haddr, pmd, entry); | 1204 | set_pmd_at(mm, haddr, pmd, entry); |
@@ -1666,7 +1649,6 @@ static void __split_huge_page_refcount(struct page *page, | |||
1666 | BUG_ON(atomic_read(&page->_count) <= 0); | 1649 | BUG_ON(atomic_read(&page->_count) <= 0); |
1667 | 1650 | ||
1668 | __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); | 1651 | __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); |
1669 | __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); | ||
1670 | 1652 | ||
1671 | ClearPageCompound(page); | 1653 | ClearPageCompound(page); |
1672 | compound_unlock(page); | 1654 | compound_unlock(page); |
@@ -2364,7 +2346,8 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2364 | __SetPageUptodate(new_page); | 2346 | __SetPageUptodate(new_page); |
2365 | pgtable = pmd_pgtable(_pmd); | 2347 | pgtable = pmd_pgtable(_pmd); |
2366 | 2348 | ||
2367 | _pmd = mk_huge_pmd(new_page, vma); | 2349 | _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); |
2350 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | ||
2368 | 2351 | ||
2369 | /* | 2352 | /* |
2370 | * spin_lock() below is not the equivalent of smp_wmb(), so | 2353 | * spin_lock() below is not the equivalent of smp_wmb(), so |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c6bd28edd533..d5ff3ce13029 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,7 +39,6 @@ | |||
39 | #include <linux/limits.h> | 39 | #include <linux/limits.h> |
40 | #include <linux/export.h> | 40 | #include <linux/export.h> |
41 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
42 | #include <linux/rbtree.h> | ||
43 | #include <linux/slab.h> | 42 | #include <linux/slab.h> |
44 | #include <linux/swap.h> | 43 | #include <linux/swap.h> |
45 | #include <linux/swapops.h> | 44 | #include <linux/swapops.h> |
@@ -85,26 +84,12 @@ static int really_do_swap_account __initdata = 0; | |||
85 | #endif | 84 | #endif |
86 | 85 | ||
87 | 86 | ||
88 | /* | ||
89 | * Statistics for memory cgroup. | ||
90 | */ | ||
91 | enum mem_cgroup_stat_index { | ||
92 | /* | ||
93 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. | ||
94 | */ | ||
95 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | ||
96 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | ||
97 | MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ | ||
98 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | ||
99 | MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ | ||
100 | MEM_CGROUP_STAT_NSTATS, | ||
101 | }; | ||
102 | |||
103 | static const char * const mem_cgroup_stat_names[] = { | 87 | static const char * const mem_cgroup_stat_names[] = { |
104 | "cache", | 88 | "cache", |
105 | "rss", | 89 | "rss", |
106 | "rss_huge", | 90 | "rss_huge", |
107 | "mapped_file", | 91 | "mapped_file", |
92 | "writeback", | ||
108 | "swap", | 93 | "swap", |
109 | }; | 94 | }; |
110 | 95 | ||
@@ -175,10 +160,6 @@ struct mem_cgroup_per_zone { | |||
175 | 160 | ||
176 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | 161 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; |
177 | 162 | ||
178 | struct rb_node tree_node; /* RB tree node */ | ||
179 | unsigned long long usage_in_excess;/* Set to the value by which */ | ||
180 | /* the soft limit is exceeded*/ | ||
181 | bool on_tree; | ||
182 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ | 163 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ |
183 | /* use container_of */ | 164 | /* use container_of */ |
184 | }; | 165 | }; |
@@ -187,26 +168,6 @@ struct mem_cgroup_per_node { | |||
187 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | 168 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; |
188 | }; | 169 | }; |
189 | 170 | ||
190 | /* | ||
191 | * Cgroups above their limits are maintained in a RB-Tree, independent of | ||
192 | * their hierarchy representation | ||
193 | */ | ||
194 | |||
195 | struct mem_cgroup_tree_per_zone { | ||
196 | struct rb_root rb_root; | ||
197 | spinlock_t lock; | ||
198 | }; | ||
199 | |||
200 | struct mem_cgroup_tree_per_node { | ||
201 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; | ||
202 | }; | ||
203 | |||
204 | struct mem_cgroup_tree { | ||
205 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; | ||
206 | }; | ||
207 | |||
208 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | ||
209 | |||
210 | struct mem_cgroup_threshold { | 171 | struct mem_cgroup_threshold { |
211 | struct eventfd_ctx *eventfd; | 172 | struct eventfd_ctx *eventfd; |
212 | u64 threshold; | 173 | u64 threshold; |
@@ -280,6 +241,7 @@ struct mem_cgroup { | |||
280 | 241 | ||
281 | bool oom_lock; | 242 | bool oom_lock; |
282 | atomic_t under_oom; | 243 | atomic_t under_oom; |
244 | atomic_t oom_wakeups; | ||
283 | 245 | ||
284 | int swappiness; | 246 | int swappiness; |
285 | /* OOM-Killer disable */ | 247 | /* OOM-Killer disable */ |
@@ -304,7 +266,7 @@ struct mem_cgroup { | |||
304 | * Should we move charges of a task when a task is moved into this | 266 | * Should we move charges of a task when a task is moved into this |
305 | * mem_cgroup ? And what type of charges should we move ? | 267 | * mem_cgroup ? And what type of charges should we move ? |
306 | */ | 268 | */ |
307 | unsigned long move_charge_at_immigrate; | 269 | unsigned long move_charge_at_immigrate; |
308 | /* | 270 | /* |
309 | * set > 0 if pages under this cgroup are moving to other cgroup. | 271 | * set > 0 if pages under this cgroup are moving to other cgroup. |
310 | */ | 272 | */ |
@@ -341,6 +303,22 @@ struct mem_cgroup { | |||
341 | atomic_t numainfo_events; | 303 | atomic_t numainfo_events; |
342 | atomic_t numainfo_updating; | 304 | atomic_t numainfo_updating; |
343 | #endif | 305 | #endif |
306 | /* | ||
307 | * Protects soft_contributed transitions. | ||
308 | * See mem_cgroup_update_soft_limit | ||
309 | */ | ||
310 | spinlock_t soft_lock; | ||
311 | |||
312 | /* | ||
313 | * If true then this group has increased parents' children_in_excess | ||
314 | * when it got over the soft limit. | ||
315 | * When a group falls bellow the soft limit, parents' children_in_excess | ||
316 | * is decreased and soft_contributed changed to false. | ||
317 | */ | ||
318 | bool soft_contributed; | ||
319 | |||
320 | /* Number of children that are in soft limit excess */ | ||
321 | atomic_t children_in_excess; | ||
344 | 322 | ||
345 | struct mem_cgroup_per_node *nodeinfo[0]; | 323 | struct mem_cgroup_per_node *nodeinfo[0]; |
346 | /* WARNING: nodeinfo must be the last member here */ | 324 | /* WARNING: nodeinfo must be the last member here */ |
@@ -444,7 +422,6 @@ static bool move_file(void) | |||
444 | * limit reclaim to prevent infinite loops, if they ever occur. | 422 | * limit reclaim to prevent infinite loops, if they ever occur. |
445 | */ | 423 | */ |
446 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 | 424 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 |
447 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 | ||
448 | 425 | ||
449 | enum charge_type { | 426 | enum charge_type { |
450 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 427 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
@@ -671,164 +648,6 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) | |||
671 | return mem_cgroup_zoneinfo(memcg, nid, zid); | 648 | return mem_cgroup_zoneinfo(memcg, nid, zid); |
672 | } | 649 | } |
673 | 650 | ||
674 | static struct mem_cgroup_tree_per_zone * | ||
675 | soft_limit_tree_node_zone(int nid, int zid) | ||
676 | { | ||
677 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
678 | } | ||
679 | |||
680 | static struct mem_cgroup_tree_per_zone * | ||
681 | soft_limit_tree_from_page(struct page *page) | ||
682 | { | ||
683 | int nid = page_to_nid(page); | ||
684 | int zid = page_zonenum(page); | ||
685 | |||
686 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
687 | } | ||
688 | |||
689 | static void | ||
690 | __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, | ||
691 | struct mem_cgroup_per_zone *mz, | ||
692 | struct mem_cgroup_tree_per_zone *mctz, | ||
693 | unsigned long long new_usage_in_excess) | ||
694 | { | ||
695 | struct rb_node **p = &mctz->rb_root.rb_node; | ||
696 | struct rb_node *parent = NULL; | ||
697 | struct mem_cgroup_per_zone *mz_node; | ||
698 | |||
699 | if (mz->on_tree) | ||
700 | return; | ||
701 | |||
702 | mz->usage_in_excess = new_usage_in_excess; | ||
703 | if (!mz->usage_in_excess) | ||
704 | return; | ||
705 | while (*p) { | ||
706 | parent = *p; | ||
707 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, | ||
708 | tree_node); | ||
709 | if (mz->usage_in_excess < mz_node->usage_in_excess) | ||
710 | p = &(*p)->rb_left; | ||
711 | /* | ||
712 | * We can't avoid mem cgroups that are over their soft | ||
713 | * limit by the same amount | ||
714 | */ | ||
715 | else if (mz->usage_in_excess >= mz_node->usage_in_excess) | ||
716 | p = &(*p)->rb_right; | ||
717 | } | ||
718 | rb_link_node(&mz->tree_node, parent, p); | ||
719 | rb_insert_color(&mz->tree_node, &mctz->rb_root); | ||
720 | mz->on_tree = true; | ||
721 | } | ||
722 | |||
723 | static void | ||
724 | __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, | ||
725 | struct mem_cgroup_per_zone *mz, | ||
726 | struct mem_cgroup_tree_per_zone *mctz) | ||
727 | { | ||
728 | if (!mz->on_tree) | ||
729 | return; | ||
730 | rb_erase(&mz->tree_node, &mctz->rb_root); | ||
731 | mz->on_tree = false; | ||
732 | } | ||
733 | |||
734 | static void | ||
735 | mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, | ||
736 | struct mem_cgroup_per_zone *mz, | ||
737 | struct mem_cgroup_tree_per_zone *mctz) | ||
738 | { | ||
739 | spin_lock(&mctz->lock); | ||
740 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
741 | spin_unlock(&mctz->lock); | ||
742 | } | ||
743 | |||
744 | |||
745 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) | ||
746 | { | ||
747 | unsigned long long excess; | ||
748 | struct mem_cgroup_per_zone *mz; | ||
749 | struct mem_cgroup_tree_per_zone *mctz; | ||
750 | int nid = page_to_nid(page); | ||
751 | int zid = page_zonenum(page); | ||
752 | mctz = soft_limit_tree_from_page(page); | ||
753 | |||
754 | /* | ||
755 | * Necessary to update all ancestors when hierarchy is used. | ||
756 | * because their event counter is not touched. | ||
757 | */ | ||
758 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { | ||
759 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); | ||
760 | excess = res_counter_soft_limit_excess(&memcg->res); | ||
761 | /* | ||
762 | * We have to update the tree if mz is on RB-tree or | ||
763 | * mem is over its softlimit. | ||
764 | */ | ||
765 | if (excess || mz->on_tree) { | ||
766 | spin_lock(&mctz->lock); | ||
767 | /* if on-tree, remove it */ | ||
768 | if (mz->on_tree) | ||
769 | __mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
770 | /* | ||
771 | * Insert again. mz->usage_in_excess will be updated. | ||
772 | * If excess is 0, no tree ops. | ||
773 | */ | ||
774 | __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); | ||
775 | spin_unlock(&mctz->lock); | ||
776 | } | ||
777 | } | ||
778 | } | ||
779 | |||
780 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) | ||
781 | { | ||
782 | int node, zone; | ||
783 | struct mem_cgroup_per_zone *mz; | ||
784 | struct mem_cgroup_tree_per_zone *mctz; | ||
785 | |||
786 | for_each_node(node) { | ||
787 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
788 | mz = mem_cgroup_zoneinfo(memcg, node, zone); | ||
789 | mctz = soft_limit_tree_node_zone(node, zone); | ||
790 | mem_cgroup_remove_exceeded(memcg, mz, mctz); | ||
791 | } | ||
792 | } | ||
793 | } | ||
794 | |||
795 | static struct mem_cgroup_per_zone * | ||
796 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
797 | { | ||
798 | struct rb_node *rightmost = NULL; | ||
799 | struct mem_cgroup_per_zone *mz; | ||
800 | |||
801 | retry: | ||
802 | mz = NULL; | ||
803 | rightmost = rb_last(&mctz->rb_root); | ||
804 | if (!rightmost) | ||
805 | goto done; /* Nothing to reclaim from */ | ||
806 | |||
807 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); | ||
808 | /* | ||
809 | * Remove the node now but someone else can add it back, | ||
810 | * we will to add it back at the end of reclaim to its correct | ||
811 | * position in the tree. | ||
812 | */ | ||
813 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); | ||
814 | if (!res_counter_soft_limit_excess(&mz->memcg->res) || | ||
815 | !css_tryget(&mz->memcg->css)) | ||
816 | goto retry; | ||
817 | done: | ||
818 | return mz; | ||
819 | } | ||
820 | |||
821 | static struct mem_cgroup_per_zone * | ||
822 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
823 | { | ||
824 | struct mem_cgroup_per_zone *mz; | ||
825 | |||
826 | spin_lock(&mctz->lock); | ||
827 | mz = __mem_cgroup_largest_soft_limit_node(mctz); | ||
828 | spin_unlock(&mctz->lock); | ||
829 | return mz; | ||
830 | } | ||
831 | |||
832 | /* | 651 | /* |
833 | * Implementation Note: reading percpu statistics for memcg. | 652 | * Implementation Note: reading percpu statistics for memcg. |
834 | * | 653 | * |
@@ -1003,6 +822,48 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, | |||
1003 | } | 822 | } |
1004 | 823 | ||
1005 | /* | 824 | /* |
825 | * Called from rate-limited memcg_check_events when enough | ||
826 | * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure | ||
827 | * that all the parents up the hierarchy will be notified that this group | ||
828 | * is in excess or that it is not in excess anymore. mmecg->soft_contributed | ||
829 | * makes the transition a single action whenever the state flips from one to | ||
830 | * the other. | ||
831 | */ | ||
832 | static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg) | ||
833 | { | ||
834 | unsigned long long excess = res_counter_soft_limit_excess(&memcg->res); | ||
835 | struct mem_cgroup *parent = memcg; | ||
836 | int delta = 0; | ||
837 | |||
838 | spin_lock(&memcg->soft_lock); | ||
839 | if (excess) { | ||
840 | if (!memcg->soft_contributed) { | ||
841 | delta = 1; | ||
842 | memcg->soft_contributed = true; | ||
843 | } | ||
844 | } else { | ||
845 | if (memcg->soft_contributed) { | ||
846 | delta = -1; | ||
847 | memcg->soft_contributed = false; | ||
848 | } | ||
849 | } | ||
850 | |||
851 | /* | ||
852 | * Necessary to update all ancestors when hierarchy is used | ||
853 | * because their event counter is not touched. | ||
854 | * We track children even outside the hierarchy for the root | ||
855 | * cgroup because tree walk starting at root should visit | ||
856 | * all cgroups and we want to prevent from pointless tree | ||
857 | * walk if no children is below the limit. | ||
858 | */ | ||
859 | while (delta && (parent = parent_mem_cgroup(parent))) | ||
860 | atomic_add(delta, &parent->children_in_excess); | ||
861 | if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) | ||
862 | atomic_add(delta, &root_mem_cgroup->children_in_excess); | ||
863 | spin_unlock(&memcg->soft_lock); | ||
864 | } | ||
865 | |||
866 | /* | ||
1006 | * Check events in order. | 867 | * Check events in order. |
1007 | * | 868 | * |
1008 | */ | 869 | */ |
@@ -1025,7 +886,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) | |||
1025 | 886 | ||
1026 | mem_cgroup_threshold(memcg); | 887 | mem_cgroup_threshold(memcg); |
1027 | if (unlikely(do_softlimit)) | 888 | if (unlikely(do_softlimit)) |
1028 | mem_cgroup_update_tree(memcg, page); | 889 | mem_cgroup_update_soft_limit(memcg); |
1029 | #if MAX_NUMNODES > 1 | 890 | #if MAX_NUMNODES > 1 |
1030 | if (unlikely(do_numainfo)) | 891 | if (unlikely(do_numainfo)) |
1031 | atomic_inc(&memcg->numainfo_events); | 892 | atomic_inc(&memcg->numainfo_events); |
@@ -1068,6 +929,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
1068 | return memcg; | 929 | return memcg; |
1069 | } | 930 | } |
1070 | 931 | ||
932 | static enum mem_cgroup_filter_t | ||
933 | mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root, | ||
934 | mem_cgroup_iter_filter cond) | ||
935 | { | ||
936 | if (!cond) | ||
937 | return VISIT; | ||
938 | return cond(memcg, root); | ||
939 | } | ||
940 | |||
1071 | /* | 941 | /* |
1072 | * Returns a next (in a pre-order walk) alive memcg (with elevated css | 942 | * Returns a next (in a pre-order walk) alive memcg (with elevated css |
1073 | * ref. count) or NULL if the whole root's subtree has been visited. | 943 | * ref. count) or NULL if the whole root's subtree has been visited. |
@@ -1075,7 +945,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
1075 | * helper function to be used by mem_cgroup_iter | 945 | * helper function to be used by mem_cgroup_iter |
1076 | */ | 946 | */ |
1077 | static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, | 947 | static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, |
1078 | struct mem_cgroup *last_visited) | 948 | struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) |
1079 | { | 949 | { |
1080 | struct cgroup_subsys_state *prev_css, *next_css; | 950 | struct cgroup_subsys_state *prev_css, *next_css; |
1081 | 951 | ||
@@ -1093,11 +963,31 @@ skip_node: | |||
1093 | if (next_css) { | 963 | if (next_css) { |
1094 | struct mem_cgroup *mem = mem_cgroup_from_css(next_css); | 964 | struct mem_cgroup *mem = mem_cgroup_from_css(next_css); |
1095 | 965 | ||
1096 | if (css_tryget(&mem->css)) | 966 | switch (mem_cgroup_filter(mem, root, cond)) { |
1097 | return mem; | 967 | case SKIP: |
1098 | else { | ||
1099 | prev_css = next_css; | 968 | prev_css = next_css; |
1100 | goto skip_node; | 969 | goto skip_node; |
970 | case SKIP_TREE: | ||
971 | if (mem == root) | ||
972 | return NULL; | ||
973 | /* | ||
974 | * css_rightmost_descendant is not an optimal way to | ||
975 | * skip through a subtree (especially for imbalanced | ||
976 | * trees leaning to right) but that's what we have right | ||
977 | * now. More effective solution would be traversing | ||
978 | * right-up for first non-NULL without calling | ||
979 | * css_next_descendant_pre afterwards. | ||
980 | */ | ||
981 | prev_css = css_rightmost_descendant(next_css); | ||
982 | goto skip_node; | ||
983 | case VISIT: | ||
984 | if (css_tryget(&mem->css)) | ||
985 | return mem; | ||
986 | else { | ||
987 | prev_css = next_css; | ||
988 | goto skip_node; | ||
989 | } | ||
990 | break; | ||
1101 | } | 991 | } |
1102 | } | 992 | } |
1103 | 993 | ||
@@ -1161,6 +1051,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | |||
1161 | * @root: hierarchy root | 1051 | * @root: hierarchy root |
1162 | * @prev: previously returned memcg, NULL on first invocation | 1052 | * @prev: previously returned memcg, NULL on first invocation |
1163 | * @reclaim: cookie for shared reclaim walks, NULL for full walks | 1053 | * @reclaim: cookie for shared reclaim walks, NULL for full walks |
1054 | * @cond: filter for visited nodes, NULL for no filter | ||
1164 | * | 1055 | * |
1165 | * Returns references to children of the hierarchy below @root, or | 1056 | * Returns references to children of the hierarchy below @root, or |
1166 | * @root itself, or %NULL after a full round-trip. | 1057 | * @root itself, or %NULL after a full round-trip. |
@@ -1173,15 +1064,18 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | |||
1173 | * divide up the memcgs in the hierarchy among all concurrent | 1064 | * divide up the memcgs in the hierarchy among all concurrent |
1174 | * reclaimers operating on the same zone and priority. | 1065 | * reclaimers operating on the same zone and priority. |
1175 | */ | 1066 | */ |
1176 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | 1067 | struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, |
1177 | struct mem_cgroup *prev, | 1068 | struct mem_cgroup *prev, |
1178 | struct mem_cgroup_reclaim_cookie *reclaim) | 1069 | struct mem_cgroup_reclaim_cookie *reclaim, |
1070 | mem_cgroup_iter_filter cond) | ||
1179 | { | 1071 | { |
1180 | struct mem_cgroup *memcg = NULL; | 1072 | struct mem_cgroup *memcg = NULL; |
1181 | struct mem_cgroup *last_visited = NULL; | 1073 | struct mem_cgroup *last_visited = NULL; |
1182 | 1074 | ||
1183 | if (mem_cgroup_disabled()) | 1075 | if (mem_cgroup_disabled()) { |
1184 | return NULL; | 1076 | /* first call must return non-NULL, second return NULL */ |
1077 | return (struct mem_cgroup *)(unsigned long)!prev; | ||
1078 | } | ||
1185 | 1079 | ||
1186 | if (!root) | 1080 | if (!root) |
1187 | root = root_mem_cgroup; | 1081 | root = root_mem_cgroup; |
@@ -1192,7 +1086,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1192 | if (!root->use_hierarchy && root != root_mem_cgroup) { | 1086 | if (!root->use_hierarchy && root != root_mem_cgroup) { |
1193 | if (prev) | 1087 | if (prev) |
1194 | goto out_css_put; | 1088 | goto out_css_put; |
1195 | return root; | 1089 | if (mem_cgroup_filter(root, root, cond) == VISIT) |
1090 | return root; | ||
1091 | return NULL; | ||
1196 | } | 1092 | } |
1197 | 1093 | ||
1198 | rcu_read_lock(); | 1094 | rcu_read_lock(); |
@@ -1215,7 +1111,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1215 | last_visited = mem_cgroup_iter_load(iter, root, &seq); | 1111 | last_visited = mem_cgroup_iter_load(iter, root, &seq); |
1216 | } | 1112 | } |
1217 | 1113 | ||
1218 | memcg = __mem_cgroup_iter_next(root, last_visited); | 1114 | memcg = __mem_cgroup_iter_next(root, last_visited, cond); |
1219 | 1115 | ||
1220 | if (reclaim) { | 1116 | if (reclaim) { |
1221 | mem_cgroup_iter_update(iter, last_visited, memcg, seq); | 1117 | mem_cgroup_iter_update(iter, last_visited, memcg, seq); |
@@ -1226,7 +1122,11 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1226 | reclaim->generation = iter->generation; | 1122 | reclaim->generation = iter->generation; |
1227 | } | 1123 | } |
1228 | 1124 | ||
1229 | if (prev && !memcg) | 1125 | /* |
1126 | * We have finished the whole tree walk or no group has been | ||
1127 | * visited because filter told us to skip the root node. | ||
1128 | */ | ||
1129 | if (!memcg && (prev || (cond && !last_visited))) | ||
1230 | goto out_unlock; | 1130 | goto out_unlock; |
1231 | } | 1131 | } |
1232 | out_unlock: | 1132 | out_unlock: |
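mem_cgroup_iter_cond() lets a hierarchy walk skip single groups (SKIP) or prune whole subtrees (SKIP_TREE) based on a filter callback, instead of visiting every memcg and then ignoring most of them. The soft-limit filter, mem_cgroup_soft_reclaim_eligible(), is added further down in this file; the actual wiring into the reclaim loop is in mm/vmscan.c (see the diffstat). A rough sketch of such a walk, offered as an assumption:

```c
	struct mem_cgroup *memcg;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.zone = zone,
		.priority = priority,
	};

	/* Visit only groups that are over their soft limit (or sit below an
	 * ancestor that is); subtrees with no excess are pruned early. */
	memcg = mem_cgroup_iter_cond(root, NULL, &reclaim,
				     mem_cgroup_soft_reclaim_eligible);
	while (memcg) {
		/* ... shrink this memcg's LRU lists for @zone ... */
		memcg = mem_cgroup_iter_cond(root, memcg, &reclaim,
					     mem_cgroup_soft_reclaim_eligible);
	}
```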
@@ -1867,6 +1767,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | |||
1867 | return total; | 1767 | return total; |
1868 | } | 1768 | } |
1869 | 1769 | ||
1770 | #if MAX_NUMNODES > 1 | ||
1870 | /** | 1771 | /** |
1871 | * test_mem_cgroup_node_reclaimable | 1772 | * test_mem_cgroup_node_reclaimable |
1872 | * @memcg: the target memcg | 1773 | * @memcg: the target memcg |
@@ -1889,7 +1790,6 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, | |||
1889 | return false; | 1790 | return false; |
1890 | 1791 | ||
1891 | } | 1792 | } |
1892 | #if MAX_NUMNODES > 1 | ||
1893 | 1793 | ||
1894 | /* | 1794 | /* |
1895 | * Always updating the nodemask is not very good - even if we have an empty | 1795 | * Always updating the nodemask is not very good - even if we have an empty |
@@ -1957,115 +1857,64 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | |||
1957 | return node; | 1857 | return node; |
1958 | } | 1858 | } |
1959 | 1859 | ||
1960 | /* | ||
1961 | * Check all nodes whether it contains reclaimable pages or not. | ||
1962 | * For quick scan, we make use of scan_nodes. This will allow us to skip | ||
1963 | * unused nodes. But scan_nodes is lazily updated and may not cotain | ||
1964 | * enough new information. We need to do double check. | ||
1965 | */ | ||
1966 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | ||
1967 | { | ||
1968 | int nid; | ||
1969 | |||
1970 | /* | ||
1971 | * quick check...making use of scan_node. | ||
1972 | * We can skip unused nodes. | ||
1973 | */ | ||
1974 | if (!nodes_empty(memcg->scan_nodes)) { | ||
1975 | for (nid = first_node(memcg->scan_nodes); | ||
1976 | nid < MAX_NUMNODES; | ||
1977 | nid = next_node(nid, memcg->scan_nodes)) { | ||
1978 | |||
1979 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
1980 | return true; | ||
1981 | } | ||
1982 | } | ||
1983 | /* | ||
1984 | * Check rest of nodes. | ||
1985 | */ | ||
1986 | for_each_node_state(nid, N_MEMORY) { | ||
1987 | if (node_isset(nid, memcg->scan_nodes)) | ||
1988 | continue; | ||
1989 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
1990 | return true; | ||
1991 | } | ||
1992 | return false; | ||
1993 | } | ||
1994 | |||
1995 | #else | 1860 | #else |
1996 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | 1861 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) |
1997 | { | 1862 | { |
1998 | return 0; | 1863 | return 0; |
1999 | } | 1864 | } |
2000 | 1865 | ||
2001 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | ||
2002 | { | ||
2003 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); | ||
2004 | } | ||
2005 | #endif | 1866 | #endif |
2006 | 1867 | ||
2007 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, | 1868 | /* |
2008 | struct zone *zone, | 1869 | * A group is eligible for the soft limit reclaim under the given root |
2009 | gfp_t gfp_mask, | 1870 | * hierarchy if |
2010 | unsigned long *total_scanned) | 1871 | * a) it is over its soft limit |
2011 | { | 1872 | * b) any parent up the hierarchy is over its soft limit |
2012 | struct mem_cgroup *victim = NULL; | 1873 | * |
2013 | int total = 0; | 1874 | * If the given group doesn't have any children over the limit then it |
2014 | int loop = 0; | 1875 | * doesn't make any sense to iterate its subtree. |
2015 | unsigned long excess; | 1876 | */ |
2016 | unsigned long nr_scanned; | 1877 | enum mem_cgroup_filter_t |
2017 | struct mem_cgroup_reclaim_cookie reclaim = { | 1878 | mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg, |
2018 | .zone = zone, | 1879 | struct mem_cgroup *root) |
2019 | .priority = 0, | 1880 | { |
2020 | }; | 1881 | struct mem_cgroup *parent; |
2021 | 1882 | ||
2022 | excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; | 1883 | if (!memcg) |
2023 | 1884 | memcg = root_mem_cgroup; | |
2024 | while (1) { | 1885 | parent = memcg; |
2025 | victim = mem_cgroup_iter(root_memcg, victim, &reclaim); | 1886 | |
2026 | if (!victim) { | 1887 | if (res_counter_soft_limit_excess(&memcg->res)) |
2027 | loop++; | 1888 | return VISIT; |
2028 | if (loop >= 2) { | 1889 | |
2029 | /* | 1890 | /* |
2030 | * If we have not been able to reclaim | 1891 | * If any parent up to the root in the hierarchy is over its soft limit |
2031 | * anything, it might because there are | 1892 | * then we have to obey and reclaim from this group as well. |
2032 | * no reclaimable pages under this hierarchy | 1893 | */ |
2033 | */ | 1894 | while ((parent = parent_mem_cgroup(parent))) { |
2034 | if (!total) | 1895 | if (res_counter_soft_limit_excess(&parent->res)) |
2035 | break; | 1896 | return VISIT; |
2036 | /* | 1897 | if (parent == root) |
2037 | * We want to do more targeted reclaim. | ||
2038 | * excess >> 2 is not to excessive so as to | ||
2039 | * reclaim too much, nor too less that we keep | ||
2040 | * coming back to reclaim from this cgroup | ||
2041 | */ | ||
2042 | if (total >= (excess >> 2) || | ||
2043 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) | ||
2044 | break; | ||
2045 | } | ||
2046 | continue; | ||
2047 | } | ||
2048 | if (!mem_cgroup_reclaimable(victim, false)) | ||
2049 | continue; | ||
2050 | total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, | ||
2051 | zone, &nr_scanned); | ||
2052 | *total_scanned += nr_scanned; | ||
2053 | if (!res_counter_soft_limit_excess(&root_memcg->res)) | ||
2054 | break; | 1898 | break; |
2055 | } | 1899 | } |
2056 | mem_cgroup_iter_break(root_memcg, victim); | 1900 | |
2057 | return total; | 1901 | if (!atomic_read(&memcg->children_in_excess)) |
1902 | return SKIP_TREE; | ||
1903 | return SKIP; | ||
2058 | } | 1904 | } |
2059 | 1905 | ||
1906 | static DEFINE_SPINLOCK(memcg_oom_lock); | ||
1907 | |||
2060 | /* | 1908 | /* |
2061 | * Check OOM-Killer is already running under our hierarchy. | 1909 | * Check OOM-Killer is already running under our hierarchy. |
2062 | * If someone is running, return false. | 1910 | * If someone is running, return false. |
2063 | * Has to be called with memcg_oom_lock | ||
2064 | */ | 1911 | */ |
2065 | static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) | 1912 | static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) |
2066 | { | 1913 | { |
2067 | struct mem_cgroup *iter, *failed = NULL; | 1914 | struct mem_cgroup *iter, *failed = NULL; |
2068 | 1915 | ||
1916 | spin_lock(&memcg_oom_lock); | ||
1917 | |||
2069 | for_each_mem_cgroup_tree(iter, memcg) { | 1918 | for_each_mem_cgroup_tree(iter, memcg) { |
2070 | if (iter->oom_lock) { | 1919 | if (iter->oom_lock) { |
2071 | /* | 1920 | /* |
@@ -2079,33 +1928,33 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) | |||
2079 | iter->oom_lock = true; | 1928 | iter->oom_lock = true; |
2080 | } | 1929 | } |
2081 | 1930 | ||
2082 | if (!failed) | 1931 | if (failed) { |
2083 | return true; | 1932 | /* |
2084 | 1933 | * OK, we failed to lock the whole subtree so we have | |
2085 | /* | 1934 | * to clean up what we set up to the failing subtree |
2086 | * OK, we failed to lock the whole subtree so we have to clean up | 1935 | */ |
2087 | * what we set up to the failing subtree | 1936 | for_each_mem_cgroup_tree(iter, memcg) { |
2088 | */ | 1937 | if (iter == failed) { |
2089 | for_each_mem_cgroup_tree(iter, memcg) { | 1938 | mem_cgroup_iter_break(memcg, iter); |
2090 | if (iter == failed) { | 1939 | break; |
2091 | mem_cgroup_iter_break(memcg, iter); | 1940 | } |
2092 | break; | 1941 | iter->oom_lock = false; |
2093 | } | 1942 | } |
2094 | iter->oom_lock = false; | ||
2095 | } | 1943 | } |
2096 | return false; | 1944 | |
1945 | spin_unlock(&memcg_oom_lock); | ||
1946 | |||
1947 | return !failed; | ||
2097 | } | 1948 | } |
2098 | 1949 | ||
2099 | /* | 1950 | static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) |
2100 | * Has to be called with memcg_oom_lock | ||
2101 | */ | ||
2102 | static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) | ||
2103 | { | 1951 | { |
2104 | struct mem_cgroup *iter; | 1952 | struct mem_cgroup *iter; |
2105 | 1953 | ||
1954 | spin_lock(&memcg_oom_lock); | ||
2106 | for_each_mem_cgroup_tree(iter, memcg) | 1955 | for_each_mem_cgroup_tree(iter, memcg) |
2107 | iter->oom_lock = false; | 1956 | iter->oom_lock = false; |
2108 | return 0; | 1957 | spin_unlock(&memcg_oom_lock); |
2109 | } | 1958 | } |
2110 | 1959 | ||
2111 | static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) | 1960 | static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) |
@@ -2129,7 +1978,6 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) | |||
2129 | atomic_add_unless(&iter->under_oom, -1, 0); | 1978 | atomic_add_unless(&iter->under_oom, -1, 0); |
2130 | } | 1979 | } |
2131 | 1980 | ||
2132 | static DEFINE_SPINLOCK(memcg_oom_lock); | ||
2133 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1981 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
2134 | 1982 | ||
2135 | struct oom_wait_info { | 1983 | struct oom_wait_info { |
@@ -2159,6 +2007,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait, | |||
2159 | 2007 | ||
2160 | static void memcg_wakeup_oom(struct mem_cgroup *memcg) | 2008 | static void memcg_wakeup_oom(struct mem_cgroup *memcg) |
2161 | { | 2009 | { |
2010 | atomic_inc(&memcg->oom_wakeups); | ||
2162 | /* for filtering, pass "memcg" as argument. */ | 2011 | /* for filtering, pass "memcg" as argument. */ |
2163 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); | 2012 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); |
2164 | } | 2013 | } |
@@ -2170,56 +2019,136 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) | |||
2170 | } | 2019 | } |
2171 | 2020 | ||
2172 | /* | 2021 | /* |
2173 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 2022 | * try to call OOM killer |
2174 | */ | 2023 | */ |
2175 | static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, | 2024 | static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) |
2176 | int order) | ||
2177 | { | 2025 | { |
2178 | struct oom_wait_info owait; | 2026 | bool locked; |
2179 | bool locked, need_to_kill; | 2027 | int wakeups; |
2180 | 2028 | ||
2181 | owait.memcg = memcg; | 2029 | if (!current->memcg_oom.may_oom) |
2182 | owait.wait.flags = 0; | 2030 | return; |
2183 | owait.wait.func = memcg_oom_wake_function; | 2031 | |
2184 | owait.wait.private = current; | 2032 | current->memcg_oom.in_memcg_oom = 1; |
2185 | INIT_LIST_HEAD(&owait.wait.task_list); | ||
2186 | need_to_kill = true; | ||
2187 | mem_cgroup_mark_under_oom(memcg); | ||
2188 | 2033 | ||
2189 | /* At first, try to OOM lock hierarchy under memcg.*/ | ||
2190 | spin_lock(&memcg_oom_lock); | ||
2191 | locked = mem_cgroup_oom_lock(memcg); | ||
2192 | /* | 2034 | /* |
2193 | * Even if signal_pending(), we can't quit charge() loop without | 2035 | * As with any blocking lock, a contender needs to start |
2194 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | 2036 | * listening for wakeups before attempting the trylock, |
2195 | * under OOM is always welcomed, use TASK_KILLABLE here. | 2037 | * otherwise it can miss the wakeup from the unlock and sleep |
2038 | * indefinitely. This is just open-coded because our locking | ||
2039 | * is so particular to memcg hierarchies. | ||
2196 | */ | 2040 | */ |
2197 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | 2041 | wakeups = atomic_read(&memcg->oom_wakeups); |
2198 | if (!locked || memcg->oom_kill_disable) | 2042 | mem_cgroup_mark_under_oom(memcg); |
2199 | need_to_kill = false; | 2043 | |
2044 | locked = mem_cgroup_oom_trylock(memcg); | ||
2045 | |||
2200 | if (locked) | 2046 | if (locked) |
2201 | mem_cgroup_oom_notify(memcg); | 2047 | mem_cgroup_oom_notify(memcg); |
2202 | spin_unlock(&memcg_oom_lock); | ||
2203 | 2048 | ||
2204 | if (need_to_kill) { | 2049 | if (locked && !memcg->oom_kill_disable) { |
2205 | finish_wait(&memcg_oom_waitq, &owait.wait); | 2050 | mem_cgroup_unmark_under_oom(memcg); |
2206 | mem_cgroup_out_of_memory(memcg, mask, order); | 2051 | mem_cgroup_out_of_memory(memcg, mask, order); |
2052 | mem_cgroup_oom_unlock(memcg); | ||
2053 | /* | ||
2054 | * There is no guarantee that an OOM-lock contender | ||
2055 | * sees the wakeups triggered by the OOM kill | ||
2056 | * uncharges. Wake any sleepers explicitely. | ||
2057 | */ | ||
2058 | memcg_oom_recover(memcg); | ||
2207 | } else { | 2059 | } else { |
2208 | schedule(); | 2060 | /* |
2209 | finish_wait(&memcg_oom_waitq, &owait.wait); | 2061 | * A system call can just return -ENOMEM, but if this |
2062 | * is a page fault and somebody else is handling the | ||
2063 | * OOM already, we need to sleep on the OOM waitqueue | ||
2064 | * for this memcg until the situation is resolved. | ||
2065 | * Which can take some time because it might be | ||
2066 | * handled by a userspace task. | ||
2067 | * | ||
2068 | * However, this is the charge context, which means | ||
2069 | * that we may sit on a large call stack and hold | ||
2070 | * various filesystem locks, the mmap_sem etc. and we | ||
2071 | * don't want the OOM handler to deadlock on them | ||
2072 | * while we sit here and wait. Store the current OOM | ||
2073 | * context in the task_struct, then return -ENOMEM. | ||
2074 | * At the end of the page fault handler, with the | ||
2075 | * stack unwound, pagefault_out_of_memory() will check | ||
2076 | * back with us by calling | ||
2077 | * mem_cgroup_oom_synchronize(), possibly putting the | ||
2078 | * task to sleep. | ||
2079 | */ | ||
2080 | current->memcg_oom.oom_locked = locked; | ||
2081 | current->memcg_oom.wakeups = wakeups; | ||
2082 | css_get(&memcg->css); | ||
2083 | current->memcg_oom.wait_on_memcg = memcg; | ||
2210 | } | 2084 | } |
2211 | spin_lock(&memcg_oom_lock); | 2085 | } |
2212 | if (locked) | ||
2213 | mem_cgroup_oom_unlock(memcg); | ||
2214 | memcg_wakeup_oom(memcg); | ||
2215 | spin_unlock(&memcg_oom_lock); | ||
2216 | 2086 | ||
2217 | mem_cgroup_unmark_under_oom(memcg); | 2087 | /** |
2088 | * mem_cgroup_oom_synchronize - complete memcg OOM handling | ||
2089 | * | ||
2090 | * This has to be called at the end of a page fault if the the memcg | ||
2091 | * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. | ||
2092 | * | ||
2093 | * Memcg supports userspace OOM handling, so failed allocations must | ||
2094 | * sleep on a waitqueue until the userspace task resolves the | ||
2095 | * situation. Sleeping directly in the charge context with all kinds | ||
2096 | * of locks held is not a good idea, instead we remember an OOM state | ||
2097 | * in the task and mem_cgroup_oom_synchronize() has to be called at | ||
2098 | * the end of the page fault to put the task to sleep and clean up the | ||
2099 | * OOM state. | ||
2100 | * | ||
2101 | * Returns %true if an ongoing memcg OOM situation was detected and | ||
2102 | * finalized, %false otherwise. | ||
2103 | */ | ||
2104 | bool mem_cgroup_oom_synchronize(void) | ||
2105 | { | ||
2106 | struct oom_wait_info owait; | ||
2107 | struct mem_cgroup *memcg; | ||
2218 | 2108 | ||
2219 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | 2109 | /* OOM is global, do not handle */ |
2110 | if (!current->memcg_oom.in_memcg_oom) | ||
2220 | return false; | 2111 | return false; |
2221 | /* Give chance to dying process */ | 2112 | |
2222 | schedule_timeout_uninterruptible(1); | 2113 | /* |
2114 | * We invoked the OOM killer but there is a chance that a kill | ||
2115 | * did not free up any charges. Everybody else might already | ||
2116 | * be sleeping, so restart the fault and keep the rampage | ||
2117 | * going until some charges are released. | ||
2118 | */ | ||
2119 | memcg = current->memcg_oom.wait_on_memcg; | ||
2120 | if (!memcg) | ||
2121 | goto out; | ||
2122 | |||
2123 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | ||
2124 | goto out_memcg; | ||
2125 | |||
2126 | owait.memcg = memcg; | ||
2127 | owait.wait.flags = 0; | ||
2128 | owait.wait.func = memcg_oom_wake_function; | ||
2129 | owait.wait.private = current; | ||
2130 | INIT_LIST_HEAD(&owait.wait.task_list); | ||
2131 | |||
2132 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | ||
2133 | /* Only sleep if we didn't miss any wakeups since OOM */ | ||
2134 | if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) | ||
2135 | schedule(); | ||
2136 | finish_wait(&memcg_oom_waitq, &owait.wait); | ||
2137 | out_memcg: | ||
2138 | mem_cgroup_unmark_under_oom(memcg); | ||
2139 | if (current->memcg_oom.oom_locked) { | ||
2140 | mem_cgroup_oom_unlock(memcg); | ||
2141 | /* | ||
2142 | * There is no guarantee that an OOM-lock contender | ||
2143 | * sees the wakeups triggered by the OOM kill | ||
2144 | * uncharges. Wake any sleepers explicitely. | ||
2145 | */ | ||
2146 | memcg_oom_recover(memcg); | ||
2147 | } | ||
2148 | css_put(&memcg->css); | ||
2149 | current->memcg_oom.wait_on_memcg = NULL; | ||
2150 | out: | ||
2151 | current->memcg_oom.in_memcg_oom = 0; | ||
2223 | return true; | 2152 | return true; |
2224 | } | 2153 | } |
2225 | 2154 | ||
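The charge path now only records the OOM situation in current->memcg_oom and returns -ENOMEM; any sleeping is deferred until the fault stack has unwound. The counterpart call to mem_cgroup_oom_synchronize() is added outside this excerpt (mm/oom_kill.c in the diffstat); a minimal sketch of the intended fault-exit hook, offered as an assumption:

```c
/* Sketch of the fault-exit side; the real hook is in mm/oom_kill.c. */
void pagefault_out_of_memory(void)
{
	/*
	 * If a memcg OOM was recorded while charging, finish it here:
	 * with the stack unwound and no fs locks or mmap_sem held, it
	 * is now safe to sleep on the memcg's OOM waitqueue until a
	 * kernel or userspace OOM handler resolves the situation.
	 */
	if (mem_cgroup_oom_synchronize())
		return;

	/* ... otherwise fall back to the global OOM killer ... */
}
```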
@@ -2288,7 +2217,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) | |||
2288 | } | 2217 | } |
2289 | 2218 | ||
2290 | void mem_cgroup_update_page_stat(struct page *page, | 2219 | void mem_cgroup_update_page_stat(struct page *page, |
2291 | enum mem_cgroup_page_stat_item idx, int val) | 2220 | enum mem_cgroup_stat_index idx, int val) |
2292 | { | 2221 | { |
2293 | struct mem_cgroup *memcg; | 2222 | struct mem_cgroup *memcg; |
2294 | struct page_cgroup *pc = lookup_page_cgroup(page); | 2223 | struct page_cgroup *pc = lookup_page_cgroup(page); |
@@ -2297,18 +2226,11 @@ void mem_cgroup_update_page_stat(struct page *page, | |||
2297 | if (mem_cgroup_disabled()) | 2226 | if (mem_cgroup_disabled()) |
2298 | return; | 2227 | return; |
2299 | 2228 | ||
2229 | VM_BUG_ON(!rcu_read_lock_held()); | ||
2300 | memcg = pc->mem_cgroup; | 2230 | memcg = pc->mem_cgroup; |
2301 | if (unlikely(!memcg || !PageCgroupUsed(pc))) | 2231 | if (unlikely(!memcg || !PageCgroupUsed(pc))) |
2302 | return; | 2232 | return; |
2303 | 2233 | ||
2304 | switch (idx) { | ||
2305 | case MEMCG_NR_FILE_MAPPED: | ||
2306 | idx = MEM_CGROUP_STAT_FILE_MAPPED; | ||
2307 | break; | ||
2308 | default: | ||
2309 | BUG(); | ||
2310 | } | ||
2311 | |||
2312 | this_cpu_add(memcg->stat->count[idx], val); | 2234 | this_cpu_add(memcg->stat->count[idx], val); |
2313 | } | 2235 | } |
2314 | 2236 | ||
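With the switch()/BUG() translation gone and VM_BUG_ON(!rcu_read_lock_held()) added, mem_cgroup_update_page_stat() now simply trusts its callers to hold the page-stat lock. The expected pattern is the one page_add_file_rmap() uses further down in this patch:

	bool locked;
	unsigned long flags;

	mem_cgroup_begin_update_page_stat(page, &locked, &flags);
	if (atomic_inc_and_test(&page->_mapcount)) {
		__inc_zone_page_state(page, NR_FILE_MAPPED);
		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
	}
	mem_cgroup_end_update_page_stat(page, &locked, &flags);

mem_cgroup_begin_update_page_stat() takes the RCU read lock itself, which is what satisfies the new assertion.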
@@ -2450,7 +2372,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) | |||
2450 | flush_work(&stock->work); | 2372 | flush_work(&stock->work); |
2451 | } | 2373 | } |
2452 | out: | 2374 | out: |
2453 | put_online_cpus(); | 2375 | put_online_cpus(); |
2454 | } | 2376 | } |
2455 | 2377 | ||
2456 | /* | 2378 | /* |
@@ -2532,12 +2454,11 @@ enum { | |||
2532 | CHARGE_RETRY, /* need to retry but retry is not bad */ | 2454 | CHARGE_RETRY, /* need to retry but retry is not bad */ |
2533 | CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ | 2455 | CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ |
2534 | CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and not enough res. */ | 2456 | CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and not enough res. */
2535 | CHARGE_OOM_DIE, /* the current is killed because of OOM */ | ||
2536 | }; | 2457 | }; |
2537 | 2458 | ||
2538 | static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | 2459 | static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, |
2539 | unsigned int nr_pages, unsigned int min_pages, | 2460 | unsigned int nr_pages, unsigned int min_pages, |
2540 | bool oom_check) | 2461 | bool invoke_oom) |
2541 | { | 2462 | { |
2542 | unsigned long csize = nr_pages * PAGE_SIZE; | 2463 | unsigned long csize = nr_pages * PAGE_SIZE; |
2543 | struct mem_cgroup *mem_over_limit; | 2464 | struct mem_cgroup *mem_over_limit; |
@@ -2594,14 +2515,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2594 | if (mem_cgroup_wait_acct_move(mem_over_limit)) | 2515 | if (mem_cgroup_wait_acct_move(mem_over_limit)) |
2595 | return CHARGE_RETRY; | 2516 | return CHARGE_RETRY; |
2596 | 2517 | ||
2597 | /* If we don't need to call oom-killer at all, return immediately */ | 2518 | if (invoke_oom)
2598 | if (!oom_check) | 2519 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); |
2599 | return CHARGE_NOMEM; | ||
2600 | /* check OOM */ | ||
2601 | if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) | ||
2602 | return CHARGE_OOM_DIE; | ||
2603 | 2520 | ||
2604 | return CHARGE_RETRY; | 2521 | return CHARGE_NOMEM; |
2605 | } | 2522 | } |
2606 | 2523 | ||
2607 | /* | 2524 | /* |
@@ -2704,7 +2621,7 @@ again: | |||
2704 | } | 2621 | } |
2705 | 2622 | ||
2706 | do { | 2623 | do { |
2707 | bool oom_check; | 2624 | bool invoke_oom = oom && !nr_oom_retries; |
2708 | 2625 | ||
2709 | /* If killed, bypass charge */ | 2626 | /* If killed, bypass charge */ |
2710 | if (fatal_signal_pending(current)) { | 2627 | if (fatal_signal_pending(current)) { |
@@ -2712,14 +2629,8 @@ again: | |||
2712 | goto bypass; | 2629 | goto bypass; |
2713 | } | 2630 | } |
2714 | 2631 | ||
2715 | oom_check = false; | 2632 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, |
2716 | if (oom && !nr_oom_retries) { | 2633 | nr_pages, invoke_oom); |
2717 | oom_check = true; | ||
2718 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
2719 | } | ||
2720 | |||
2721 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, | ||
2722 | oom_check); | ||
2723 | switch (ret) { | 2634 | switch (ret) { |
2724 | case CHARGE_OK: | 2635 | case CHARGE_OK: |
2725 | break; | 2636 | break; |
@@ -2732,16 +2643,12 @@ again: | |||
2732 | css_put(&memcg->css); | 2643 | css_put(&memcg->css); |
2733 | goto nomem; | 2644 | goto nomem; |
2734 | case CHARGE_NOMEM: /* OOM routine works */ | 2645 | case CHARGE_NOMEM: /* OOM routine works */ |
2735 | if (!oom) { | 2646 | if (!oom || invoke_oom) { |
2736 | css_put(&memcg->css); | 2647 | css_put(&memcg->css); |
2737 | goto nomem; | 2648 | goto nomem; |
2738 | } | 2649 | } |
2739 | /* If oom, we never return -ENOMEM */ | ||
2740 | nr_oom_retries--; | 2650 | nr_oom_retries--; |
2741 | break; | 2651 | break; |
2742 | case CHARGE_OOM_DIE: /* Killed by OOM Killer */ | ||
2743 | css_put(&memcg->css); | ||
2744 | goto bypass; | ||
2745 | } | 2652 | } |
2746 | } while (ret != CHARGE_OK); | 2653 | } while (ret != CHARGE_OK); |
2747 | 2654 | ||
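The reworked retry loop is easier to follow in isolation: the OOM killer is invoked at most once, on the final retry, and the charge then fails with CHARGE_NOMEM rather than spinning on the removed CHARGE_OOM_DIE state. A small standalone simulation of that control flow (local names only, not kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	enum { CHARGE_OK, CHARGE_RETRY, CHARGE_NOMEM };

	/* Stand-in for mem_cgroup_do_charge(): fails while *free_pages is zero. */
	static int do_charge(bool invoke_oom, int *free_pages)
	{
		if (*free_pages > 0) {
			(*free_pages)--;
			return CHARGE_OK;
		}
		if (invoke_oom)
			printf("mem_cgroup_oom(): OOM context recorded, charge still fails\n");
		return CHARGE_NOMEM;
	}

	int main(void)
	{
		int free_pages = 0, nr_oom_retries = 2, ret;
		bool oom = true;

		do {
			bool invoke_oom = oom && !nr_oom_retries;

			ret = do_charge(invoke_oom, &free_pages);
			if (ret == CHARGE_NOMEM) {
				if (!oom || invoke_oom)
					break;		/* give up: -ENOMEM, OOM already recorded */
				nr_oom_retries--;	/* retry a few times before invoking OOM */
			}
		} while (ret != CHARGE_OK);

		printf("final: %s\n", ret == CHARGE_OK ? "charged" : "failed");
		return 0;
	}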
@@ -2882,7 +2789,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2882 | * is accessed after testing USED bit. To make pc->mem_cgroup visible | 2789 | * is accessed after testing USED bit. To make pc->mem_cgroup visible |
2883 | * before USED bit, we need memory barrier here. | 2790 | * before USED bit, we need memory barrier here. |
2884 | * See mem_cgroup_add_lru_list(), etc. | 2791 | * See mem_cgroup_add_lru_list(), etc. |
2885 | */ | 2792 | */ |
2886 | smp_wmb(); | 2793 | smp_wmb(); |
2887 | SetPageCgroupUsed(pc); | 2794 | SetPageCgroupUsed(pc); |
2888 | 2795 | ||
@@ -2905,9 +2812,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2905 | unlock_page_cgroup(pc); | 2812 | unlock_page_cgroup(pc); |
2906 | 2813 | ||
2907 | /* | 2814 | /* |
2908 | * "charge_statistics" updated event counter. Then, check it. | 2815 | * "charge_statistics" updated event counter. |
2909 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
2910 | * if they exceeds softlimit. | ||
2911 | */ | 2816 | */ |
2912 | memcg_check_events(memcg, page); | 2817 | memcg_check_events(memcg, page); |
2913 | } | 2818 | } |
@@ -3626,9 +3531,9 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
3626 | * the page allocator. Therefore, the following sequence when backed by | 3531 | * the page allocator. Therefore, the following sequence when backed by |
3627 | * the SLUB allocator: | 3532 | * the SLUB allocator: |
3628 | * | 3533 | * |
3629 | * memcg_stop_kmem_account(); | 3534 | * memcg_stop_kmem_account(); |
3630 | * kmalloc(<large_number>) | 3535 | * kmalloc(<large_number>) |
3631 | * memcg_resume_kmem_account(); | 3536 | * memcg_resume_kmem_account(); |
3632 | * | 3537 | * |
3633 | * would effectively ignore the fact that we should skip accounting, | 3538 | * would effectively ignore the fact that we should skip accounting, |
3634 | * since it will drive us directly to this function without passing | 3539 | * since it will drive us directly to this function without passing |
@@ -3750,6 +3655,20 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
3750 | } | 3655 | } |
3751 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 3656 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
3752 | 3657 | ||
3658 | static inline | ||
3659 | void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, | ||
3660 | struct mem_cgroup *to, | ||
3661 | unsigned int nr_pages, | ||
3662 | enum mem_cgroup_stat_index idx) | ||
3663 | { | ||
3664 | /* Update stat data for mem_cgroup */ | ||
3665 | preempt_disable(); | ||
3666 | WARN_ON_ONCE(from->stat->count[idx] < nr_pages); | ||
3667 | __this_cpu_add(from->stat->count[idx], -nr_pages); | ||
3668 | __this_cpu_add(to->stat->count[idx], nr_pages); | ||
3669 | preempt_enable(); | ||
3670 | } | ||
3671 | |||
3753 | /** | 3672 | /** |
3754 | * mem_cgroup_move_account - move account of the page | 3673 | * mem_cgroup_move_account - move account of the page |
3755 | * @page: the page | 3674 | * @page: the page |
@@ -3795,13 +3714,14 @@ static int mem_cgroup_move_account(struct page *page, | |||
3795 | 3714 | ||
3796 | move_lock_mem_cgroup(from, &flags); | 3715 | move_lock_mem_cgroup(from, &flags); |
3797 | 3716 | ||
3798 | if (!anon && page_mapped(page)) { | 3717 | if (!anon && page_mapped(page)) |
3799 | /* Update mapped_file data for mem_cgroup */ | 3718 | mem_cgroup_move_account_page_stat(from, to, nr_pages, |
3800 | preempt_disable(); | 3719 | MEM_CGROUP_STAT_FILE_MAPPED); |
3801 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 3720 | |
3802 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 3721 | if (PageWriteback(page)) |
3803 | preempt_enable(); | 3722 | mem_cgroup_move_account_page_stat(from, to, nr_pages, |
3804 | } | 3723 | MEM_CGROUP_STAT_WRITEBACK); |
3724 | |||
3805 | mem_cgroup_charge_statistics(from, page, anon, -nr_pages); | 3725 | mem_cgroup_charge_statistics(from, page, anon, -nr_pages); |
3806 | 3726 | ||
3807 | /* caller should have done css_get */ | 3727 | /* caller should have done css_get */ |
@@ -4657,7 +4577,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
4657 | MEM_CGROUP_RECLAIM_SHRINK); | 4577 | MEM_CGROUP_RECLAIM_SHRINK); |
4658 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 4578 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
4659 | /* Usage is reduced ? */ | 4579 | /* Usage is reduced ? */ |
4660 | if (curusage >= oldusage) | 4580 | if (curusage >= oldusage) |
4661 | retry_count--; | 4581 | retry_count--; |
4662 | else | 4582 | else |
4663 | oldusage = curusage; | 4583 | oldusage = curusage; |
@@ -4678,7 +4598,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
4678 | int enlarge = 0; | 4598 | int enlarge = 0; |
4679 | 4599 | ||
4680 | /* see mem_cgroup_resize_res_limit */ | 4600 | /* see mem_cgroup_resize_res_limit */ |
4681 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; | 4601 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; |
4682 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 4602 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
4683 | while (retry_count) { | 4603 | while (retry_count) { |
4684 | if (signal_pending(current)) { | 4604 | if (signal_pending(current)) { |
@@ -4727,98 +4647,6 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
4727 | return ret; | 4647 | return ret; |
4728 | } | 4648 | } |
4729 | 4649 | ||
4730 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | ||
4731 | gfp_t gfp_mask, | ||
4732 | unsigned long *total_scanned) | ||
4733 | { | ||
4734 | unsigned long nr_reclaimed = 0; | ||
4735 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | ||
4736 | unsigned long reclaimed; | ||
4737 | int loop = 0; | ||
4738 | struct mem_cgroup_tree_per_zone *mctz; | ||
4739 | unsigned long long excess; | ||
4740 | unsigned long nr_scanned; | ||
4741 | |||
4742 | if (order > 0) | ||
4743 | return 0; | ||
4744 | |||
4745 | mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); | ||
4746 | /* | ||
4747 | * This loop can run a while, specially if mem_cgroup's continuously | ||
4748 | * keep exceeding their soft limit and putting the system under | ||
4749 | * pressure | ||
4750 | */ | ||
4751 | do { | ||
4752 | if (next_mz) | ||
4753 | mz = next_mz; | ||
4754 | else | ||
4755 | mz = mem_cgroup_largest_soft_limit_node(mctz); | ||
4756 | if (!mz) | ||
4757 | break; | ||
4758 | |||
4759 | nr_scanned = 0; | ||
4760 | reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, | ||
4761 | gfp_mask, &nr_scanned); | ||
4762 | nr_reclaimed += reclaimed; | ||
4763 | *total_scanned += nr_scanned; | ||
4764 | spin_lock(&mctz->lock); | ||
4765 | |||
4766 | /* | ||
4767 | * If we failed to reclaim anything from this memory cgroup | ||
4768 | * it is time to move on to the next cgroup | ||
4769 | */ | ||
4770 | next_mz = NULL; | ||
4771 | if (!reclaimed) { | ||
4772 | do { | ||
4773 | /* | ||
4774 | * Loop until we find yet another one. | ||
4775 | * | ||
4776 | * By the time we get the soft_limit lock | ||
4777 |  * again, someone might have added the | ||
4778 | * group back on the RB tree. Iterate to | ||
4779 | * make sure we get a different mem. | ||
4780 | * mem_cgroup_largest_soft_limit_node returns | ||
4781 | * NULL if no other cgroup is present on | ||
4782 | * the tree | ||
4783 | */ | ||
4784 | next_mz = | ||
4785 | __mem_cgroup_largest_soft_limit_node(mctz); | ||
4786 | if (next_mz == mz) | ||
4787 | css_put(&next_mz->memcg->css); | ||
4788 | else /* next_mz == NULL or other memcg */ | ||
4789 | break; | ||
4790 | } while (1); | ||
4791 | } | ||
4792 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); | ||
4793 | excess = res_counter_soft_limit_excess(&mz->memcg->res); | ||
4794 | /* | ||
4795 | * One school of thought says that we should not add | ||
4796 | * back the node to the tree if reclaim returns 0. | ||
4797 | * But our reclaim could return 0, simply because due | ||
4798 | * to priority we are exposing a smaller subset of | ||
4799 | * memory to reclaim from. Consider this as a longer | ||
4800 | * term TODO. | ||
4801 | */ | ||
4802 | /* If excess == 0, no tree ops */ | ||
4803 | __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); | ||
4804 | spin_unlock(&mctz->lock); | ||
4805 | css_put(&mz->memcg->css); | ||
4806 | loop++; | ||
4807 | /* | ||
4808 | * Could not reclaim anything and there are no more | ||
4809 | * mem cgroups to try or we seem to be looping without | ||
4810 | * reclaiming anything. | ||
4811 | */ | ||
4812 | if (!nr_reclaimed && | ||
4813 | (next_mz == NULL || | ||
4814 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) | ||
4815 | break; | ||
4816 | } while (!nr_reclaimed); | ||
4817 | if (next_mz) | ||
4818 | css_put(&next_mz->memcg->css); | ||
4819 | return nr_reclaimed; | ||
4820 | } | ||
4821 | |||
4822 | /** | 4650 | /** |
4823 | * mem_cgroup_force_empty_list - clears LRU of a group | 4651 | * mem_cgroup_force_empty_list - clears LRU of a group |
4824 | * @memcg: group to clear | 4652 | * @memcg: group to clear |
@@ -4990,18 +4818,12 @@ static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, | |||
4990 | unsigned int event) | 4818 | unsigned int event) |
4991 | { | 4819 | { |
4992 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4820 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
4993 | int ret; | ||
4994 | 4821 | ||
4995 | if (mem_cgroup_is_root(memcg)) | 4822 | if (mem_cgroup_is_root(memcg)) |
4996 | return -EINVAL; | 4823 | return -EINVAL; |
4997 | css_get(&memcg->css); | 4824 | return mem_cgroup_force_empty(memcg); |
4998 | ret = mem_cgroup_force_empty(memcg); | ||
4999 | css_put(&memcg->css); | ||
5000 | |||
5001 | return ret; | ||
5002 | } | 4825 | } |
5003 | 4826 | ||
5004 | |||
5005 | static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, | 4827 | static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, |
5006 | struct cftype *cft) | 4828 | struct cftype *cft) |
5007 | { | 4829 | { |
@@ -5139,7 +4961,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) | |||
5139 | */ | 4961 | */ |
5140 | mutex_lock(&memcg_create_mutex); | 4962 | mutex_lock(&memcg_create_mutex); |
5141 | mutex_lock(&set_limit_mutex); | 4963 | mutex_lock(&set_limit_mutex); |
5142 | if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { | 4964 | if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { |
5143 | if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { | 4965 | if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { |
5144 | ret = -EBUSY; | 4966 | ret = -EBUSY; |
5145 | goto out; | 4967 | goto out; |
@@ -5149,7 +4971,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) | |||
5149 | 4971 | ||
5150 | ret = memcg_update_cache_sizes(memcg); | 4972 | ret = memcg_update_cache_sizes(memcg); |
5151 | if (ret) { | 4973 | if (ret) { |
5152 | res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); | 4974 | res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); |
5153 | goto out; | 4975 | goto out; |
5154 | } | 4976 | } |
5155 | static_key_slow_inc(&memcg_kmem_enabled_key); | 4977 | static_key_slow_inc(&memcg_kmem_enabled_key); |
@@ -6089,8 +5911,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
6089 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 5911 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
6090 | mz = &pn->zoneinfo[zone]; | 5912 | mz = &pn->zoneinfo[zone]; |
6091 | lruvec_init(&mz->lruvec); | 5913 | lruvec_init(&mz->lruvec); |
6092 | mz->usage_in_excess = 0; | ||
6093 | mz->on_tree = false; | ||
6094 | mz->memcg = memcg; | 5914 | mz->memcg = memcg; |
6095 | } | 5915 | } |
6096 | memcg->nodeinfo[node] = pn; | 5916 | memcg->nodeinfo[node] = pn; |
@@ -6146,7 +5966,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
6146 | int node; | 5966 | int node; |
6147 | size_t size = memcg_size(); | 5967 | size_t size = memcg_size(); |
6148 | 5968 | ||
6149 | mem_cgroup_remove_from_trees(memcg); | ||
6150 | free_css_id(&mem_cgroup_subsys, &memcg->css); | 5969 | free_css_id(&mem_cgroup_subsys, &memcg->css); |
6151 | 5970 | ||
6152 | for_each_node(node) | 5971 | for_each_node(node) |
@@ -6183,29 +6002,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
6183 | } | 6002 | } |
6184 | EXPORT_SYMBOL(parent_mem_cgroup); | 6003 | EXPORT_SYMBOL(parent_mem_cgroup); |
6185 | 6004 | ||
6186 | static void __init mem_cgroup_soft_limit_tree_init(void) | ||
6187 | { | ||
6188 | struct mem_cgroup_tree_per_node *rtpn; | ||
6189 | struct mem_cgroup_tree_per_zone *rtpz; | ||
6190 | int tmp, node, zone; | ||
6191 | |||
6192 | for_each_node(node) { | ||
6193 | tmp = node; | ||
6194 | if (!node_state(node, N_NORMAL_MEMORY)) | ||
6195 | tmp = -1; | ||
6196 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | ||
6197 | BUG_ON(!rtpn); | ||
6198 | |||
6199 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
6200 | |||
6201 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
6202 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
6203 | rtpz->rb_root = RB_ROOT; | ||
6204 | spin_lock_init(&rtpz->lock); | ||
6205 | } | ||
6206 | } | ||
6207 | } | ||
6208 | |||
6209 | static struct cgroup_subsys_state * __ref | 6005 | static struct cgroup_subsys_state * __ref |
6210 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | 6006 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) |
6211 | { | 6007 | { |
@@ -6235,6 +6031,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
6235 | mutex_init(&memcg->thresholds_lock); | 6031 | mutex_init(&memcg->thresholds_lock); |
6236 | spin_lock_init(&memcg->move_lock); | 6032 | spin_lock_init(&memcg->move_lock); |
6237 | vmpressure_init(&memcg->vmpressure); | 6033 | vmpressure_init(&memcg->vmpressure); |
6034 | spin_lock_init(&memcg->soft_lock); | ||
6238 | 6035 | ||
6239 | return &memcg->css; | 6036 | return &memcg->css; |
6240 | 6037 | ||
@@ -6312,6 +6109,13 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
6312 | 6109 | ||
6313 | mem_cgroup_invalidate_reclaim_iterators(memcg); | 6110 | mem_cgroup_invalidate_reclaim_iterators(memcg); |
6314 | mem_cgroup_reparent_charges(memcg); | 6111 | mem_cgroup_reparent_charges(memcg); |
6112 | if (memcg->soft_contributed) { | ||
6113 | while ((memcg = parent_mem_cgroup(memcg))) | ||
6114 | atomic_dec(&memcg->children_in_excess); | ||
6115 | |||
6116 | if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy) | ||
6117 | atomic_dec(&root_mem_cgroup->children_in_excess); | ||
6118 | } | ||
6315 | mem_cgroup_destroy_all_caches(memcg); | 6119 | mem_cgroup_destroy_all_caches(memcg); |
6316 | vmpressure_cleanup(&memcg->vmpressure); | 6120 | vmpressure_cleanup(&memcg->vmpressure); |
6317 | } | 6121 | } |
@@ -6986,7 +6790,6 @@ static int __init mem_cgroup_init(void) | |||
6986 | { | 6790 | { |
6987 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | 6791 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); |
6988 | enable_swap_cgroup(); | 6792 | enable_swap_cgroup(); |
6989 | mem_cgroup_soft_limit_tree_init(); | ||
6990 | memcg_stock_init(); | 6793 | memcg_stock_init(); |
6991 | return 0; | 6794 | return 0; |
6992 | } | 6795 | } |
diff --git a/mm/memory.c b/mm/memory.c index 2b73dbde2274..ca0003947115 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -3695,7 +3695,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3695 | * but allow concurrent faults), and pte mapped but not yet locked. | 3695 | * but allow concurrent faults), and pte mapped but not yet locked. |
3696 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 3696 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
3697 | */ | 3697 | */ |
3698 | int handle_pte_fault(struct mm_struct *mm, | 3698 | static int handle_pte_fault(struct mm_struct *mm, |
3699 | struct vm_area_struct *vma, unsigned long address, | 3699 | struct vm_area_struct *vma, unsigned long address, |
3700 | pte_t *pte, pmd_t *pmd, unsigned int flags) | 3700 | pte_t *pte, pmd_t *pmd, unsigned int flags) |
3701 | { | 3701 | { |
@@ -3754,22 +3754,14 @@ unlock: | |||
3754 | /* | 3754 | /* |
3755 | * By the time we get here, we already hold the mm semaphore | 3755 | * By the time we get here, we already hold the mm semaphore |
3756 | */ | 3756 | */ |
3757 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3757 | static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
3758 | unsigned long address, unsigned int flags) | 3758 | unsigned long address, unsigned int flags) |
3759 | { | 3759 | { |
3760 | pgd_t *pgd; | 3760 | pgd_t *pgd; |
3761 | pud_t *pud; | 3761 | pud_t *pud; |
3762 | pmd_t *pmd; | 3762 | pmd_t *pmd; |
3763 | pte_t *pte; | 3763 | pte_t *pte; |
3764 | 3764 | ||
3765 | __set_current_state(TASK_RUNNING); | ||
3766 | |||
3767 | count_vm_event(PGFAULT); | ||
3768 | mem_cgroup_count_vm_event(mm, PGFAULT); | ||
3769 | |||
3770 | /* do counter updates before entering really critical section. */ | ||
3771 | check_sync_rss_stat(current); | ||
3772 | |||
3773 | if (unlikely(is_vm_hugetlb_page(vma))) | 3765 | if (unlikely(is_vm_hugetlb_page(vma))) |
3774 | return hugetlb_fault(mm, vma, address, flags); | 3766 | return hugetlb_fault(mm, vma, address, flags); |
3775 | 3767 | ||
@@ -3782,9 +3774,12 @@ retry: | |||
3782 | if (!pmd) | 3774 | if (!pmd) |
3783 | return VM_FAULT_OOM; | 3775 | return VM_FAULT_OOM; |
3784 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { | 3776 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { |
3777 | int ret = VM_FAULT_FALLBACK; | ||
3785 | if (!vma->vm_ops) | 3778 | if (!vma->vm_ops) |
3786 | return do_huge_pmd_anonymous_page(mm, vma, address, | 3779 | ret = do_huge_pmd_anonymous_page(mm, vma, address, |
3787 | pmd, flags); | 3780 | pmd, flags); |
3781 | if (!(ret & VM_FAULT_FALLBACK)) | ||
3782 | return ret; | ||
3788 | } else { | 3783 | } else { |
3789 | pmd_t orig_pmd = *pmd; | 3784 | pmd_t orig_pmd = *pmd; |
3790 | int ret; | 3785 | int ret; |
@@ -3850,6 +3845,37 @@ retry: | |||
3850 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | 3845 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); |
3851 | } | 3846 | } |
3852 | 3847 | ||
3848 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3849 | unsigned long address, unsigned int flags) | ||
3850 | { | ||
3851 | int ret; | ||
3852 | |||
3853 | __set_current_state(TASK_RUNNING); | ||
3854 | |||
3855 | count_vm_event(PGFAULT); | ||
3856 | mem_cgroup_count_vm_event(mm, PGFAULT); | ||
3857 | |||
3858 | /* do counter updates before entering really critical section. */ | ||
3859 | check_sync_rss_stat(current); | ||
3860 | |||
3861 | /* | ||
3862 | * Enable the memcg OOM handling for faults triggered in user | ||
3863 | * space. Kernel faults are handled more gracefully. | ||
3864 | */ | ||
3865 | if (flags & FAULT_FLAG_USER) | ||
3866 | mem_cgroup_enable_oom(); | ||
3867 | |||
3868 | ret = __handle_mm_fault(mm, vma, address, flags); | ||
3869 | |||
3870 | if (flags & FAULT_FLAG_USER) | ||
3871 | mem_cgroup_disable_oom(); | ||
3872 | |||
3873 | if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) | ||
3874 | mem_cgroup_oom_synchronize(); | ||
3875 | |||
3876 | return ret; | ||
3877 | } | ||
3878 | |||
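The wrapper above leans on three helpers added elsewhere in this series: mem_cgroup_enable_oom(), mem_cgroup_disable_oom() and task_in_memcg_oom(). A hedged, kernel-context sketch of what they amount to, assuming a may_oom gate sits next to the memcg_oom state seen in the memcontrol.c hunks (may_oom is an assumed field name, and the in-tree helpers may differ):

	static inline void mem_cgroup_enable_oom(void)
	{
		WARN_ON(current->memcg_oom.may_oom);	/* may_oom: assumed gate flag */
		current->memcg_oom.may_oom = 1;
	}

	static inline void mem_cgroup_disable_oom(void)
	{
		WARN_ON(!current->memcg_oom.may_oom);
		current->memcg_oom.may_oom = 0;
	}

	static inline bool task_in_memcg_oom(struct task_struct *p)
	{
		return p->memcg_oom.in_memcg_oom;
	}

The point of the gate is that only userspace faults (FAULT_FLAG_USER) ever enter the memcg OOM path; kernel faults keep failing the allocation and unwind normally.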
3853 | #ifndef __PAGETABLE_PUD_FOLDED | 3879 | #ifndef __PAGETABLE_PUD_FOLDED |
3854 | /* | 3880 | /* |
3855 | * Allocate page upper directory. | 3881 | * Allocate page upper directory. |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 98e75f2ac7bc..314e9d274381 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -678,9 +678,12 @@ out: | |||
678 | */ | 678 | */ |
679 | void pagefault_out_of_memory(void) | 679 | void pagefault_out_of_memory(void) |
680 | { | 680 | { |
681 | struct zonelist *zonelist = node_zonelist(first_online_node, | 681 | struct zonelist *zonelist; |
682 | GFP_KERNEL); | ||
683 | 682 | ||
683 | if (mem_cgroup_oom_synchronize()) | ||
684 | return; | ||
685 | |||
686 | zonelist = node_zonelist(first_online_node, GFP_KERNEL); | ||
684 | if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { | 687 | if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { |
685 | out_of_memory(NULL, 0, 0, NULL, false); | 688 | out_of_memory(NULL, 0, 0, NULL, false); |
686 | clear_zonelist_oom(zonelist, GFP_KERNEL); | 689 | clear_zonelist_oom(zonelist, GFP_KERNEL); |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6c7b0187be8e..f5236f804aa6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -2143,11 +2143,17 @@ EXPORT_SYMBOL(account_page_dirtied); | |||
2143 | 2143 | ||
2144 | /* | 2144 | /* |
2145 | * Helper function for set_page_writeback family. | 2145 | * Helper function for set_page_writeback family. |
2146 | * | ||
2147 | * The caller must hold mem_cgroup_begin/end_update_page_stat() lock | ||
2148 | * while calling this function. | ||
2149 |  * See test_set_page_writeback() for an example. | ||
2150 | * | ||
2146 | * NOTE: Unlike account_page_dirtied this does not rely on being atomic | 2151 | * NOTE: Unlike account_page_dirtied this does not rely on being atomic |
2147 | * wrt interrupts. | 2152 | * wrt interrupts. |
2148 | */ | 2153 | */ |
2149 | void account_page_writeback(struct page *page) | 2154 | void account_page_writeback(struct page *page) |
2150 | { | 2155 | { |
2156 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); | ||
2151 | inc_zone_page_state(page, NR_WRITEBACK); | 2157 | inc_zone_page_state(page, NR_WRITEBACK); |
2152 | } | 2158 | } |
2153 | EXPORT_SYMBOL(account_page_writeback); | 2159 | EXPORT_SYMBOL(account_page_writeback); |
@@ -2364,7 +2370,10 @@ int test_clear_page_writeback(struct page *page) | |||
2364 | { | 2370 | { |
2365 | struct address_space *mapping = page_mapping(page); | 2371 | struct address_space *mapping = page_mapping(page); |
2366 | int ret; | 2372 | int ret; |
2373 | bool locked; | ||
2374 | unsigned long memcg_flags; | ||
2367 | 2375 | ||
2376 | mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); | ||
2368 | if (mapping) { | 2377 | if (mapping) { |
2369 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2378 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
2370 | unsigned long flags; | 2379 | unsigned long flags; |
@@ -2385,9 +2394,11 @@ int test_clear_page_writeback(struct page *page) | |||
2385 | ret = TestClearPageWriteback(page); | 2394 | ret = TestClearPageWriteback(page); |
2386 | } | 2395 | } |
2387 | if (ret) { | 2396 | if (ret) { |
2397 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); | ||
2388 | dec_zone_page_state(page, NR_WRITEBACK); | 2398 | dec_zone_page_state(page, NR_WRITEBACK); |
2389 | inc_zone_page_state(page, NR_WRITTEN); | 2399 | inc_zone_page_state(page, NR_WRITTEN); |
2390 | } | 2400 | } |
2401 | mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); | ||
2391 | return ret; | 2402 | return ret; |
2392 | } | 2403 | } |
2393 | 2404 | ||
@@ -2395,7 +2406,10 @@ int test_set_page_writeback(struct page *page) | |||
2395 | { | 2406 | { |
2396 | struct address_space *mapping = page_mapping(page); | 2407 | struct address_space *mapping = page_mapping(page); |
2397 | int ret; | 2408 | int ret; |
2409 | bool locked; | ||
2410 | unsigned long memcg_flags; | ||
2398 | 2411 | ||
2412 | mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); | ||
2399 | if (mapping) { | 2413 | if (mapping) { |
2400 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2414 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
2401 | unsigned long flags; | 2415 | unsigned long flags; |
@@ -2422,6 +2436,7 @@ int test_set_page_writeback(struct page *page) | |||
2422 | } | 2436 | } |
2423 | if (!ret) | 2437 | if (!ret) |
2424 | account_page_writeback(page); | 2438 | account_page_writeback(page); |
2439 | mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); | ||
2425 | return ret; | 2440 | return ret; |
2426 | 2441 | ||
2427 | } | 2442 | } |
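The MEM_CGROUP_STAT_WRITEBACK counter maintained here surfaces to userspace through a group's memory.stat file. Assuming the v1 memory controller is mounted at /sys/fs/cgroup/memory and the stat is exported as "writeback" (with a "total_writeback" hierarchical variant), a quick check could look like this:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* Mount point and field names are assumptions; adjust as needed. */
		FILE *f = fopen("/sys/fs/cgroup/memory/memory.stat", "r");
		char line[256];

		if (!f) {
			perror("memory.stat");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			if (!strncmp(line, "writeback ", 10) ||
			    !strncmp(line, "total_writeback ", 16))
				fputs(line, stdout);
		fclose(f);
		return 0;
	}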
@@ -1052,11 +1052,11 @@ void do_page_add_anon_rmap(struct page *page, | |||
1052 | { | 1052 | { |
1053 | int first = atomic_inc_and_test(&page->_mapcount); | 1053 | int first = atomic_inc_and_test(&page->_mapcount); |
1054 | if (first) { | 1054 | if (first) { |
1055 | if (!PageTransHuge(page)) | 1055 | if (PageTransHuge(page)) |
1056 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
1057 | else | ||
1058 | __inc_zone_page_state(page, | 1056 | __inc_zone_page_state(page, |
1059 | NR_ANON_TRANSPARENT_HUGEPAGES); | 1057 | NR_ANON_TRANSPARENT_HUGEPAGES); |
1058 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, | ||
1059 | hpage_nr_pages(page)); | ||
1060 | } | 1060 | } |
1061 | if (unlikely(PageKsm(page))) | 1061 | if (unlikely(PageKsm(page))) |
1062 | return; | 1062 | return; |
@@ -1085,10 +1085,10 @@ void page_add_new_anon_rmap(struct page *page, | |||
1085 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 1085 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
1086 | SetPageSwapBacked(page); | 1086 | SetPageSwapBacked(page); |
1087 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 1087 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
1088 | if (!PageTransHuge(page)) | 1088 | if (PageTransHuge(page)) |
1089 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
1090 | else | ||
1091 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1089 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
1090 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, | ||
1091 | hpage_nr_pages(page)); | ||
1092 | __page_set_anon_rmap(page, vma, address, 1); | 1092 | __page_set_anon_rmap(page, vma, address, 1); |
1093 | if (!mlocked_vma_newpage(vma, page)) { | 1093 | if (!mlocked_vma_newpage(vma, page)) { |
1094 | SetPageActive(page); | 1094 | SetPageActive(page); |
@@ -1111,7 +1111,7 @@ void page_add_file_rmap(struct page *page) | |||
1111 | mem_cgroup_begin_update_page_stat(page, &locked, &flags); | 1111 | mem_cgroup_begin_update_page_stat(page, &locked, &flags); |
1112 | if (atomic_inc_and_test(&page->_mapcount)) { | 1112 | if (atomic_inc_and_test(&page->_mapcount)) { |
1113 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1113 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
1114 | mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); | 1114 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); |
1115 | } | 1115 | } |
1116 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | 1116 | mem_cgroup_end_update_page_stat(page, &locked, &flags); |
1117 | } | 1117 | } |
@@ -1148,14 +1148,14 @@ void page_remove_rmap(struct page *page) | |||
1148 | goto out; | 1148 | goto out; |
1149 | if (anon) { | 1149 | if (anon) { |
1150 | mem_cgroup_uncharge_page(page); | 1150 | mem_cgroup_uncharge_page(page); |
1151 | if (!PageTransHuge(page)) | 1151 | if (PageTransHuge(page)) |
1152 | __dec_zone_page_state(page, NR_ANON_PAGES); | ||
1153 | else | ||
1154 | __dec_zone_page_state(page, | 1152 | __dec_zone_page_state(page, |
1155 | NR_ANON_TRANSPARENT_HUGEPAGES); | 1153 | NR_ANON_TRANSPARENT_HUGEPAGES); |
1154 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, | ||
1155 | -hpage_nr_pages(page)); | ||
1156 | } else { | 1156 | } else { |
1157 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 1157 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
1158 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); | 1158 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); |
1159 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | 1159 | mem_cgroup_end_update_page_stat(page, &locked, &flags); |
1160 | } | 1160 | } |
1161 | if (unlikely(PageMlocked(page))) | 1161 | if (unlikely(PageMlocked(page))) |
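The rmap hunks above stop counting a THP mapping as a single NR_ANON_PAGES event; the counter now moves by hpage_nr_pages(), i.e. by the number of base pages the mapping covers, so NR_ANON_PAGES tracks anonymous RSS in pages again. A standalone illustration of the arithmetic, assuming x86-64's 2MB huge pages (HPAGE_PMD_NR = 512):

	#include <stdio.h>

	#define HPAGE_PMD_ORDER	9			/* 2MB huge page on x86-64: assumption */
	#define HPAGE_PMD_NR	(1 << HPAGE_PMD_ORDER)

	static long hpage_nr_pages(int is_thp)
	{
		return is_thp ? HPAGE_PMD_NR : 1;
	}

	int main(void)
	{
		long nr_anon_pages = 0;

		nr_anon_pages += hpage_nr_pages(0);	/* mapping one 4kB anon page -> +1   */
		nr_anon_pages += hpage_nr_pages(1);	/* mapping one 2MB anon THP  -> +512 */
		printf("NR_ANON_PAGES = %ld\n", nr_anon_pages);
		return 0;
	}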
@@ -432,6 +432,11 @@ static void activate_page_drain(int cpu) | |||
432 | pagevec_lru_move_fn(pvec, __activate_page, NULL); | 432 | pagevec_lru_move_fn(pvec, __activate_page, NULL); |
433 | } | 433 | } |
434 | 434 | ||
435 | static bool need_activate_page_drain(int cpu) | ||
436 | { | ||
437 | return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; | ||
438 | } | ||
439 | |||
435 | void activate_page(struct page *page) | 440 | void activate_page(struct page *page) |
436 | { | 441 | { |
437 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | 442 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
@@ -449,6 +454,11 @@ static inline void activate_page_drain(int cpu) | |||
449 | { | 454 | { |
450 | } | 455 | } |
451 | 456 | ||
457 | static bool need_activate_page_drain(int cpu) | ||
458 | { | ||
459 | return false; | ||
460 | } | ||
461 | |||
452 | void activate_page(struct page *page) | 462 | void activate_page(struct page *page) |
453 | { | 463 | { |
454 | struct zone *zone = page_zone(page); | 464 | struct zone *zone = page_zone(page); |
@@ -701,12 +711,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) | |||
701 | lru_add_drain(); | 711 | lru_add_drain(); |
702 | } | 712 | } |
703 | 713 | ||
704 | /* | 714 | static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); |
705 | * Returns 0 for success | 715 | |
706 | */ | 716 | void lru_add_drain_all(void) |
707 | int lru_add_drain_all(void) | ||
708 | { | 717 | { |
709 | return schedule_on_each_cpu(lru_add_drain_per_cpu); | 718 | static DEFINE_MUTEX(lock); |
719 | static struct cpumask has_work; | ||
720 | int cpu; | ||
721 | |||
722 | mutex_lock(&lock); | ||
723 | get_online_cpus(); | ||
724 | cpumask_clear(&has_work); | ||
725 | |||
726 | for_each_online_cpu(cpu) { | ||
727 | struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); | ||
728 | |||
729 | if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || | ||
730 | pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || | ||
731 | pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || | ||
732 | need_activate_page_drain(cpu)) { | ||
733 | INIT_WORK(work, lru_add_drain_per_cpu); | ||
734 | schedule_work_on(cpu, work); | ||
735 | cpumask_set_cpu(cpu, &has_work); | ||
736 | } | ||
737 | } | ||
738 | |||
739 | for_each_cpu(cpu, &has_work) | ||
740 | flush_work(&per_cpu(lru_add_drain_work, cpu)); | ||
741 | |||
742 | put_online_cpus(); | ||
743 | mutex_unlock(&lock); | ||
710 | } | 744 | } |
711 | 745 | ||
712 | /* | 746 | /* |
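Note that lru_add_drain_all() also drops its int return type here: flush_work() always completes, so there is nothing to report. A purely hypothetical caller that used to check the result would now just call it unconditionally:

	/* before: schedule_on_each_cpu() could fail with -ENOMEM */
	if (lru_add_drain_all())
		goto out_busy;

	/* after: drains only CPUs with pending pagevecs, cannot fail */
	lru_add_drain_all();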
diff --git a/mm/truncate.c b/mm/truncate.c index e2e8a8a7eb9d..353b683afd6e 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -567,7 +567,6 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | |||
567 | /** | 567 | /** |
568 | * truncate_pagecache - unmap and remove pagecache that has been truncated | 568 | * truncate_pagecache - unmap and remove pagecache that has been truncated |
569 | * @inode: inode | 569 | * @inode: inode |
570 | * @oldsize: old file size | ||
571 | * @newsize: new file size | 570 | * @newsize: new file size |
572 | * | 571 | * |
573 | * inode's new i_size must already be written before truncate_pagecache | 572 | * inode's new i_size must already be written before truncate_pagecache |
@@ -580,7 +579,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | |||
580 | * situations such as writepage being called for a page that has already | 579 | * situations such as writepage being called for a page that has already |
581 | * had its underlying blocks deallocated. | 580 | * had its underlying blocks deallocated. |
582 | */ | 581 | */ |
583 | void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize) | 582 | void truncate_pagecache(struct inode *inode, loff_t newsize) |
584 | { | 583 | { |
585 | struct address_space *mapping = inode->i_mapping; | 584 | struct address_space *mapping = inode->i_mapping; |
586 | loff_t holebegin = round_up(newsize, PAGE_SIZE); | 585 | loff_t holebegin = round_up(newsize, PAGE_SIZE); |
@@ -614,12 +613,8 @@ EXPORT_SYMBOL(truncate_pagecache); | |||
614 | */ | 613 | */ |
615 | void truncate_setsize(struct inode *inode, loff_t newsize) | 614 | void truncate_setsize(struct inode *inode, loff_t newsize) |
616 | { | 615 | { |
617 | loff_t oldsize; | ||
618 | |||
619 | oldsize = inode->i_size; | ||
620 | i_size_write(inode, newsize); | 616 | i_size_write(inode, newsize); |
621 | 617 | truncate_pagecache(inode, newsize); | |
622 | truncate_pagecache(inode, oldsize, newsize); | ||
623 | } | 618 | } |
624 | EXPORT_SYMBOL(truncate_setsize); | 619 | EXPORT_SYMBOL(truncate_setsize); |
625 | 620 | ||
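Any filesystem that calls truncate_pagecache() directly has to drop the now-unused old-size argument. A sketch of the mechanical change in a hypothetical filesystem's size-changing path (the examplefs_* name is illustrative, not from this patch):

	static void examplefs_truncate_to(struct inode *inode, loff_t newsize)
	{
		/* before this series:
		 *	truncate_pagecache(inode, inode->i_size, newsize);
		 */
		i_size_write(inode, newsize);
		truncate_pagecache(inode, newsize);	/* old size no longer passed */
	}

Filesystems that go through truncate_setsize(), shown just above, are unaffected by the signature change.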
diff --git a/mm/vmscan.c b/mm/vmscan.c index beb35778c69f..8ed1b775bdc9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -139,11 +139,23 @@ static bool global_reclaim(struct scan_control *sc) | |||
139 | { | 139 | { |
140 | return !sc->target_mem_cgroup; | 140 | return !sc->target_mem_cgroup; |
141 | } | 141 | } |
142 | |||
143 | static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) | ||
144 | { | ||
145 | struct mem_cgroup *root = sc->target_mem_cgroup; | ||
146 | return !mem_cgroup_disabled() && | ||
147 | mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE; | ||
148 | } | ||
142 | #else | 149 | #else |
143 | static bool global_reclaim(struct scan_control *sc) | 150 | static bool global_reclaim(struct scan_control *sc) |
144 | { | 151 | { |
145 | return true; | 152 | return true; |
146 | } | 153 | } |
154 | |||
155 | static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc) | ||
156 | { | ||
157 | return false; | ||
158 | } | ||
147 | #endif | 159 | #endif |
148 | 160 | ||
149 | unsigned long zone_reclaimable_pages(struct zone *zone) | 161 | unsigned long zone_reclaimable_pages(struct zone *zone) |
@@ -2164,9 +2176,11 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
2164 | } | 2176 | } |
2165 | } | 2177 | } |
2166 | 2178 | ||
2167 | static void shrink_zone(struct zone *zone, struct scan_control *sc) | 2179 | static int |
2180 | __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim) | ||
2168 | { | 2181 | { |
2169 | unsigned long nr_reclaimed, nr_scanned; | 2182 | unsigned long nr_reclaimed, nr_scanned; |
2183 | int groups_scanned = 0; | ||
2170 | 2184 | ||
2171 | do { | 2185 | do { |
2172 | struct mem_cgroup *root = sc->target_mem_cgroup; | 2186 | struct mem_cgroup *root = sc->target_mem_cgroup; |
@@ -2174,15 +2188,17 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2174 | .zone = zone, | 2188 | .zone = zone, |
2175 | .priority = sc->priority, | 2189 | .priority = sc->priority, |
2176 | }; | 2190 | }; |
2177 | struct mem_cgroup *memcg; | 2191 | struct mem_cgroup *memcg = NULL; |
2192 | mem_cgroup_iter_filter filter = (soft_reclaim) ? | ||
2193 | mem_cgroup_soft_reclaim_eligible : NULL; | ||
2178 | 2194 | ||
2179 | nr_reclaimed = sc->nr_reclaimed; | 2195 | nr_reclaimed = sc->nr_reclaimed; |
2180 | nr_scanned = sc->nr_scanned; | 2196 | nr_scanned = sc->nr_scanned; |
2181 | 2197 | ||
2182 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 2198 | while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { |
2183 | do { | ||
2184 | struct lruvec *lruvec; | 2199 | struct lruvec *lruvec; |
2185 | 2200 | ||
2201 | groups_scanned++; | ||
2186 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2202 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2187 | 2203 | ||
2188 | shrink_lruvec(lruvec, sc); | 2204 | shrink_lruvec(lruvec, sc); |
@@ -2202,8 +2218,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2202 | mem_cgroup_iter_break(root, memcg); | 2218 | mem_cgroup_iter_break(root, memcg); |
2203 | break; | 2219 | break; |
2204 | } | 2220 | } |
2205 | memcg = mem_cgroup_iter(root, memcg, &reclaim); | 2221 | } |
2206 | } while (memcg); | ||
2207 | 2222 | ||
2208 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, | 2223 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, |
2209 | sc->nr_scanned - nr_scanned, | 2224 | sc->nr_scanned - nr_scanned, |
@@ -2211,6 +2226,37 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2211 | 2226 | ||
2212 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, | 2227 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, |
2213 | sc->nr_scanned - nr_scanned, sc)); | 2228 | sc->nr_scanned - nr_scanned, sc)); |
2229 | |||
2230 | return groups_scanned; | ||
2231 | } | ||
2232 | |||
2233 | |||
2234 | static void shrink_zone(struct zone *zone, struct scan_control *sc) | ||
2235 | { | ||
2236 | bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc); | ||
2237 | unsigned long nr_scanned = sc->nr_scanned; | ||
2238 | int scanned_groups; | ||
2239 | |||
2240 | scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim); | ||
2241 | /* | ||
2242 |  * memcg iterator might race with another reclaimer or start from | ||
2243 |  * an incomplete tree walk so the tree walk in __shrink_zone | ||
2244 |  * might have missed groups that are above the soft limit. Try | ||
2245 |  * another loop to catch up with others. Do it just once to | ||
2246 |  * prevent reclaim latencies when other reclaimers always | ||
2247 | * preempt this one. | ||
2248 | */ | ||
2249 | if (do_soft_reclaim && !scanned_groups) | ||
2250 | __shrink_zone(zone, sc, do_soft_reclaim); | ||
2251 | |||
2252 | /* | ||
2253 |  * No group is over its soft limit, or those that are have no | ||
2254 |  * pages in the zone we are reclaiming, so we have to reclaim everybody | ||
2255 | */ | ||
2256 | if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) { | ||
2257 | __shrink_zone(zone, sc, false); | ||
2258 | return; | ||
2259 | } | ||
2214 | } | 2260 | } |
2215 | 2261 | ||
2216 | /* Returns true if compaction should go ahead for a high-order request */ | 2262 | /* Returns true if compaction should go ahead for a high-order request */ |
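The filter passed to mem_cgroup_iter_cond() in __shrink_zone() is mem_cgroup_soft_reclaim_eligible(), which lets the walk prune entire subtrees that are nowhere over their soft limit. A simplified sketch of the decision it is expected to make, based on the children_in_excess counter maintained in the memcontrol.c hunks; only SKIP_TREE appears in this patch, so the other return values and the exact ancestor handling are assumptions:

	/* Simplified sketch, not the in-tree implementation. */
	static enum mem_cgroup_filter_t
	soft_reclaim_eligible(struct mem_cgroup *memcg, struct mem_cgroup *root)
	{
		if (res_counter_soft_limit_excess(&memcg->res))
			return VISIT;		/* this group is over its soft limit */
		if (atomic_read(&memcg->children_in_excess))
			return SKIP;		/* keep descending: some child is over */
		return SKIP_TREE;		/* nothing below is over: prune the subtree */
	}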
@@ -2274,8 +2320,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2274 | { | 2320 | { |
2275 | struct zoneref *z; | 2321 | struct zoneref *z; |
2276 | struct zone *zone; | 2322 | struct zone *zone; |
2277 | unsigned long nr_soft_reclaimed; | ||
2278 | unsigned long nr_soft_scanned; | ||
2279 | bool aborted_reclaim = false; | 2323 | bool aborted_reclaim = false; |
2280 | 2324 | ||
2281 | /* | 2325 | /* |
@@ -2315,18 +2359,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2315 | continue; | 2359 | continue; |
2316 | } | 2360 | } |
2317 | } | 2361 | } |
2318 | /* | ||
2319 | * This steals pages from memory cgroups over softlimit | ||
2320 | * and returns the number of reclaimed pages and | ||
2321 | * scanned pages. This works for global memory pressure | ||
2322 | * and balancing, not for a memcg's limit. | ||
2323 | */ | ||
2324 | nr_soft_scanned = 0; | ||
2325 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
2326 | sc->order, sc->gfp_mask, | ||
2327 | &nr_soft_scanned); | ||
2328 | sc->nr_reclaimed += nr_soft_reclaimed; | ||
2329 | sc->nr_scanned += nr_soft_scanned; | ||
2330 | /* need some check to avoid more shrink_zone() */ | 2362 |
2331 | } | 2363 | } |
2332 | 2364 | ||
@@ -2920,8 +2952,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2920 | { | 2952 | { |
2921 | int i; | 2953 | int i; |
2922 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2954 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
2923 | unsigned long nr_soft_reclaimed; | ||
2924 | unsigned long nr_soft_scanned; | ||
2925 | struct scan_control sc = { | 2955 | struct scan_control sc = { |
2926 | .gfp_mask = GFP_KERNEL, | 2956 | .gfp_mask = GFP_KERNEL, |
2927 | .priority = DEF_PRIORITY, | 2957 | .priority = DEF_PRIORITY, |
@@ -3036,15 +3066,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3036 | 3066 | ||
3037 | sc.nr_scanned = 0; | 3067 | sc.nr_scanned = 0; |
3038 | 3068 | ||
3039 | nr_soft_scanned = 0; | ||
3040 | /* | ||
3041 | * Call soft limit reclaim before calling shrink_zone. | ||
3042 | */ | ||
3043 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
3044 | order, sc.gfp_mask, | ||
3045 | &nr_soft_scanned); | ||
3046 | sc.nr_reclaimed += nr_soft_reclaimed; | ||
3047 | |||
3048 | /* | 3069 | /* |
3049 | * There should be no need to raise the scanning | 3070 | * There should be no need to raise the scanning |
3050 | * priority if enough pages are already being scanned | 3071 | * priority if enough pages are already being scanned |