Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig               25
-rw-r--r--  mm/bootmem.c              8
-rw-r--r--  mm/bounce.c               2
-rw-r--r--  mm/compaction.c          14
-rw-r--r--  mm/filemap.c             21
-rw-r--r--  mm/filemap_xip.c          2
-rw-r--r--  mm/huge_memory.c        404
-rw-r--r--  mm/hugetlb.c            127
-rw-r--r--  mm/hwpoison-inject.c      5
-rw-r--r--  mm/kmemleak.c             4
-rw-r--r--  mm/ksm.c                  4
-rw-r--r--  mm/list_lru.c             3
-rw-r--r--  mm/madvise.c              5
-rw-r--r--  mm/memblock.c           124
-rw-r--r--  mm/memcontrol.c         931
-rw-r--r--  mm/memory-failure.c      46
-rw-r--r--  mm/memory.c             224
-rw-r--r--  mm/memory_hotplug.c      65
-rw-r--r--  mm/mempolicy.c          149
-rw-r--r--  mm/migrate.c             67
-rw-r--r--  mm/mlock.c                9
-rw-r--r--  mm/mm_init.c             18
-rw-r--r--  mm/mmap.c                23
-rw-r--r--  mm/mmzone.c              14
-rw-r--r--  mm/mprotect.c            76
-rw-r--r--  mm/mremap.c               5
-rw-r--r--  mm/nobootmem.c           25
-rw-r--r--  mm/nommu.c                5
-rw-r--r--  mm/oom_kill.c             8
-rw-r--r--  mm/page-writeback.c      10
-rw-r--r--  mm/page_alloc.c          42
-rw-r--r--  mm/pagewalk.c             2
-rw-r--r--  mm/percpu.c               5
-rw-r--r--  mm/pgtable-generic.c     16
-rw-r--r--  mm/readahead.c            8
-rw-r--r--  mm/rmap.c                15
-rw-r--r--  mm/slab.c                 2
-rw-r--r--  mm/slab.h                 6
-rw-r--r--  mm/slab_common.c          4
-rw-r--r--  mm/slub.c                 4
-rw-r--r--  mm/sparse.c              53
-rw-r--r--  mm/swap.c                 3
-rw-r--r--  mm/swapfile.c            20
-rw-r--r--  mm/util.c                13
-rw-r--r--  mm/vmalloc.c             48
-rw-r--r--  mm/vmscan.c              88
-rw-r--r--  mm/vmstat.c              22
-rw-r--r--  mm/zswap.c              199
48 files changed, 1774 insertions, 1199 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 026771a9b097..eb69f352401d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -20,7 +20,7 @@ config FLATMEM_MANUAL
20 20
21 Some users of more advanced features like NUMA and 21 Some users of more advanced features like NUMA and
22 memory hotplug may have different options here. 22 memory hotplug may have different options here.
23 DISCONTIGMEM is an more mature, better tested system, 23 DISCONTIGMEM is a more mature, better tested system,
24 but is incompatible with memory hotplug and may suffer 24 but is incompatible with memory hotplug and may suffer
25 decreased performance over SPARSEMEM. If unsure between 25 decreased performance over SPARSEMEM. If unsure between
26 "Sparse Memory" and "Discontiguous Memory", choose 26 "Sparse Memory" and "Discontiguous Memory", choose
@@ -153,11 +153,18 @@ config MOVABLE_NODE
153 help 153 help
154 Allow a node to have only movable memory. Pages used by the kernel, 154 Allow a node to have only movable memory. Pages used by the kernel,
155 such as direct mapping pages cannot be migrated. So the corresponding 155 such as direct mapping pages cannot be migrated. So the corresponding
156 memory device cannot be hotplugged. This option allows users to 156 memory device cannot be hotplugged. This option allows the following
157 online all the memory of a node as movable memory so that the whole 157 two things:
158 node can be hotplugged. Users who don't use the memory hotplug 158 - When the system is booting, node full of hotpluggable memory can
159 feature are fine with this option on since they don't online memory 159 be arranged to have only movable memory so that the whole node can
160 as movable. 160 be hot-removed. (need movable_node boot option specified).
161 - After the system is up, the option allows users to online all the
162 memory of a node as movable memory so that the whole node can be
163 hot-removed.
164
165 Users who don't use the memory hotplug feature are fine with this
166 option on since they don't specify movable_node boot option or they
167 don't online memory as movable.
161 168
162 Say Y here if you want to hotplug a whole node. 169 Say Y here if you want to hotplug a whole node.
163 Say N here if you want kernel to use memory on all nodes evenly. 170 Say N here if you want kernel to use memory on all nodes evenly.
@@ -183,7 +190,7 @@ config MEMORY_HOTPLUG_SPARSE
183config MEMORY_HOTREMOVE 190config MEMORY_HOTREMOVE
184 bool "Allow for memory hot remove" 191 bool "Allow for memory hot remove"
185 select MEMORY_ISOLATION 192 select MEMORY_ISOLATION
186 select HAVE_BOOTMEM_INFO_NODE if X86_64 193 select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
187 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE 194 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
188 depends on MIGRATION 195 depends on MIGRATION
189 196
@@ -211,9 +218,11 @@ config SPLIT_PTLOCK_CPUS
211 int 218 int
212 default "999999" if ARM && !CPU_CACHE_VIPT 219 default "999999" if ARM && !CPU_CACHE_VIPT
213 default "999999" if PARISC && !PA20 220 default "999999" if PARISC && !PA20
214 default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC
215 default "4" 221 default "4"
216 222
223config ARCH_ENABLE_SPLIT_PMD_PTLOCK
224 boolean
225
217# 226#
218# support for memory balloon compaction 227# support for memory balloon compaction
219config BALLOON_COMPACTION 228config BALLOON_COMPACTION
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 6ab7744e692e..90bd3507b413 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -172,11 +172,12 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
172static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) 172static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
173{ 173{
174 struct page *page; 174 struct page *page;
175 unsigned long start, end, pages, count = 0; 175 unsigned long *map, start, end, pages, count = 0;
176 176
177 if (!bdata->node_bootmem_map) 177 if (!bdata->node_bootmem_map)
178 return 0; 178 return 0;
179 179
180 map = bdata->node_bootmem_map;
180 start = bdata->node_min_pfn; 181 start = bdata->node_min_pfn;
181 end = bdata->node_low_pfn; 182 end = bdata->node_low_pfn;
182 183
@@ -184,10 +185,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
184 bdata - bootmem_node_data, start, end); 185 bdata - bootmem_node_data, start, end);
185 186
186 while (start < end) { 187 while (start < end) {
187 unsigned long *map, idx, vec; 188 unsigned long idx, vec;
188 unsigned shift; 189 unsigned shift;
189 190
190 map = bdata->node_bootmem_map;
191 idx = start - bdata->node_min_pfn; 191 idx = start - bdata->node_min_pfn;
192 shift = idx & (BITS_PER_LONG - 1); 192 shift = idx & (BITS_PER_LONG - 1);
193 /* 193 /*
@@ -784,7 +784,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
784 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 784 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
785 785
786 /* update goal according ...MAX_DMA32_PFN */ 786 /* update goal according ...MAX_DMA32_PFN */
787 end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; 787 end_pfn = pgdat_end_pfn(pgdat);
788 788
789 if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && 789 if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
790 (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { 790 (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
diff --git a/mm/bounce.c b/mm/bounce.c
index c9f0a4339a7d..5a7d58fb883b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -204,6 +204,8 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
204 struct bio_vec *to, *from; 204 struct bio_vec *to, *from;
205 unsigned i; 205 unsigned i;
206 206
207 if (force)
208 goto bounce;
207 bio_for_each_segment(from, *bio_orig, i) 209 bio_for_each_segment(from, *bio_orig, i)
208 if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q)) 210 if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q))
209 goto bounce; 211 goto bounce;
diff --git a/mm/compaction.c b/mm/compaction.c
index c43789388cd8..805165bcd3dd 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -235,10 +235,9 @@ static bool suitable_migration_target(struct page *page)
235} 235}
236 236
237/* 237/*
238 * Isolate free pages onto a private freelist. Caller must hold zone->lock. 238 * Isolate free pages onto a private freelist. If @strict is true, will abort
239 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free 239 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
240 * pages inside of the pageblock (even though it may still end up isolating 240 * (even though it may still end up isolating some pages).
241 * some pages).
242 */ 241 */
243static unsigned long isolate_freepages_block(struct compact_control *cc, 242static unsigned long isolate_freepages_block(struct compact_control *cc,
244 unsigned long blockpfn, 243 unsigned long blockpfn,
@@ -677,6 +676,13 @@ static void isolate_freepages(struct zone *zone,
677 pfn -= pageblock_nr_pages) { 676 pfn -= pageblock_nr_pages) {
678 unsigned long isolated; 677 unsigned long isolated;
679 678
679 /*
680 * This can iterate a massively long zone without finding any
681 * suitable migration targets, so periodically check if we need
682 * to schedule.
683 */
684 cond_resched();
685
680 if (!pfn_valid(pfn)) 686 if (!pfn_valid(pfn))
681 continue; 687 continue;
682 688
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6aec4a2d2e..b7749a92021c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1090,7 +1090,6 @@ static void shrink_readahead_size_eio(struct file *filp,
1090 * @filp: the file to read 1090 * @filp: the file to read
1091 * @ppos: current file position 1091 * @ppos: current file position
1092 * @desc: read_descriptor 1092 * @desc: read_descriptor
1093 * @actor: read method
1094 * 1093 *
1095 * This is a generic file read routine, and uses the 1094 * This is a generic file read routine, and uses the
1096 * mapping->a_ops->readpage() function for the actual low-level stuff. 1095 * mapping->a_ops->readpage() function for the actual low-level stuff.
@@ -1099,7 +1098,7 @@ static void shrink_readahead_size_eio(struct file *filp,
1099 * of the logic when it comes to error handling etc. 1098 * of the logic when it comes to error handling etc.
1100 */ 1099 */
1101static void do_generic_file_read(struct file *filp, loff_t *ppos, 1100static void do_generic_file_read(struct file *filp, loff_t *ppos,
1102 read_descriptor_t *desc, read_actor_t actor) 1101 read_descriptor_t *desc)
1103{ 1102{
1104 struct address_space *mapping = filp->f_mapping; 1103 struct address_space *mapping = filp->f_mapping;
1105 struct inode *inode = mapping->host; 1104 struct inode *inode = mapping->host;
@@ -1200,13 +1199,14 @@ page_ok:
1200 * Ok, we have the page, and it's up-to-date, so 1199 * Ok, we have the page, and it's up-to-date, so
1201 * now we can copy it to user space... 1200 * now we can copy it to user space...
1202 * 1201 *
1203 * The actor routine returns how many bytes were actually used.. 1202 * The file_read_actor routine returns how many bytes were
1203 * actually used..
1204 * NOTE! This may not be the same as how much of a user buffer 1204 * NOTE! This may not be the same as how much of a user buffer
1205 * we filled up (we may be padding etc), so we can only update 1205 * we filled up (we may be padding etc), so we can only update
1206 * "pos" here (the actor routine has to update the user buffer 1206 * "pos" here (the actor routine has to update the user buffer
1207 * pointers and the remaining count). 1207 * pointers and the remaining count).
1208 */ 1208 */
1209 ret = actor(desc, page, offset, nr); 1209 ret = file_read_actor(desc, page, offset, nr);
1210 offset += ret; 1210 offset += ret;
1211 index += offset >> PAGE_CACHE_SHIFT; 1211 index += offset >> PAGE_CACHE_SHIFT;
1212 offset &= ~PAGE_CACHE_MASK; 1212 offset &= ~PAGE_CACHE_MASK;
@@ -1479,7 +1479,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1479 if (desc.count == 0) 1479 if (desc.count == 0)
1480 continue; 1480 continue;
1481 desc.error = 0; 1481 desc.error = 0;
1482 do_generic_file_read(filp, ppos, &desc, file_read_actor); 1482 do_generic_file_read(filp, ppos, &desc);
1483 retval += desc.written; 1483 retval += desc.written;
1484 if (desc.error) { 1484 if (desc.error) {
1485 retval = retval ?: desc.error; 1485 retval = retval ?: desc.error;
@@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1616 struct inode *inode = mapping->host; 1616 struct inode *inode = mapping->host;
1617 pgoff_t offset = vmf->pgoff; 1617 pgoff_t offset = vmf->pgoff;
1618 struct page *page; 1618 struct page *page;
1619 bool memcg_oom;
1620 pgoff_t size; 1619 pgoff_t size;
1621 int ret = 0; 1620 int ret = 0;
1622 1621
@@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1625 return VM_FAULT_SIGBUS; 1624 return VM_FAULT_SIGBUS;
1626 1625
1627 /* 1626 /*
1628 * Do we have something in the page cache already? Either 1627 * Do we have something in the page cache already?
1629 * way, try readahead, but disable the memcg OOM killer for it
1630 * as readahead is optional and no errors are propagated up
1631 * the fault stack. The OOM killer is enabled while trying to
1632 * instantiate the faulting page individually below.
1633 */ 1628 */
1634 page = find_get_page(mapping, offset); 1629 page = find_get_page(mapping, offset);
1635 if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { 1630 if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
@@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1637 * We found the page, so try async readahead before 1632 * We found the page, so try async readahead before
1638 * waiting for the lock. 1633 * waiting for the lock.
1639 */ 1634 */
1640 memcg_oom = mem_cgroup_toggle_oom(false);
1641 do_async_mmap_readahead(vma, ra, file, page, offset); 1635 do_async_mmap_readahead(vma, ra, file, page, offset);
1642 mem_cgroup_toggle_oom(memcg_oom);
1643 } else if (!page) { 1636 } else if (!page) {
1644 /* No page in the page cache at all */ 1637 /* No page in the page cache at all */
1645 memcg_oom = mem_cgroup_toggle_oom(false);
1646 do_sync_mmap_readahead(vma, ra, file, offset); 1638 do_sync_mmap_readahead(vma, ra, file, offset);
1647 mem_cgroup_toggle_oom(memcg_oom);
1648 count_vm_event(PGMAJFAULT); 1639 count_vm_event(PGMAJFAULT);
1649 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1640 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1650 ret = VM_FAULT_MAJOR; 1641 ret = VM_FAULT_MAJOR;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 28fe26b64f8a..d8d9fe3f685c 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -26,7 +26,7 @@
26 * of ZERO_PAGE(), such as /dev/zero 26 * of ZERO_PAGE(), such as /dev/zero
27 */ 27 */
28static DEFINE_MUTEX(xip_sparse_mutex); 28static DEFINE_MUTEX(xip_sparse_mutex);
29static seqcount_t xip_sparse_seq = SEQCNT_ZERO; 29static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq);
30static struct page *__xip_sparse_page; 30static struct page *__xip_sparse_page;
31 31
32/* called under xip_sparse_mutex */ 32/* called under xip_sparse_mutex */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7489884682d8..bccd5a628ea6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -27,11 +27,12 @@
27#include "internal.h" 27#include "internal.h"
28 28
29/* 29/*
30 * By default transparent hugepage support is enabled for all mappings 30 * By default transparent hugepage support is disabled in order that avoid
31 * and khugepaged scans all mappings. Defrag is only invoked by 31 * to risk increase the memory footprint of applications without a guaranteed
32 * khugepaged hugepage allocations and by page faults inside 32 * benefit. When transparent hugepage support is enabled, is for all mappings,
33 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived 33 * and khugepaged scans all mappings.
34 * allocations. 34 * Defrag is invoked by khugepaged hugepage allocations and by page faults
35 * for all hugepage allocations.
35 */ 36 */
36unsigned long transparent_hugepage_flags __read_mostly = 37unsigned long transparent_hugepage_flags __read_mostly =
37#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS 38#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
@@ -709,6 +710,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
709 struct page *page) 710 struct page *page)
710{ 711{
711 pgtable_t pgtable; 712 pgtable_t pgtable;
713 spinlock_t *ptl;
712 714
713 VM_BUG_ON(!PageCompound(page)); 715 VM_BUG_ON(!PageCompound(page));
714 pgtable = pte_alloc_one(mm, haddr); 716 pgtable = pte_alloc_one(mm, haddr);
@@ -723,9 +725,9 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
723 */ 725 */
724 __SetPageUptodate(page); 726 __SetPageUptodate(page);
725 727
726 spin_lock(&mm->page_table_lock); 728 ptl = pmd_lock(mm, pmd);
727 if (unlikely(!pmd_none(*pmd))) { 729 if (unlikely(!pmd_none(*pmd))) {
728 spin_unlock(&mm->page_table_lock); 730 spin_unlock(ptl);
729 mem_cgroup_uncharge_page(page); 731 mem_cgroup_uncharge_page(page);
730 put_page(page); 732 put_page(page);
731 pte_free(mm, pgtable); 733 pte_free(mm, pgtable);
@@ -737,8 +739,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
737 pgtable_trans_huge_deposit(mm, pmd, pgtable); 739 pgtable_trans_huge_deposit(mm, pmd, pgtable);
738 set_pmd_at(mm, haddr, pmd, entry); 740 set_pmd_at(mm, haddr, pmd, entry);
739 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 741 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
740 mm->nr_ptes++; 742 atomic_long_inc(&mm->nr_ptes);
741 spin_unlock(&mm->page_table_lock); 743 spin_unlock(ptl);
742 } 744 }
743 745
744 return 0; 746 return 0;
@@ -758,14 +760,7 @@ static inline struct page *alloc_hugepage_vma(int defrag,
758 HPAGE_PMD_ORDER, vma, haddr, nd); 760 HPAGE_PMD_ORDER, vma, haddr, nd);
759} 761}
760 762
761#ifndef CONFIG_NUMA 763/* Caller must hold page table lock. */
762static inline struct page *alloc_hugepage(int defrag)
763{
764 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
765 HPAGE_PMD_ORDER);
766}
767#endif
768
769static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 764static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
770 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 765 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
771 struct page *zero_page) 766 struct page *zero_page)
@@ -778,7 +773,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
778 entry = pmd_mkhuge(entry); 773 entry = pmd_mkhuge(entry);
779 pgtable_trans_huge_deposit(mm, pmd, pgtable); 774 pgtable_trans_huge_deposit(mm, pmd, pgtable);
780 set_pmd_at(mm, haddr, pmd, entry); 775 set_pmd_at(mm, haddr, pmd, entry);
781 mm->nr_ptes++; 776 atomic_long_inc(&mm->nr_ptes);
782 return true; 777 return true;
783} 778}
784 779
@@ -797,6 +792,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
797 return VM_FAULT_OOM; 792 return VM_FAULT_OOM;
798 if (!(flags & FAULT_FLAG_WRITE) && 793 if (!(flags & FAULT_FLAG_WRITE) &&
799 transparent_hugepage_use_zero_page()) { 794 transparent_hugepage_use_zero_page()) {
795 spinlock_t *ptl;
800 pgtable_t pgtable; 796 pgtable_t pgtable;
801 struct page *zero_page; 797 struct page *zero_page;
802 bool set; 798 bool set;
@@ -809,10 +805,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
809 count_vm_event(THP_FAULT_FALLBACK); 805 count_vm_event(THP_FAULT_FALLBACK);
810 return VM_FAULT_FALLBACK; 806 return VM_FAULT_FALLBACK;
811 } 807 }
812 spin_lock(&mm->page_table_lock); 808 ptl = pmd_lock(mm, pmd);
813 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, 809 set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
814 zero_page); 810 zero_page);
815 spin_unlock(&mm->page_table_lock); 811 spin_unlock(ptl);
816 if (!set) { 812 if (!set) {
817 pte_free(mm, pgtable); 813 pte_free(mm, pgtable);
818 put_huge_zero_page(); 814 put_huge_zero_page();
@@ -845,6 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
845 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 841 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
846 struct vm_area_struct *vma) 842 struct vm_area_struct *vma)
847{ 843{
844 spinlock_t *dst_ptl, *src_ptl;
848 struct page *src_page; 845 struct page *src_page;
849 pmd_t pmd; 846 pmd_t pmd;
850 pgtable_t pgtable; 847 pgtable_t pgtable;
@@ -855,8 +852,9 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
855 if (unlikely(!pgtable)) 852 if (unlikely(!pgtable))
856 goto out; 853 goto out;
857 854
858 spin_lock(&dst_mm->page_table_lock); 855 dst_ptl = pmd_lock(dst_mm, dst_pmd);
859 spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); 856 src_ptl = pmd_lockptr(src_mm, src_pmd);
857 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
860 858
861 ret = -EAGAIN; 859 ret = -EAGAIN;
862 pmd = *src_pmd; 860 pmd = *src_pmd;
@@ -865,7 +863,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
865 goto out_unlock; 863 goto out_unlock;
866 } 864 }
867 /* 865 /*
868 * mm->page_table_lock is enough to be sure that huge zero pmd is not 866 * When page table lock is held, the huge zero pmd should not be
869 * under splitting since we don't split the page itself, only pmd to 867 * under splitting since we don't split the page itself, only pmd to
870 * a page table. 868 * a page table.
871 */ 869 */
@@ -886,8 +884,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
886 } 884 }
887 if (unlikely(pmd_trans_splitting(pmd))) { 885 if (unlikely(pmd_trans_splitting(pmd))) {
888 /* split huge page running from under us */ 886 /* split huge page running from under us */
889 spin_unlock(&src_mm->page_table_lock); 887 spin_unlock(src_ptl);
890 spin_unlock(&dst_mm->page_table_lock); 888 spin_unlock(dst_ptl);
891 pte_free(dst_mm, pgtable); 889 pte_free(dst_mm, pgtable);
892 890
893 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ 891 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
@@ -903,12 +901,12 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
903 pmd = pmd_mkold(pmd_wrprotect(pmd)); 901 pmd = pmd_mkold(pmd_wrprotect(pmd));
904 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); 902 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
905 set_pmd_at(dst_mm, addr, dst_pmd, pmd); 903 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
906 dst_mm->nr_ptes++; 904 atomic_long_inc(&dst_mm->nr_ptes);
907 905
908 ret = 0; 906 ret = 0;
909out_unlock: 907out_unlock:
910 spin_unlock(&src_mm->page_table_lock); 908 spin_unlock(src_ptl);
911 spin_unlock(&dst_mm->page_table_lock); 909 spin_unlock(dst_ptl);
912out: 910out:
913 return ret; 911 return ret;
914} 912}
@@ -919,10 +917,11 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
919 pmd_t *pmd, pmd_t orig_pmd, 917 pmd_t *pmd, pmd_t orig_pmd,
920 int dirty) 918 int dirty)
921{ 919{
920 spinlock_t *ptl;
922 pmd_t entry; 921 pmd_t entry;
923 unsigned long haddr; 922 unsigned long haddr;
924 923
925 spin_lock(&mm->page_table_lock); 924 ptl = pmd_lock(mm, pmd);
926 if (unlikely(!pmd_same(*pmd, orig_pmd))) 925 if (unlikely(!pmd_same(*pmd, orig_pmd)))
927 goto unlock; 926 goto unlock;
928 927
@@ -932,13 +931,14 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
932 update_mmu_cache_pmd(vma, address, pmd); 931 update_mmu_cache_pmd(vma, address, pmd);
933 932
934unlock: 933unlock:
935 spin_unlock(&mm->page_table_lock); 934 spin_unlock(ptl);
936} 935}
937 936
938static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, 937static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
939 struct vm_area_struct *vma, unsigned long address, 938 struct vm_area_struct *vma, unsigned long address,
940 pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) 939 pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
941{ 940{
941 spinlock_t *ptl;
942 pgtable_t pgtable; 942 pgtable_t pgtable;
943 pmd_t _pmd; 943 pmd_t _pmd;
944 struct page *page; 944 struct page *page;
@@ -965,7 +965,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
965 mmun_end = haddr + HPAGE_PMD_SIZE; 965 mmun_end = haddr + HPAGE_PMD_SIZE;
966 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 966 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
967 967
968 spin_lock(&mm->page_table_lock); 968 ptl = pmd_lock(mm, pmd);
969 if (unlikely(!pmd_same(*pmd, orig_pmd))) 969 if (unlikely(!pmd_same(*pmd, orig_pmd)))
970 goto out_free_page; 970 goto out_free_page;
971 971
@@ -992,7 +992,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
992 } 992 }
993 smp_wmb(); /* make pte visible before pmd */ 993 smp_wmb(); /* make pte visible before pmd */
994 pmd_populate(mm, pmd, pgtable); 994 pmd_populate(mm, pmd, pgtable);
995 spin_unlock(&mm->page_table_lock); 995 spin_unlock(ptl);
996 put_huge_zero_page(); 996 put_huge_zero_page();
997 inc_mm_counter(mm, MM_ANONPAGES); 997 inc_mm_counter(mm, MM_ANONPAGES);
998 998
@@ -1002,7 +1002,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
1002out: 1002out:
1003 return ret; 1003 return ret;
1004out_free_page: 1004out_free_page:
1005 spin_unlock(&mm->page_table_lock); 1005 spin_unlock(ptl);
1006 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1006 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1007 mem_cgroup_uncharge_page(page); 1007 mem_cgroup_uncharge_page(page);
1008 put_page(page); 1008 put_page(page);
@@ -1016,6 +1016,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1016 struct page *page, 1016 struct page *page,
1017 unsigned long haddr) 1017 unsigned long haddr)
1018{ 1018{
1019 spinlock_t *ptl;
1019 pgtable_t pgtable; 1020 pgtable_t pgtable;
1020 pmd_t _pmd; 1021 pmd_t _pmd;
1021 int ret = 0, i; 1022 int ret = 0, i;
@@ -1062,7 +1063,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1062 mmun_end = haddr + HPAGE_PMD_SIZE; 1063 mmun_end = haddr + HPAGE_PMD_SIZE;
1063 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1064 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1064 1065
1065 spin_lock(&mm->page_table_lock); 1066 ptl = pmd_lock(mm, pmd);
1066 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1067 if (unlikely(!pmd_same(*pmd, orig_pmd)))
1067 goto out_free_pages; 1068 goto out_free_pages;
1068 VM_BUG_ON(!PageHead(page)); 1069 VM_BUG_ON(!PageHead(page));
@@ -1088,7 +1089,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1088 smp_wmb(); /* make pte visible before pmd */ 1089 smp_wmb(); /* make pte visible before pmd */
1089 pmd_populate(mm, pmd, pgtable); 1090 pmd_populate(mm, pmd, pgtable);
1090 page_remove_rmap(page); 1091 page_remove_rmap(page);
1091 spin_unlock(&mm->page_table_lock); 1092 spin_unlock(ptl);
1092 1093
1093 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1094 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1094 1095
@@ -1099,7 +1100,7 @@ out:
1099 return ret; 1100 return ret;
1100 1101
1101out_free_pages: 1102out_free_pages:
1102 spin_unlock(&mm->page_table_lock); 1103 spin_unlock(ptl);
1103 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1104 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1104 mem_cgroup_uncharge_start(); 1105 mem_cgroup_uncharge_start();
1105 for (i = 0; i < HPAGE_PMD_NR; i++) { 1106 for (i = 0; i < HPAGE_PMD_NR; i++) {
@@ -1114,17 +1115,19 @@ out_free_pages:
1114int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 1115int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1115 unsigned long address, pmd_t *pmd, pmd_t orig_pmd) 1116 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
1116{ 1117{
1118 spinlock_t *ptl;
1117 int ret = 0; 1119 int ret = 0;
1118 struct page *page = NULL, *new_page; 1120 struct page *page = NULL, *new_page;
1119 unsigned long haddr; 1121 unsigned long haddr;
1120 unsigned long mmun_start; /* For mmu_notifiers */ 1122 unsigned long mmun_start; /* For mmu_notifiers */
1121 unsigned long mmun_end; /* For mmu_notifiers */ 1123 unsigned long mmun_end; /* For mmu_notifiers */
1122 1124
1125 ptl = pmd_lockptr(mm, pmd);
1123 VM_BUG_ON(!vma->anon_vma); 1126 VM_BUG_ON(!vma->anon_vma);
1124 haddr = address & HPAGE_PMD_MASK; 1127 haddr = address & HPAGE_PMD_MASK;
1125 if (is_huge_zero_pmd(orig_pmd)) 1128 if (is_huge_zero_pmd(orig_pmd))
1126 goto alloc; 1129 goto alloc;
1127 spin_lock(&mm->page_table_lock); 1130 spin_lock(ptl);
1128 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1131 if (unlikely(!pmd_same(*pmd, orig_pmd)))
1129 goto out_unlock; 1132 goto out_unlock;
1130 1133
@@ -1140,7 +1143,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1140 goto out_unlock; 1143 goto out_unlock;
1141 } 1144 }
1142 get_page(page); 1145 get_page(page);
1143 spin_unlock(&mm->page_table_lock); 1146 spin_unlock(ptl);
1144alloc: 1147alloc:
1145 if (transparent_hugepage_enabled(vma) && 1148 if (transparent_hugepage_enabled(vma) &&
1146 !transparent_hugepage_debug_cow()) 1149 !transparent_hugepage_debug_cow())
@@ -1187,11 +1190,11 @@ alloc:
1187 mmun_end = haddr + HPAGE_PMD_SIZE; 1190 mmun_end = haddr + HPAGE_PMD_SIZE;
1188 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1191 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1189 1192
1190 spin_lock(&mm->page_table_lock); 1193 spin_lock(ptl);
1191 if (page) 1194 if (page)
1192 put_page(page); 1195 put_page(page);
1193 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 1196 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
1194 spin_unlock(&mm->page_table_lock); 1197 spin_unlock(ptl);
1195 mem_cgroup_uncharge_page(new_page); 1198 mem_cgroup_uncharge_page(new_page);
1196 put_page(new_page); 1199 put_page(new_page);
1197 goto out_mn; 1200 goto out_mn;
@@ -1213,13 +1216,13 @@ alloc:
1213 } 1216 }
1214 ret |= VM_FAULT_WRITE; 1217 ret |= VM_FAULT_WRITE;
1215 } 1218 }
1216 spin_unlock(&mm->page_table_lock); 1219 spin_unlock(ptl);
1217out_mn: 1220out_mn:
1218 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1221 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1219out: 1222out:
1220 return ret; 1223 return ret;
1221out_unlock: 1224out_unlock:
1222 spin_unlock(&mm->page_table_lock); 1225 spin_unlock(ptl);
1223 return ret; 1226 return ret;
1224} 1227}
1225 1228
@@ -1231,7 +1234,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1231 struct mm_struct *mm = vma->vm_mm; 1234 struct mm_struct *mm = vma->vm_mm;
1232 struct page *page = NULL; 1235 struct page *page = NULL;
1233 1236
1234 assert_spin_locked(&mm->page_table_lock); 1237 assert_spin_locked(pmd_lockptr(mm, pmd));
1235 1238
1236 if (flags & FOLL_WRITE && !pmd_write(*pmd)) 1239 if (flags & FOLL_WRITE && !pmd_write(*pmd))
1237 goto out; 1240 goto out;
@@ -1278,73 +1281,116 @@ out:
1278int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 1281int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1279 unsigned long addr, pmd_t pmd, pmd_t *pmdp) 1282 unsigned long addr, pmd_t pmd, pmd_t *pmdp)
1280{ 1283{
1284 spinlock_t *ptl;
1285 struct anon_vma *anon_vma = NULL;
1281 struct page *page; 1286 struct page *page;
1282 unsigned long haddr = addr & HPAGE_PMD_MASK; 1287 unsigned long haddr = addr & HPAGE_PMD_MASK;
1283 int target_nid; 1288 int page_nid = -1, this_nid = numa_node_id();
1284 int current_nid = -1; 1289 int target_nid, last_cpupid = -1;
1285 bool migrated; 1290 bool page_locked;
1291 bool migrated = false;
1292 int flags = 0;
1286 1293
1287 spin_lock(&mm->page_table_lock); 1294 ptl = pmd_lock(mm, pmdp);
1288 if (unlikely(!pmd_same(pmd, *pmdp))) 1295 if (unlikely(!pmd_same(pmd, *pmdp)))
1289 goto out_unlock; 1296 goto out_unlock;
1290 1297
1291 page = pmd_page(pmd); 1298 page = pmd_page(pmd);
1292 get_page(page); 1299 BUG_ON(is_huge_zero_page(page));
1293 current_nid = page_to_nid(page); 1300 page_nid = page_to_nid(page);
1301 last_cpupid = page_cpupid_last(page);
1294 count_vm_numa_event(NUMA_HINT_FAULTS); 1302 count_vm_numa_event(NUMA_HINT_FAULTS);
1295 if (current_nid == numa_node_id()) 1303 if (page_nid == this_nid) {
1296 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 1304 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1305 flags |= TNF_FAULT_LOCAL;
1306 }
1297 1307
1308 /*
1309 * Avoid grouping on DSO/COW pages in specific and RO pages
1310 * in general, RO pages shouldn't hurt as much anyway since
1311 * they can be in shared cache state.
1312 */
1313 if (!pmd_write(pmd))
1314 flags |= TNF_NO_GROUP;
1315
1316 /*
1317 * Acquire the page lock to serialise THP migrations but avoid dropping
1318 * page_table_lock if at all possible
1319 */
1320 page_locked = trylock_page(page);
1298 target_nid = mpol_misplaced(page, vma, haddr); 1321 target_nid = mpol_misplaced(page, vma, haddr);
1299 if (target_nid == -1) { 1322 if (target_nid == -1) {
1300 put_page(page); 1323 /* If the page was locked, there are no parallel migrations */
1301 goto clear_pmdnuma; 1324 if (page_locked)
1325 goto clear_pmdnuma;
1326
1327 /*
1328 * Otherwise wait for potential migrations and retry. We do
1329 * relock and check_same as the page may no longer be mapped.
1330 * As the fault is being retried, do not account for it.
1331 */
1332 spin_unlock(ptl);
1333 wait_on_page_locked(page);
1334 page_nid = -1;
1335 goto out;
1302 } 1336 }
1303 1337
1304 /* Acquire the page lock to serialise THP migrations */ 1338 /* Page is misplaced, serialise migrations and parallel THP splits */
1305 spin_unlock(&mm->page_table_lock); 1339 get_page(page);
1306 lock_page(page); 1340 spin_unlock(ptl);
1341 if (!page_locked)
1342 lock_page(page);
1343 anon_vma = page_lock_anon_vma_read(page);
1307 1344
1308 /* Confirm the PTE did not while locked */ 1345 /* Confirm the PMD did not change while page_table_lock was released */
1309 spin_lock(&mm->page_table_lock); 1346 spin_lock(ptl);
1310 if (unlikely(!pmd_same(pmd, *pmdp))) { 1347 if (unlikely(!pmd_same(pmd, *pmdp))) {
1311 unlock_page(page); 1348 unlock_page(page);
1312 put_page(page); 1349 put_page(page);
1350 page_nid = -1;
1313 goto out_unlock; 1351 goto out_unlock;
1314 } 1352 }
1315 spin_unlock(&mm->page_table_lock);
1316 1353
1317 /* Migrate the THP to the requested node */ 1354 /*
1355 * Migrate the THP to the requested node, returns with page unlocked
1356 * and pmd_numa cleared.
1357 */
1358 spin_unlock(ptl);
1318 migrated = migrate_misplaced_transhuge_page(mm, vma, 1359 migrated = migrate_misplaced_transhuge_page(mm, vma,
1319 pmdp, pmd, addr, page, target_nid); 1360 pmdp, pmd, addr, page, target_nid);
1320 if (!migrated) 1361 if (migrated) {
1321 goto check_same; 1362 flags |= TNF_MIGRATED;
1322 1363 page_nid = target_nid;
1323 task_numa_fault(target_nid, HPAGE_PMD_NR, true); 1364 }
1324 return 0;
1325 1365
1326check_same: 1366 goto out;
1327 spin_lock(&mm->page_table_lock);
1328 if (unlikely(!pmd_same(pmd, *pmdp)))
1329 goto out_unlock;
1330clear_pmdnuma: 1367clear_pmdnuma:
1368 BUG_ON(!PageLocked(page));
1331 pmd = pmd_mknonnuma(pmd); 1369 pmd = pmd_mknonnuma(pmd);
1332 set_pmd_at(mm, haddr, pmdp, pmd); 1370 set_pmd_at(mm, haddr, pmdp, pmd);
1333 VM_BUG_ON(pmd_numa(*pmdp)); 1371 VM_BUG_ON(pmd_numa(*pmdp));
1334 update_mmu_cache_pmd(vma, addr, pmdp); 1372 update_mmu_cache_pmd(vma, addr, pmdp);
1373 unlock_page(page);
1335out_unlock: 1374out_unlock:
1336 spin_unlock(&mm->page_table_lock); 1375 spin_unlock(ptl);
1337 if (current_nid != -1) 1376
1338 task_numa_fault(current_nid, HPAGE_PMD_NR, false); 1377out:
1378 if (anon_vma)
1379 page_unlock_anon_vma_read(anon_vma);
1380
1381 if (page_nid != -1)
1382 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
1383
1339 return 0; 1384 return 0;
1340} 1385}
1341 1386
1342int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1387int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1343 pmd_t *pmd, unsigned long addr) 1388 pmd_t *pmd, unsigned long addr)
1344{ 1389{
1390 spinlock_t *ptl;
1345 int ret = 0; 1391 int ret = 0;
1346 1392
1347 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1393 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1348 struct page *page; 1394 struct page *page;
1349 pgtable_t pgtable; 1395 pgtable_t pgtable;
1350 pmd_t orig_pmd; 1396 pmd_t orig_pmd;
@@ -1358,8 +1404,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1358 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1404 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1359 pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); 1405 pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
1360 if (is_huge_zero_pmd(orig_pmd)) { 1406 if (is_huge_zero_pmd(orig_pmd)) {
1361 tlb->mm->nr_ptes--; 1407 atomic_long_dec(&tlb->mm->nr_ptes);
1362 spin_unlock(&tlb->mm->page_table_lock); 1408 spin_unlock(ptl);
1363 put_huge_zero_page(); 1409 put_huge_zero_page();
1364 } else { 1410 } else {
1365 page = pmd_page(orig_pmd); 1411 page = pmd_page(orig_pmd);
@@ -1367,8 +1413,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1367 VM_BUG_ON(page_mapcount(page) < 0); 1413 VM_BUG_ON(page_mapcount(page) < 0);
1368 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1414 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1369 VM_BUG_ON(!PageHead(page)); 1415 VM_BUG_ON(!PageHead(page));
1370 tlb->mm->nr_ptes--; 1416 atomic_long_dec(&tlb->mm->nr_ptes);
1371 spin_unlock(&tlb->mm->page_table_lock); 1417 spin_unlock(ptl);
1372 tlb_remove_page(tlb, page); 1418 tlb_remove_page(tlb, page);
1373 } 1419 }
1374 pte_free(tlb->mm, pgtable); 1420 pte_free(tlb->mm, pgtable);
@@ -1381,14 +1427,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1381 unsigned long addr, unsigned long end, 1427 unsigned long addr, unsigned long end,
1382 unsigned char *vec) 1428 unsigned char *vec)
1383{ 1429{
1430 spinlock_t *ptl;
1384 int ret = 0; 1431 int ret = 0;
1385 1432
1386 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1433 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1387 /* 1434 /*
1388 * All logical pages in the range are present 1435 * All logical pages in the range are present
1389 * if backed by a huge page. 1436 * if backed by a huge page.
1390 */ 1437 */
1391 spin_unlock(&vma->vm_mm->page_table_lock); 1438 spin_unlock(ptl);
1392 memset(vec, 1, (end - addr) >> PAGE_SHIFT); 1439 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1393 ret = 1; 1440 ret = 1;
1394 } 1441 }
@@ -1401,6 +1448,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1401 unsigned long new_addr, unsigned long old_end, 1448 unsigned long new_addr, unsigned long old_end,
1402 pmd_t *old_pmd, pmd_t *new_pmd) 1449 pmd_t *old_pmd, pmd_t *new_pmd)
1403{ 1450{
1451 spinlock_t *old_ptl, *new_ptl;
1404 int ret = 0; 1452 int ret = 0;
1405 pmd_t pmd; 1453 pmd_t pmd;
1406 1454
@@ -1421,41 +1469,69 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1421 goto out; 1469 goto out;
1422 } 1470 }
1423 1471
1424 ret = __pmd_trans_huge_lock(old_pmd, vma); 1472 /*
1473 * We don't have to worry about the ordering of src and dst
1474 * ptlocks because exclusive mmap_sem prevents deadlock.
1475 */
1476 ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
1425 if (ret == 1) { 1477 if (ret == 1) {
1478 new_ptl = pmd_lockptr(mm, new_pmd);
1479 if (new_ptl != old_ptl)
1480 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
1426 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); 1481 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1427 VM_BUG_ON(!pmd_none(*new_pmd)); 1482 VM_BUG_ON(!pmd_none(*new_pmd));
1428 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); 1483 set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
1429 spin_unlock(&mm->page_table_lock); 1484 if (new_ptl != old_ptl)
1485 spin_unlock(new_ptl);
1486 spin_unlock(old_ptl);
1430 } 1487 }
1431out: 1488out:
1432 return ret; 1489 return ret;
1433} 1490}
1434 1491
1492/*
1493 * Returns
1494 * - 0 if PMD could not be locked
1495 * - 1 if PMD was locked but protections unchange and TLB flush unnecessary
1496 * - HPAGE_PMD_NR is protections changed and TLB flush necessary
1497 */
1435int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, 1498int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1436 unsigned long addr, pgprot_t newprot, int prot_numa) 1499 unsigned long addr, pgprot_t newprot, int prot_numa)
1437{ 1500{
1438 struct mm_struct *mm = vma->vm_mm; 1501 struct mm_struct *mm = vma->vm_mm;
1502 spinlock_t *ptl;
1439 int ret = 0; 1503 int ret = 0;
1440 1504
1441 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1505 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1442 pmd_t entry; 1506 pmd_t entry;
1443 entry = pmdp_get_and_clear(mm, addr, pmd); 1507 ret = 1;
1444 if (!prot_numa) { 1508 if (!prot_numa) {
1509 entry = pmdp_get_and_clear(mm, addr, pmd);
1445 entry = pmd_modify(entry, newprot); 1510 entry = pmd_modify(entry, newprot);
1511 ret = HPAGE_PMD_NR;
1446 BUG_ON(pmd_write(entry)); 1512 BUG_ON(pmd_write(entry));
1447 } else { 1513 } else {
1448 struct page *page = pmd_page(*pmd); 1514 struct page *page = pmd_page(*pmd);
1449 1515
1450 /* only check non-shared pages */ 1516 /*
1451 if (page_mapcount(page) == 1 && 1517 * Do not trap faults against the zero page. The
1518 * read-only data is likely to be read-cached on the
1519 * local CPU cache and it is less useful to know about
1520 * local vs remote hits on the zero page.
1521 */
1522 if (!is_huge_zero_page(page) &&
1452 !pmd_numa(*pmd)) { 1523 !pmd_numa(*pmd)) {
1524 entry = pmdp_get_and_clear(mm, addr, pmd);
1453 entry = pmd_mknuma(entry); 1525 entry = pmd_mknuma(entry);
1526 ret = HPAGE_PMD_NR;
1454 } 1527 }
1455 } 1528 }
1456 set_pmd_at(mm, addr, pmd, entry); 1529
1457 spin_unlock(&vma->vm_mm->page_table_lock); 1530 /* Set PMD if cleared earlier */
1458 ret = 1; 1531 if (ret == HPAGE_PMD_NR)
1532 set_pmd_at(mm, addr, pmd, entry);
1533
1534 spin_unlock(ptl);
1459 } 1535 }
1460 1536
1461 return ret; 1537 return ret;
@@ -1468,12 +1544,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1468 * Note that if it returns 1, this routine returns without unlocking page 1544 * Note that if it returns 1, this routine returns without unlocking page
1469 * table locks. So callers must unlock them. 1545 * table locks. So callers must unlock them.
1470 */ 1546 */
1471int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 1547int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
1548 spinlock_t **ptl)
1472{ 1549{
1473 spin_lock(&vma->vm_mm->page_table_lock); 1550 *ptl = pmd_lock(vma->vm_mm, pmd);
1474 if (likely(pmd_trans_huge(*pmd))) { 1551 if (likely(pmd_trans_huge(*pmd))) {
1475 if (unlikely(pmd_trans_splitting(*pmd))) { 1552 if (unlikely(pmd_trans_splitting(*pmd))) {
1476 spin_unlock(&vma->vm_mm->page_table_lock); 1553 spin_unlock(*ptl);
1477 wait_split_huge_page(vma->anon_vma, pmd); 1554 wait_split_huge_page(vma->anon_vma, pmd);
1478 return -1; 1555 return -1;
1479 } else { 1556 } else {
@@ -1482,27 +1559,37 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1482 return 1; 1559 return 1;
1483 } 1560 }
1484 } 1561 }
1485 spin_unlock(&vma->vm_mm->page_table_lock); 1562 spin_unlock(*ptl);
1486 return 0; 1563 return 0;
1487} 1564}
1488 1565
1566/*
1567 * This function returns whether a given @page is mapped onto the @address
1568 * in the virtual space of @mm.
1569 *
1570 * When it's true, this function returns *pmd with holding the page table lock
1571 * and passing it back to the caller via @ptl.
1572 * If it's false, returns NULL without holding the page table lock.
1573 */
1489pmd_t *page_check_address_pmd(struct page *page, 1574pmd_t *page_check_address_pmd(struct page *page,
1490 struct mm_struct *mm, 1575 struct mm_struct *mm,
1491 unsigned long address, 1576 unsigned long address,
1492 enum page_check_address_pmd_flag flag) 1577 enum page_check_address_pmd_flag flag,
1578 spinlock_t **ptl)
1493{ 1579{
1494 pmd_t *pmd, *ret = NULL; 1580 pmd_t *pmd;
1495 1581
1496 if (address & ~HPAGE_PMD_MASK) 1582 if (address & ~HPAGE_PMD_MASK)
1497 goto out; 1583 return NULL;
1498 1584
1499 pmd = mm_find_pmd(mm, address); 1585 pmd = mm_find_pmd(mm, address);
1500 if (!pmd) 1586 if (!pmd)
1501 goto out; 1587 return NULL;
1588 *ptl = pmd_lock(mm, pmd);
1502 if (pmd_none(*pmd)) 1589 if (pmd_none(*pmd))
1503 goto out; 1590 goto unlock;
1504 if (pmd_page(*pmd) != page) 1591 if (pmd_page(*pmd) != page)
1505 goto out; 1592 goto unlock;
1506 /* 1593 /*
1507 * split_vma() may create temporary aliased mappings. There is 1594 * split_vma() may create temporary aliased mappings. There is
1508 * no risk as long as all huge pmd are found and have their 1595 * no risk as long as all huge pmd are found and have their
@@ -1512,14 +1599,15 @@ pmd_t *page_check_address_pmd(struct page *page,
1512 */ 1599 */
1513 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && 1600 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1514 pmd_trans_splitting(*pmd)) 1601 pmd_trans_splitting(*pmd))
1515 goto out; 1602 goto unlock;
1516 if (pmd_trans_huge(*pmd)) { 1603 if (pmd_trans_huge(*pmd)) {
1517 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && 1604 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1518 !pmd_trans_splitting(*pmd)); 1605 !pmd_trans_splitting(*pmd));
1519 ret = pmd; 1606 return pmd;
1520 } 1607 }
1521out: 1608unlock:
1522 return ret; 1609 spin_unlock(*ptl);
1610 return NULL;
1523} 1611}
1524 1612
1525static int __split_huge_page_splitting(struct page *page, 1613static int __split_huge_page_splitting(struct page *page,
@@ -1527,6 +1615,7 @@ static int __split_huge_page_splitting(struct page *page,
1527 unsigned long address) 1615 unsigned long address)
1528{ 1616{
1529 struct mm_struct *mm = vma->vm_mm; 1617 struct mm_struct *mm = vma->vm_mm;
1618 spinlock_t *ptl;
1530 pmd_t *pmd; 1619 pmd_t *pmd;
1531 int ret = 0; 1620 int ret = 0;
1532 /* For mmu_notifiers */ 1621 /* For mmu_notifiers */
@@ -1534,9 +1623,8 @@ static int __split_huge_page_splitting(struct page *page,
1534 const unsigned long mmun_end = address + HPAGE_PMD_SIZE; 1623 const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
1535 1624
1536 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1625 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1537 spin_lock(&mm->page_table_lock);
1538 pmd = page_check_address_pmd(page, mm, address, 1626 pmd = page_check_address_pmd(page, mm, address,
1539 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); 1627 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
1540 if (pmd) { 1628 if (pmd) {
1541 /* 1629 /*
1542 * We can't temporarily set the pmd to null in order 1630 * We can't temporarily set the pmd to null in order
@@ -1547,8 +1635,8 @@ static int __split_huge_page_splitting(struct page *page,
1547 */ 1635 */
1548 pmdp_splitting_flush(vma, address, pmd); 1636 pmdp_splitting_flush(vma, address, pmd);
1549 ret = 1; 1637 ret = 1;
1638 spin_unlock(ptl);
1550 } 1639 }
1551 spin_unlock(&mm->page_table_lock);
1552 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1640 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1553 1641
1554 return ret; 1642 return ret;
@@ -1636,7 +1724,7 @@ static void __split_huge_page_refcount(struct page *page,
1636 page_tail->mapping = page->mapping; 1724 page_tail->mapping = page->mapping;
1637 1725
1638 page_tail->index = page->index + i; 1726 page_tail->index = page->index + i;
1639 page_nid_xchg_last(page_tail, page_nid_last(page)); 1727 page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
1640 1728
1641 BUG_ON(!PageAnon(page_tail)); 1729 BUG_ON(!PageAnon(page_tail));
1642 BUG_ON(!PageUptodate(page_tail)); 1730 BUG_ON(!PageUptodate(page_tail));
@@ -1679,14 +1767,14 @@ static int __split_huge_page_map(struct page *page,
1679 unsigned long address) 1767 unsigned long address)
1680{ 1768{
1681 struct mm_struct *mm = vma->vm_mm; 1769 struct mm_struct *mm = vma->vm_mm;
1770 spinlock_t *ptl;
1682 pmd_t *pmd, _pmd; 1771 pmd_t *pmd, _pmd;
1683 int ret = 0, i; 1772 int ret = 0, i;
1684 pgtable_t pgtable; 1773 pgtable_t pgtable;
1685 unsigned long haddr; 1774 unsigned long haddr;
1686 1775
1687 spin_lock(&mm->page_table_lock);
1688 pmd = page_check_address_pmd(page, mm, address, 1776 pmd = page_check_address_pmd(page, mm, address,
1689 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); 1777 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
1690 if (pmd) { 1778 if (pmd) {
1691 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 1779 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1692 pmd_populate(mm, &_pmd, pgtable); 1780 pmd_populate(mm, &_pmd, pgtable);
@@ -1741,8 +1829,8 @@ static int __split_huge_page_map(struct page *page,
1741 pmdp_invalidate(vma, address, pmd); 1829 pmdp_invalidate(vma, address, pmd);
1742 pmd_populate(mm, pmd, pgtable); 1830 pmd_populate(mm, pmd, pgtable);
1743 ret = 1; 1831 ret = 1;
1832 spin_unlock(ptl);
1744 } 1833 }
1745 spin_unlock(&mm->page_table_lock);
1746 1834
1747 return ret; 1835 return ret;
1748} 1836}
@@ -2139,7 +2227,34 @@ static void khugepaged_alloc_sleep(void)
2139 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 2227 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2140} 2228}
2141 2229
2230static int khugepaged_node_load[MAX_NUMNODES];
2231
2142#ifdef CONFIG_NUMA 2232#ifdef CONFIG_NUMA
2233static int khugepaged_find_target_node(void)
2234{
2235 static int last_khugepaged_target_node = NUMA_NO_NODE;
2236 int nid, target_node = 0, max_value = 0;
2237
2238 /* find first node with max normal pages hit */
2239 for (nid = 0; nid < MAX_NUMNODES; nid++)
2240 if (khugepaged_node_load[nid] > max_value) {
2241 max_value = khugepaged_node_load[nid];
2242 target_node = nid;
2243 }
2244
2245 /* do some balance if several nodes have the same hit record */
2246 if (target_node <= last_khugepaged_target_node)
2247 for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
2248 nid++)
2249 if (max_value == khugepaged_node_load[nid]) {
2250 target_node = nid;
2251 break;
2252 }
2253
2254 last_khugepaged_target_node = target_node;
2255 return target_node;
2256}
2257
2143static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) 2258static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2144{ 2259{
2145 if (IS_ERR(*hpage)) { 2260 if (IS_ERR(*hpage)) {
@@ -2173,9 +2288,8 @@ static struct page
2173 * mmap_sem in read mode is good idea also to allow greater 2288 * mmap_sem in read mode is good idea also to allow greater
2174 * scalability. 2289 * scalability.
2175 */ 2290 */
2176 *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, 2291 *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
2177 node, __GFP_OTHER_NODE); 2292 khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
2178
2179 /* 2293 /*
2180 * After allocating the hugepage, release the mmap_sem read lock in 2294 * After allocating the hugepage, release the mmap_sem read lock in
2181 * preparation for taking it in write mode. 2295 * preparation for taking it in write mode.
@@ -2191,6 +2305,17 @@ static struct page
2191 return *hpage; 2305 return *hpage;
2192} 2306}
2193#else 2307#else
2308static int khugepaged_find_target_node(void)
2309{
2310 return 0;
2311}
2312
2313static inline struct page *alloc_hugepage(int defrag)
2314{
2315 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
2316 HPAGE_PMD_ORDER);
2317}
2318
2194static struct page *khugepaged_alloc_hugepage(bool *wait) 2319static struct page *khugepaged_alloc_hugepage(bool *wait)
2195{ 2320{
2196 struct page *hpage; 2321 struct page *hpage;
@@ -2257,7 +2382,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2257 pte_t *pte; 2382 pte_t *pte;
2258 pgtable_t pgtable; 2383 pgtable_t pgtable;
2259 struct page *new_page; 2384 struct page *new_page;
2260 spinlock_t *ptl; 2385 spinlock_t *pmd_ptl, *pte_ptl;
2261 int isolated; 2386 int isolated;
2262 unsigned long hstart, hend; 2387 unsigned long hstart, hend;
2263 unsigned long mmun_start; /* For mmu_notifiers */ 2388 unsigned long mmun_start; /* For mmu_notifiers */
@@ -2300,12 +2425,12 @@ static void collapse_huge_page(struct mm_struct *mm,
2300 anon_vma_lock_write(vma->anon_vma); 2425 anon_vma_lock_write(vma->anon_vma);
2301 2426
2302 pte = pte_offset_map(pmd, address); 2427 pte = pte_offset_map(pmd, address);
2303 ptl = pte_lockptr(mm, pmd); 2428 pte_ptl = pte_lockptr(mm, pmd);
2304 2429
2305 mmun_start = address; 2430 mmun_start = address;
2306 mmun_end = address + HPAGE_PMD_SIZE; 2431 mmun_end = address + HPAGE_PMD_SIZE;
2307 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2432 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2308 spin_lock(&mm->page_table_lock); /* probably unnecessary */ 2433 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
2309 /* 2434 /*
2310 * After this gup_fast can't run anymore. This also removes 2435 * After this gup_fast can't run anymore. This also removes
2311 * any huge TLB entry from the CPU so we won't allow 2436 * any huge TLB entry from the CPU so we won't allow
@@ -2313,16 +2438,16 @@ static void collapse_huge_page(struct mm_struct *mm,
2313 * to avoid the risk of CPU bugs in that area. 2438 * to avoid the risk of CPU bugs in that area.
2314 */ 2439 */
2315 _pmd = pmdp_clear_flush(vma, address, pmd); 2440 _pmd = pmdp_clear_flush(vma, address, pmd);
2316 spin_unlock(&mm->page_table_lock); 2441 spin_unlock(pmd_ptl);
2317 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2442 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2318 2443
2319 spin_lock(ptl); 2444 spin_lock(pte_ptl);
2320 isolated = __collapse_huge_page_isolate(vma, address, pte); 2445 isolated = __collapse_huge_page_isolate(vma, address, pte);
2321 spin_unlock(ptl); 2446 spin_unlock(pte_ptl);
2322 2447
2323 if (unlikely(!isolated)) { 2448 if (unlikely(!isolated)) {
2324 pte_unmap(pte); 2449 pte_unmap(pte);
2325 spin_lock(&mm->page_table_lock); 2450 spin_lock(pmd_ptl);
2326 BUG_ON(!pmd_none(*pmd)); 2451 BUG_ON(!pmd_none(*pmd));
2327 /* 2452 /*
2328 * We can only use set_pmd_at when establishing 2453 * We can only use set_pmd_at when establishing
@@ -2330,7 +2455,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2330 * points to regular pagetables. Use pmd_populate for that 2455 * points to regular pagetables. Use pmd_populate for that
2331 */ 2456 */
2332 pmd_populate(mm, pmd, pmd_pgtable(_pmd)); 2457 pmd_populate(mm, pmd, pmd_pgtable(_pmd));
2333 spin_unlock(&mm->page_table_lock); 2458 spin_unlock(pmd_ptl);
2334 anon_vma_unlock_write(vma->anon_vma); 2459 anon_vma_unlock_write(vma->anon_vma);
2335 goto out; 2460 goto out;
2336 } 2461 }
@@ -2341,7 +2466,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2341 */ 2466 */
2342 anon_vma_unlock_write(vma->anon_vma); 2467 anon_vma_unlock_write(vma->anon_vma);
2343 2468
2344 __collapse_huge_page_copy(pte, new_page, vma, address, ptl); 2469 __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
2345 pte_unmap(pte); 2470 pte_unmap(pte);
2346 __SetPageUptodate(new_page); 2471 __SetPageUptodate(new_page);
2347 pgtable = pmd_pgtable(_pmd); 2472 pgtable = pmd_pgtable(_pmd);
@@ -2356,13 +2481,13 @@ static void collapse_huge_page(struct mm_struct *mm,
2356 */ 2481 */
2357 smp_wmb(); 2482 smp_wmb();
2358 2483
2359 spin_lock(&mm->page_table_lock); 2484 spin_lock(pmd_ptl);
2360 BUG_ON(!pmd_none(*pmd)); 2485 BUG_ON(!pmd_none(*pmd));
2361 page_add_new_anon_rmap(new_page, vma, address); 2486 page_add_new_anon_rmap(new_page, vma, address);
2362 pgtable_trans_huge_deposit(mm, pmd, pgtable); 2487 pgtable_trans_huge_deposit(mm, pmd, pgtable);
2363 set_pmd_at(mm, address, pmd, _pmd); 2488 set_pmd_at(mm, address, pmd, _pmd);
2364 update_mmu_cache_pmd(vma, address, pmd); 2489 update_mmu_cache_pmd(vma, address, pmd);
2365 spin_unlock(&mm->page_table_lock); 2490 spin_unlock(pmd_ptl);
2366 2491
2367 *hpage = NULL; 2492 *hpage = NULL;
2368 2493
@@ -2397,6 +2522,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2397 if (pmd_trans_huge(*pmd)) 2522 if (pmd_trans_huge(*pmd))
2398 goto out; 2523 goto out;
2399 2524
2525 memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
2400 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2526 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2401 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; 2527 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
2402 _pte++, _address += PAGE_SIZE) { 2528 _pte++, _address += PAGE_SIZE) {
@@ -2413,12 +2539,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2413 if (unlikely(!page)) 2539 if (unlikely(!page))
2414 goto out_unmap; 2540 goto out_unmap;
2415 /* 2541 /*
2416 * Chose the node of the first page. This could 2542 * Record which node the original page is from and save this
2417 * be more sophisticated and look at more pages, 2543 * information to khugepaged_node_load[].
2418 * but isn't for now. 2544 * Khupaged will allocate hugepage from the node has the max
2545 * hit record.
2419 */ 2546 */
2420 if (node == NUMA_NO_NODE) 2547 node = page_to_nid(page);
2421 node = page_to_nid(page); 2548 khugepaged_node_load[node]++;
2422 VM_BUG_ON(PageCompound(page)); 2549 VM_BUG_ON(PageCompound(page));
2423 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2550 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
2424 goto out_unmap; 2551 goto out_unmap;
@@ -2433,9 +2560,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2433 ret = 1; 2560 ret = 1;
2434out_unmap: 2561out_unmap:
2435 pte_unmap_unlock(pte, ptl); 2562 pte_unmap_unlock(pte, ptl);
2436 if (ret) 2563 if (ret) {
2564 node = khugepaged_find_target_node();
2437 /* collapse_huge_page will return with the mmap_sem released */ 2565 /* collapse_huge_page will return with the mmap_sem released */
2438 collapse_huge_page(mm, address, hpage, vma, node); 2566 collapse_huge_page(mm, address, hpage, vma, node);
2567 }
2439out: 2568out:
2440 return ret; 2569 return ret;
2441} 2570}
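The hunks above replace khugepaged's old "node of the first page" heuristic with a per-node hit counter: khugepaged_node_load[] is zeroed before the PTE scan, each mapped page bumps the counter for its node, and khugepaged_find_target_node() picks the most-loaded node before collapse_huge_page() runs. A minimal userspace sketch of that selection policy (the array size, helper names and the main() driver are illustrative, not the kernel's):

#include <stdio.h>
#include <string.h>

#define MAX_NUMNODES 8
#define HPAGE_PMD_NR 512

static int node_load[MAX_NUMNODES];

/* Reset the histogram before scanning one PMD range, as the hunk does. */
static void scan_reset(void)
{
        memset(node_load, 0, sizeof(node_load));
}

/* Called once per mapped base page with the node it currently lives on. */
static void scan_record(int nid)
{
        node_load[nid]++;
}

/* Pick the node that contributed the most base pages. */
static int find_target_node(void)
{
        int nid, target = 0;

        for (nid = 1; nid < MAX_NUMNODES; nid++)
                if (node_load[nid] > node_load[target])
                        target = nid;
        return target;
}

int main(void)
{
        int i;

        scan_reset();
        /* Pretend 300 of the 512 PTEs point at node 1, the rest at node 0. */
        for (i = 0; i < HPAGE_PMD_NR; i++)
                scan_record(i < 300 ? 1 : 0);
        printf("collapse target node: %d\n", find_target_node());
        return 0;
}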
@@ -2687,6 +2816,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2687void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, 2816void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
2688 pmd_t *pmd) 2817 pmd_t *pmd)
2689{ 2818{
2819 spinlock_t *ptl;
2690 struct page *page; 2820 struct page *page;
2691 struct mm_struct *mm = vma->vm_mm; 2821 struct mm_struct *mm = vma->vm_mm;
2692 unsigned long haddr = address & HPAGE_PMD_MASK; 2822 unsigned long haddr = address & HPAGE_PMD_MASK;
@@ -2697,29 +2827,37 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
2697 2827
2698 mmun_start = haddr; 2828 mmun_start = haddr;
2699 mmun_end = haddr + HPAGE_PMD_SIZE; 2829 mmun_end = haddr + HPAGE_PMD_SIZE;
2830again:
2700 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2831 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2701 spin_lock(&mm->page_table_lock); 2832 ptl = pmd_lock(mm, pmd);
2702 if (unlikely(!pmd_trans_huge(*pmd))) { 2833 if (unlikely(!pmd_trans_huge(*pmd))) {
2703 spin_unlock(&mm->page_table_lock); 2834 spin_unlock(ptl);
2704 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2835 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2705 return; 2836 return;
2706 } 2837 }
2707 if (is_huge_zero_pmd(*pmd)) { 2838 if (is_huge_zero_pmd(*pmd)) {
2708 __split_huge_zero_page_pmd(vma, haddr, pmd); 2839 __split_huge_zero_page_pmd(vma, haddr, pmd);
2709 spin_unlock(&mm->page_table_lock); 2840 spin_unlock(ptl);
2710 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2841 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2711 return; 2842 return;
2712 } 2843 }
2713 page = pmd_page(*pmd); 2844 page = pmd_page(*pmd);
2714 VM_BUG_ON(!page_count(page)); 2845 VM_BUG_ON(!page_count(page));
2715 get_page(page); 2846 get_page(page);
2716 spin_unlock(&mm->page_table_lock); 2847 spin_unlock(ptl);
2717 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2848 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2718 2849
2719 split_huge_page(page); 2850 split_huge_page(page);
2720 2851
2721 put_page(page); 2852 put_page(page);
2722 BUG_ON(pmd_trans_huge(*pmd)); 2853
2854 /*
2855 * We don't always have down_write of mmap_sem here: a racing
2856 * do_huge_pmd_wp_page() might have copied-on-write to another
2857 * huge page before our split_huge_page() got the anon_vma lock.
2858 */
2859 if (unlikely(pmd_trans_huge(*pmd)))
2860 goto again;
2723} 2861}
2724 2862
2725void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, 2863void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
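Most of the remaining churn in this file, and in mm/hugetlb.c below, converts the single mm->page_table_lock into a lock looked up per page table: pmd_lock()/pmd_ptl/pte_ptl here, huge_pte_lock()/huge_pte_lockptr() in hugetlb. The sketch below models only the calling pattern, in userspace, with a small hashed table of mutexes standing in for the per-page-table spinlock (the hashing is purely illustrative; the kernel keeps the lock in, or referenced from, the page table's struct page):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NR_LOCKS 16

static pthread_mutex_t table_locks[NR_LOCKS];

/* Map a page-table address to "its" lock, mimicking pmd_lockptr(mm, pmd). */
static pthread_mutex_t *table_lockptr(const void *table)
{
        return &table_locks[((uintptr_t)table >> 6) % NR_LOCKS];
}

/* The converted calling pattern: lock the returned pointer, not a global,
 * so updates to different page tables no longer serialize on one lock. */
static void update_entry(long *table, int idx, long val)
{
        pthread_mutex_t *ptl = table_lockptr(table);

        pthread_mutex_lock(ptl);
        table[idx] = val;
        pthread_mutex_unlock(ptl);
}

int main(void)
{
        long a[4] = {0}, b[4] = {0};
        int i;

        for (i = 0; i < NR_LOCKS; i++)
                pthread_mutex_init(&table_locks[i], NULL);
        update_entry(a, 0, 1);
        update_entry(b, 3, 2);
        printf("a[0]=%ld b[3]=%ld updated under (likely) different locks\n",
               a[0], b[3]);
        return 0;
}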
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b49579c7f2a5..7d57af21f49e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -653,6 +653,7 @@ static void free_huge_page(struct page *page)
653 BUG_ON(page_count(page)); 653 BUG_ON(page_count(page));
654 BUG_ON(page_mapcount(page)); 654 BUG_ON(page_mapcount(page));
655 restore_reserve = PagePrivate(page); 655 restore_reserve = PagePrivate(page);
656 ClearPagePrivate(page);
656 657
657 spin_lock(&hugetlb_lock); 658 spin_lock(&hugetlb_lock);
658 hugetlb_cgroup_uncharge_page(hstate_index(h), 659 hugetlb_cgroup_uncharge_page(hstate_index(h),
@@ -695,8 +696,22 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
695 /* we rely on prep_new_huge_page to set the destructor */ 696 /* we rely on prep_new_huge_page to set the destructor */
696 set_compound_order(page, order); 697 set_compound_order(page, order);
697 __SetPageHead(page); 698 __SetPageHead(page);
699 __ClearPageReserved(page);
698 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 700 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
699 __SetPageTail(p); 701 __SetPageTail(p);
702 /*
703 * For gigantic hugepages allocated through bootmem at
704 * boot, it's safer to be consistent with the not-gigantic
705 * hugepages and clear the PG_reserved bit from all tail pages
706 * too. Otherwise drivers using get_user_pages() to access tail
707 * pages may get the reference counting wrong if they see
708 * PG_reserved set on a tail page (despite the head page not
709 * having PG_reserved set). Enforcing this consistency between
710 * head and tail pages allows drivers to optimize away a check
711 * on the head page when they need to know if put_page() is needed
712 * after get_user_pages().
713 */
714 __ClearPageReserved(p);
700 set_page_count(p, 0); 715 set_page_count(p, 0);
701 p->first_page = page; 716 p->first_page = page;
702 } 717 }
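The new comment's argument is that head and tail pages of a gigantic hugepage must agree on PG_reserved so a get_user_pages() caller can decide whether put_page() is needed by looking at the head page alone. A toy model of that invariant, with a two-field struct standing in for struct page (entirely illustrative; needs_put_page() is not a kernel helper):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct page {
        bool reserved;
        struct page *first_page;        /* head pointer, as in the hunk */
};

/* Mirror of the hunk: clear "reserved" on the head and on every tail page. */
static void prep_compound(struct page *pages, int nr)
{
        int i;

        pages[0].reserved = false;
        pages[0].first_page = &pages[0];
        for (i = 1; i < nr; i++) {
                pages[i].reserved = false;      /* the added __ClearPageReserved(p) */
                pages[i].first_page = &pages[0];
        }
}

/* A driver-style check: with head and tail kept consistent it may test the
 * head only, instead of re-checking every tail page it touched. */
static bool needs_put_page(struct page *p)
{
        return !p->first_page->reserved;
}

int main(void)
{
        struct page huge[8] = { 0 };
        int i;

        for (i = 0; i < 8; i++)
                huge[i].reserved = true;        /* as if handed out by bootmem */
        prep_compound(huge, 8);
        for (i = 0; i < 8; i++)
                assert(needs_put_page(&huge[i]));
        printf("head and tail pages agree: put_page() is needed\n");
        return 0;
}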
@@ -1329,9 +1344,9 @@ static void __init gather_bootmem_prealloc(void)
1329#else 1344#else
1330 page = virt_to_page(m); 1345 page = virt_to_page(m);
1331#endif 1346#endif
1332 __ClearPageReserved(page);
1333 WARN_ON(page_count(page) != 1); 1347 WARN_ON(page_count(page) != 1);
1334 prep_compound_huge_page(page, h->order); 1348 prep_compound_huge_page(page, h->order);
1349 WARN_ON(PageReserved(page));
1335 prep_new_huge_page(h, page, page_to_nid(page)); 1350 prep_new_huge_page(h, page, page_to_nid(page));
1336 /* 1351 /*
1337 * If we had gigantic hugepages allocated at boot time, we need 1352 * If we had gigantic hugepages allocated at boot time, we need
@@ -2361,6 +2376,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2361 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 2376 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
2362 2377
2363 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { 2378 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
2379 spinlock_t *src_ptl, *dst_ptl;
2364 src_pte = huge_pte_offset(src, addr); 2380 src_pte = huge_pte_offset(src, addr);
2365 if (!src_pte) 2381 if (!src_pte)
2366 continue; 2382 continue;
@@ -2372,8 +2388,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2372 if (dst_pte == src_pte) 2388 if (dst_pte == src_pte)
2373 continue; 2389 continue;
2374 2390
2375 spin_lock(&dst->page_table_lock); 2391 dst_ptl = huge_pte_lock(h, dst, dst_pte);
2376 spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); 2392 src_ptl = huge_pte_lockptr(h, src, src_pte);
2393 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
2377 if (!huge_pte_none(huge_ptep_get(src_pte))) { 2394 if (!huge_pte_none(huge_ptep_get(src_pte))) {
2378 if (cow) 2395 if (cow)
2379 huge_ptep_set_wrprotect(src, addr, src_pte); 2396 huge_ptep_set_wrprotect(src, addr, src_pte);
@@ -2383,8 +2400,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2383 page_dup_rmap(ptepage); 2400 page_dup_rmap(ptepage);
2384 set_huge_pte_at(dst, addr, dst_pte, entry); 2401 set_huge_pte_at(dst, addr, dst_pte, entry);
2385 } 2402 }
2386 spin_unlock(&src->page_table_lock); 2403 spin_unlock(src_ptl);
2387 spin_unlock(&dst->page_table_lock); 2404 spin_unlock(dst_ptl);
2388 } 2405 }
2389 return 0; 2406 return 0;
2390 2407
@@ -2427,6 +2444,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2427 unsigned long address; 2444 unsigned long address;
2428 pte_t *ptep; 2445 pte_t *ptep;
2429 pte_t pte; 2446 pte_t pte;
2447 spinlock_t *ptl;
2430 struct page *page; 2448 struct page *page;
2431 struct hstate *h = hstate_vma(vma); 2449 struct hstate *h = hstate_vma(vma);
2432 unsigned long sz = huge_page_size(h); 2450 unsigned long sz = huge_page_size(h);
@@ -2440,25 +2458,25 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2440 tlb_start_vma(tlb, vma); 2458 tlb_start_vma(tlb, vma);
2441 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2459 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2442again: 2460again:
2443 spin_lock(&mm->page_table_lock);
2444 for (address = start; address < end; address += sz) { 2461 for (address = start; address < end; address += sz) {
2445 ptep = huge_pte_offset(mm, address); 2462 ptep = huge_pte_offset(mm, address);
2446 if (!ptep) 2463 if (!ptep)
2447 continue; 2464 continue;
2448 2465
2466 ptl = huge_pte_lock(h, mm, ptep);
2449 if (huge_pmd_unshare(mm, &address, ptep)) 2467 if (huge_pmd_unshare(mm, &address, ptep))
2450 continue; 2468 goto unlock;
2451 2469
2452 pte = huge_ptep_get(ptep); 2470 pte = huge_ptep_get(ptep);
2453 if (huge_pte_none(pte)) 2471 if (huge_pte_none(pte))
2454 continue; 2472 goto unlock;
2455 2473
2456 /* 2474 /*
2457 * HWPoisoned hugepage is already unmapped and dropped reference 2475 * HWPoisoned hugepage is already unmapped and dropped reference
2458 */ 2476 */
2459 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 2477 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
2460 huge_pte_clear(mm, address, ptep); 2478 huge_pte_clear(mm, address, ptep);
2461 continue; 2479 goto unlock;
2462 } 2480 }
2463 2481
2464 page = pte_page(pte); 2482 page = pte_page(pte);
@@ -2469,7 +2487,7 @@ again:
2469 */ 2487 */
2470 if (ref_page) { 2488 if (ref_page) {
2471 if (page != ref_page) 2489 if (page != ref_page)
2472 continue; 2490 goto unlock;
2473 2491
2474 /* 2492 /*
2475 * Mark the VMA as having unmapped its page so that 2493 * Mark the VMA as having unmapped its page so that
@@ -2486,13 +2504,18 @@ again:
2486 2504
2487 page_remove_rmap(page); 2505 page_remove_rmap(page);
2488 force_flush = !__tlb_remove_page(tlb, page); 2506 force_flush = !__tlb_remove_page(tlb, page);
2489 if (force_flush) 2507 if (force_flush) {
2508 spin_unlock(ptl);
2490 break; 2509 break;
2510 }
2491 /* Bail out after unmapping reference page if supplied */ 2511 /* Bail out after unmapping reference page if supplied */
2492 if (ref_page) 2512 if (ref_page) {
2513 spin_unlock(ptl);
2493 break; 2514 break;
2515 }
2516unlock:
2517 spin_unlock(ptl);
2494 } 2518 }
2495 spin_unlock(&mm->page_table_lock);
2496 /* 2519 /*
2497 * mmu_gather ran out of room to batch pages, we break out of 2520 * mmu_gather ran out of room to batch pages, we break out of
2498 * the PTE lock to avoid doing the potential expensive TLB invalidate 2521 * the PTE lock to avoid doing the potential expensive TLB invalidate
@@ -2598,7 +2621,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2598 */ 2621 */
2599static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 2622static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2600 unsigned long address, pte_t *ptep, pte_t pte, 2623 unsigned long address, pte_t *ptep, pte_t pte,
2601 struct page *pagecache_page) 2624 struct page *pagecache_page, spinlock_t *ptl)
2602{ 2625{
2603 struct hstate *h = hstate_vma(vma); 2626 struct hstate *h = hstate_vma(vma);
2604 struct page *old_page, *new_page; 2627 struct page *old_page, *new_page;
@@ -2632,8 +2655,8 @@ retry_avoidcopy:
2632 2655
2633 page_cache_get(old_page); 2656 page_cache_get(old_page);
2634 2657
2635 /* Drop page_table_lock as buddy allocator may be called */ 2658 /* Drop page table lock as buddy allocator may be called */
2636 spin_unlock(&mm->page_table_lock); 2659 spin_unlock(ptl);
2637 new_page = alloc_huge_page(vma, address, outside_reserve); 2660 new_page = alloc_huge_page(vma, address, outside_reserve);
2638 2661
2639 if (IS_ERR(new_page)) { 2662 if (IS_ERR(new_page)) {
@@ -2651,13 +2674,13 @@ retry_avoidcopy:
2651 BUG_ON(huge_pte_none(pte)); 2674 BUG_ON(huge_pte_none(pte));
2652 if (unmap_ref_private(mm, vma, old_page, address)) { 2675 if (unmap_ref_private(mm, vma, old_page, address)) {
2653 BUG_ON(huge_pte_none(pte)); 2676 BUG_ON(huge_pte_none(pte));
2654 spin_lock(&mm->page_table_lock); 2677 spin_lock(ptl);
2655 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2678 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2656 if (likely(pte_same(huge_ptep_get(ptep), pte))) 2679 if (likely(pte_same(huge_ptep_get(ptep), pte)))
2657 goto retry_avoidcopy; 2680 goto retry_avoidcopy;
2658 /* 2681 /*
2659 * race occurs while re-acquiring page_table_lock, and 2682 * race occurs while re-acquiring page table
2660 * our job is done. 2683 * lock, and our job is done.
2661 */ 2684 */
2662 return 0; 2685 return 0;
2663 } 2686 }
@@ -2665,7 +2688,7 @@ retry_avoidcopy:
2665 } 2688 }
2666 2689
2667 /* Caller expects lock to be held */ 2690 /* Caller expects lock to be held */
2668 spin_lock(&mm->page_table_lock); 2691 spin_lock(ptl);
2669 if (err == -ENOMEM) 2692 if (err == -ENOMEM)
2670 return VM_FAULT_OOM; 2693 return VM_FAULT_OOM;
2671 else 2694 else
@@ -2680,7 +2703,7 @@ retry_avoidcopy:
2680 page_cache_release(new_page); 2703 page_cache_release(new_page);
2681 page_cache_release(old_page); 2704 page_cache_release(old_page);
2682 /* Caller expects lock to be held */ 2705 /* Caller expects lock to be held */
2683 spin_lock(&mm->page_table_lock); 2706 spin_lock(ptl);
2684 return VM_FAULT_OOM; 2707 return VM_FAULT_OOM;
2685 } 2708 }
2686 2709
@@ -2692,10 +2715,10 @@ retry_avoidcopy:
2692 mmun_end = mmun_start + huge_page_size(h); 2715 mmun_end = mmun_start + huge_page_size(h);
2693 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2716 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2694 /* 2717 /*
2695 * Retake the page_table_lock to check for racing updates 2718 * Retake the page table lock to check for racing updates
2696 * before the page tables are altered 2719 * before the page tables are altered
2697 */ 2720 */
2698 spin_lock(&mm->page_table_lock); 2721 spin_lock(ptl);
2699 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2722 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2700 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2723 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2701 ClearPagePrivate(new_page); 2724 ClearPagePrivate(new_page);
@@ -2709,13 +2732,13 @@ retry_avoidcopy:
2709 /* Make the old page be freed below */ 2732 /* Make the old page be freed below */
2710 new_page = old_page; 2733 new_page = old_page;
2711 } 2734 }
2712 spin_unlock(&mm->page_table_lock); 2735 spin_unlock(ptl);
2713 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2736 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2714 page_cache_release(new_page); 2737 page_cache_release(new_page);
2715 page_cache_release(old_page); 2738 page_cache_release(old_page);
2716 2739
2717 /* Caller expects lock to be held */ 2740 /* Caller expects lock to be held */
2718 spin_lock(&mm->page_table_lock); 2741 spin_lock(ptl);
2719 return 0; 2742 return 0;
2720} 2743}
2721 2744
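hugetlb_cow() now takes the pte-level lock from its caller, but these hunks keep the long-standing discipline: drop the lock before alloc_huge_page() (which may sleep), retake it afterwards, and only touch the page table if huge_ptep_get() still matches the value observed earlier. A compact userspace sketch of that drop/retake/revalidate pattern, with a mutex and a plain value standing in for the lock and the PTE (all names are mine):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;
static unsigned long pte;               /* stand-in for the huge PTE */

/* Stands in for alloc_huge_page(), which may sleep and so must run unlocked. */
static unsigned long *alloc_new_page(void)
{
        return malloc(sizeof(unsigned long));
}

/* Returns 0 on success, -1 if a racing update was detected and the caller
 * should retry with the freshly observed value.  Called with ptl held. */
static int cow_one(unsigned long observed)
{
        unsigned long *new_page;

        pthread_mutex_unlock(&ptl);     /* drop the lock around the allocation */
        new_page = alloc_new_page();
        pthread_mutex_lock(&ptl);       /* caller expects lock to be held */

        if (pte != observed) {          /* the pte_same() check after retaking */
                free(new_page);
                return -1;
        }
        pte = (unsigned long)(uintptr_t)new_page;       /* install the copy */
        return 0;
}

int main(void)
{
        pthread_mutex_lock(&ptl);
        while (cow_one(pte))
                ;                       /* retry until no race is observed */
        pthread_mutex_unlock(&ptl);
        printf("pte now refers to the private copy: %lx\n", pte);
        return 0;
}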
@@ -2763,6 +2786,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2763 struct page *page; 2786 struct page *page;
2764 struct address_space *mapping; 2787 struct address_space *mapping;
2765 pte_t new_pte; 2788 pte_t new_pte;
2789 spinlock_t *ptl;
2766 2790
2767 /* 2791 /*
2768 * Currently, we are forced to kill the process in the event the 2792 * Currently, we are forced to kill the process in the event the
@@ -2849,7 +2873,8 @@ retry:
2849 goto backout_unlocked; 2873 goto backout_unlocked;
2850 } 2874 }
2851 2875
2852 spin_lock(&mm->page_table_lock); 2876 ptl = huge_pte_lockptr(h, mm, ptep);
2877 spin_lock(ptl);
2853 size = i_size_read(mapping->host) >> huge_page_shift(h); 2878 size = i_size_read(mapping->host) >> huge_page_shift(h);
2854 if (idx >= size) 2879 if (idx >= size)
2855 goto backout; 2880 goto backout;
@@ -2870,16 +2895,16 @@ retry:
2870 2895
2871 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 2896 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
2872 /* Optimization, do the COW without a second fault */ 2897 /* Optimization, do the COW without a second fault */
2873 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); 2898 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
2874 } 2899 }
2875 2900
2876 spin_unlock(&mm->page_table_lock); 2901 spin_unlock(ptl);
2877 unlock_page(page); 2902 unlock_page(page);
2878out: 2903out:
2879 return ret; 2904 return ret;
2880 2905
2881backout: 2906backout:
2882 spin_unlock(&mm->page_table_lock); 2907 spin_unlock(ptl);
2883backout_unlocked: 2908backout_unlocked:
2884 unlock_page(page); 2909 unlock_page(page);
2885 put_page(page); 2910 put_page(page);
@@ -2891,6 +2916,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2891{ 2916{
2892 pte_t *ptep; 2917 pte_t *ptep;
2893 pte_t entry; 2918 pte_t entry;
2919 spinlock_t *ptl;
2894 int ret; 2920 int ret;
2895 struct page *page = NULL; 2921 struct page *page = NULL;
2896 struct page *pagecache_page = NULL; 2922 struct page *pagecache_page = NULL;
@@ -2903,7 +2929,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2903 if (ptep) { 2929 if (ptep) {
2904 entry = huge_ptep_get(ptep); 2930 entry = huge_ptep_get(ptep);
2905 if (unlikely(is_hugetlb_entry_migration(entry))) { 2931 if (unlikely(is_hugetlb_entry_migration(entry))) {
2906 migration_entry_wait_huge(mm, ptep); 2932 migration_entry_wait_huge(vma, mm, ptep);
2907 return 0; 2933 return 0;
2908 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2934 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2909 return VM_FAULT_HWPOISON_LARGE | 2935 return VM_FAULT_HWPOISON_LARGE |
@@ -2959,17 +2985,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2959 if (page != pagecache_page) 2985 if (page != pagecache_page)
2960 lock_page(page); 2986 lock_page(page);
2961 2987
2962 spin_lock(&mm->page_table_lock); 2988 ptl = huge_pte_lockptr(h, mm, ptep);
2989 spin_lock(ptl);
2963 /* Check for a racing update before calling hugetlb_cow */ 2990 /* Check for a racing update before calling hugetlb_cow */
2964 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 2991 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
2965 goto out_page_table_lock; 2992 goto out_ptl;
2966 2993
2967 2994
2968 if (flags & FAULT_FLAG_WRITE) { 2995 if (flags & FAULT_FLAG_WRITE) {
2969 if (!huge_pte_write(entry)) { 2996 if (!huge_pte_write(entry)) {
2970 ret = hugetlb_cow(mm, vma, address, ptep, entry, 2997 ret = hugetlb_cow(mm, vma, address, ptep, entry,
2971 pagecache_page); 2998 pagecache_page, ptl);
2972 goto out_page_table_lock; 2999 goto out_ptl;
2973 } 3000 }
2974 entry = huge_pte_mkdirty(entry); 3001 entry = huge_pte_mkdirty(entry);
2975 } 3002 }
@@ -2978,8 +3005,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2978 flags & FAULT_FLAG_WRITE)) 3005 flags & FAULT_FLAG_WRITE))
2979 update_mmu_cache(vma, address, ptep); 3006 update_mmu_cache(vma, address, ptep);
2980 3007
2981out_page_table_lock: 3008out_ptl:
2982 spin_unlock(&mm->page_table_lock); 3009 spin_unlock(ptl);
2983 3010
2984 if (pagecache_page) { 3011 if (pagecache_page) {
2985 unlock_page(pagecache_page); 3012 unlock_page(pagecache_page);
@@ -3005,9 +3032,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3005 unsigned long remainder = *nr_pages; 3032 unsigned long remainder = *nr_pages;
3006 struct hstate *h = hstate_vma(vma); 3033 struct hstate *h = hstate_vma(vma);
3007 3034
3008 spin_lock(&mm->page_table_lock);
3009 while (vaddr < vma->vm_end && remainder) { 3035 while (vaddr < vma->vm_end && remainder) {
3010 pte_t *pte; 3036 pte_t *pte;
3037 spinlock_t *ptl = NULL;
3011 int absent; 3038 int absent;
3012 struct page *page; 3039 struct page *page;
3013 3040
@@ -3015,8 +3042,12 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3015 * Some archs (sparc64, sh*) have multiple pte_ts to 3042 * Some archs (sparc64, sh*) have multiple pte_ts to
3016 * each hugepage. We have to make sure we get the 3043 * each hugepage. We have to make sure we get the
3017 * first, for the page indexing below to work. 3044 * first, for the page indexing below to work.
3045 *
3046 * Note that page table lock is not held when pte is null.
3018 */ 3047 */
3019 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); 3048 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
3049 if (pte)
3050 ptl = huge_pte_lock(h, mm, pte);
3020 absent = !pte || huge_pte_none(huge_ptep_get(pte)); 3051 absent = !pte || huge_pte_none(huge_ptep_get(pte));
3021 3052
3022 /* 3053 /*
@@ -3028,6 +3059,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3028 */ 3059 */
3029 if (absent && (flags & FOLL_DUMP) && 3060 if (absent && (flags & FOLL_DUMP) &&
3030 !hugetlbfs_pagecache_present(h, vma, vaddr)) { 3061 !hugetlbfs_pagecache_present(h, vma, vaddr)) {
3062 if (pte)
3063 spin_unlock(ptl);
3031 remainder = 0; 3064 remainder = 0;
3032 break; 3065 break;
3033 } 3066 }
@@ -3047,10 +3080,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3047 !huge_pte_write(huge_ptep_get(pte)))) { 3080 !huge_pte_write(huge_ptep_get(pte)))) {
3048 int ret; 3081 int ret;
3049 3082
3050 spin_unlock(&mm->page_table_lock); 3083 if (pte)
3084 spin_unlock(ptl);
3051 ret = hugetlb_fault(mm, vma, vaddr, 3085 ret = hugetlb_fault(mm, vma, vaddr,
3052 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); 3086 (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
3053 spin_lock(&mm->page_table_lock);
3054 if (!(ret & VM_FAULT_ERROR)) 3087 if (!(ret & VM_FAULT_ERROR))
3055 continue; 3088 continue;
3056 3089
@@ -3081,8 +3114,8 @@ same_page:
3081 */ 3114 */
3082 goto same_page; 3115 goto same_page;
3083 } 3116 }
3117 spin_unlock(ptl);
3084 } 3118 }
3085 spin_unlock(&mm->page_table_lock);
3086 *nr_pages = remainder; 3119 *nr_pages = remainder;
3087 *position = vaddr; 3120 *position = vaddr;
3088 3121
@@ -3103,13 +3136,15 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3103 flush_cache_range(vma, address, end); 3136 flush_cache_range(vma, address, end);
3104 3137
3105 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 3138 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
3106 spin_lock(&mm->page_table_lock);
3107 for (; address < end; address += huge_page_size(h)) { 3139 for (; address < end; address += huge_page_size(h)) {
3140 spinlock_t *ptl;
3108 ptep = huge_pte_offset(mm, address); 3141 ptep = huge_pte_offset(mm, address);
3109 if (!ptep) 3142 if (!ptep)
3110 continue; 3143 continue;
3144 ptl = huge_pte_lock(h, mm, ptep);
3111 if (huge_pmd_unshare(mm, &address, ptep)) { 3145 if (huge_pmd_unshare(mm, &address, ptep)) {
3112 pages++; 3146 pages++;
3147 spin_unlock(ptl);
3113 continue; 3148 continue;
3114 } 3149 }
3115 if (!huge_pte_none(huge_ptep_get(ptep))) { 3150 if (!huge_pte_none(huge_ptep_get(ptep))) {
@@ -3119,8 +3154,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3119 set_huge_pte_at(mm, address, ptep, pte); 3154 set_huge_pte_at(mm, address, ptep, pte);
3120 pages++; 3155 pages++;
3121 } 3156 }
3157 spin_unlock(ptl);
3122 } 3158 }
3123 spin_unlock(&mm->page_table_lock);
3124 /* 3159 /*
3125 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare 3160 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
3126 * may have cleared our pud entry and done put_page on the page table: 3161 * may have cleared our pud entry and done put_page on the page table:
@@ -3283,6 +3318,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3283 unsigned long saddr; 3318 unsigned long saddr;
3284 pte_t *spte = NULL; 3319 pte_t *spte = NULL;
3285 pte_t *pte; 3320 pte_t *pte;
3321 spinlock_t *ptl;
3286 3322
3287 if (!vma_shareable(vma, addr)) 3323 if (!vma_shareable(vma, addr))
3288 return (pte_t *)pmd_alloc(mm, pud, addr); 3324 return (pte_t *)pmd_alloc(mm, pud, addr);
@@ -3305,13 +3341,14 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3305 if (!spte) 3341 if (!spte)
3306 goto out; 3342 goto out;
3307 3343
3308 spin_lock(&mm->page_table_lock); 3344 ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
3345 spin_lock(ptl);
3309 if (pud_none(*pud)) 3346 if (pud_none(*pud))
3310 pud_populate(mm, pud, 3347 pud_populate(mm, pud,
3311 (pmd_t *)((unsigned long)spte & PAGE_MASK)); 3348 (pmd_t *)((unsigned long)spte & PAGE_MASK));
3312 else 3349 else
3313 put_page(virt_to_page(spte)); 3350 put_page(virt_to_page(spte));
3314 spin_unlock(&mm->page_table_lock); 3351 spin_unlock(ptl);
3315out: 3352out:
3316 pte = (pte_t *)pmd_alloc(mm, pud, addr); 3353 pte = (pte_t *)pmd_alloc(mm, pud, addr);
3317 mutex_unlock(&mapping->i_mmap_mutex); 3354 mutex_unlock(&mapping->i_mmap_mutex);
@@ -3325,7 +3362,7 @@ out:
3325 * indicated by page_count > 1, unmap is achieved by clearing pud and 3362 * indicated by page_count > 1, unmap is achieved by clearing pud and
3326 * decrementing the ref count. If count == 1, the pte page is not shared. 3363 * decrementing the ref count. If count == 1, the pte page is not shared.
3327 * 3364 *
3328 * called with vma->vm_mm->page_table_lock held. 3365 * called with page table lock held.
3329 * 3366 *
3330 * returns: 1 successfully unmapped a shared pte page 3367 * returns: 1 successfully unmapped a shared pte page
3331 * 0 the underlying pte page is not shared, or it is the last user 3368 * 0 the underlying pte page is not shared, or it is the last user
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index afc2daa91c60..4c84678371eb 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -20,8 +20,6 @@ static int hwpoison_inject(void *data, u64 val)
20 if (!capable(CAP_SYS_ADMIN)) 20 if (!capable(CAP_SYS_ADMIN))
21 return -EPERM; 21 return -EPERM;
22 22
23 if (!hwpoison_filter_enable)
24 goto inject;
25 if (!pfn_valid(pfn)) 23 if (!pfn_valid(pfn))
26 return -ENXIO; 24 return -ENXIO;
27 25
@@ -33,6 +31,9 @@ static int hwpoison_inject(void *data, u64 val)
33 if (!get_page_unless_zero(hpage)) 31 if (!get_page_unless_zero(hpage))
34 return 0; 32 return 0;
35 33
34 if (!hwpoison_filter_enable)
35 goto inject;
36
36 if (!PageLRU(p) && !PageHuge(p)) 37 if (!PageLRU(p) && !PageHuge(p))
37 shake_page(p, 0); 38 shake_page(p, 0);
38 /* 39 /*
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e126b0ef9ad2..31f01c5011e5 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -753,7 +753,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
753 } 753 }
754 754
755 spin_lock_irqsave(&object->lock, flags); 755 spin_lock_irqsave(&object->lock, flags);
756 if (ptr + size > object->pointer + object->size) { 756 if (size == SIZE_MAX) {
757 size = object->pointer + object->size - ptr;
758 } else if (ptr + size > object->pointer + object->size) {
757 kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); 759 kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
758 dump_object_info(object); 760 dump_object_info(object);
759 kmem_cache_free(scan_area_cache, area); 761 kmem_cache_free(scan_area_cache, area);
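The kmemleak change gives size == SIZE_MAX a defined meaning: scan from ptr to the end of the enclosing object rather than warning about an oversized scan area. A small sketch of that clamping rule, with a (pointer, size) pair standing in for kmemleak's object (scan_len() is a hypothetical helper, not the kernel API):

#include <stdint.h>
#include <stdio.h>

struct object {
        uintptr_t pointer;
        size_t size;
};

/* Return the effective scan length, or 0 if the request overruns the object
 * (the case kmemleak warns about).  SIZE_MAX means "to the end of object". */
static size_t scan_len(const struct object *obj, uintptr_t ptr, size_t size)
{
        if (size == SIZE_MAX)
                return obj->pointer + obj->size - ptr;
        if (ptr + size > obj->pointer + obj->size)
                return 0;
        return size;
}

int main(void)
{
        struct object obj = { .pointer = 0x1000, .size = 256 };

        printf("%zu\n", scan_len(&obj, 0x1040, SIZE_MAX));      /* 192 */
        printf("%zu\n", scan_len(&obj, 0x1040, 512));           /* 0: too large */
        return 0;
}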
diff --git a/mm/ksm.c b/mm/ksm.c
index 0bea2b262a47..175fff79dc95 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2309,8 +2309,8 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj,
2309 * Allocate stable and unstable together: 2309 * Allocate stable and unstable together:
2310 * MAXSMP NODES_SHIFT 10 will use 16kB. 2310 * MAXSMP NODES_SHIFT 10 will use 16kB.
2311 */ 2311 */
2312 buf = kcalloc(nr_node_ids + nr_node_ids, 2312 buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
2313 sizeof(*buf), GFP_KERNEL | __GFP_ZERO); 2313 GFP_KERNEL);
2314 /* Let us assume that RB_ROOT is NULL is zero */ 2314 /* Let us assume that RB_ROOT is NULL is zero */
2315 if (!buf) 2315 if (!buf)
2316 err = -ENOMEM; 2316 err = -ENOMEM;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 72467914b856..72f9decb0104 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -81,8 +81,9 @@ restart:
81 * decrement nr_to_walk first so that we don't livelock if we 81 * decrement nr_to_walk first so that we don't livelock if we
82 * get stuck on large numbers of LRU_RETRY items 82 * get stuck on large numbers of LRU_RETRY items
83 */ 83 */
84 if (--(*nr_to_walk) == 0) 84 if (!*nr_to_walk)
85 break; 85 break;
86 --*nr_to_walk;
86 87
87 ret = isolate(item, &nlru->lock, cb_arg); 88 ret = isolate(item, &nlru->lock, cb_arg);
88 switch (ret) { 89 switch (ret) {
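The list_lru hunk moves the budget bookkeeping so that *nr_to_walk is tested before the isolate callback and decremented afterwards; with the old pre-decrement a budget of N visited only N-1 items, and a budget of 1 visited none. A tiny sketch contrasting the two orderings (plain counters, no LRU state):

#include <stdio.h>

/* Old ordering: decrement before doing the work. */
static unsigned long walk_old(unsigned long budget, unsigned long items)
{
        unsigned long done = 0;

        while (items--) {
                if (--budget == 0)
                        break;
                done++;                 /* isolate() would run here */
        }
        return done;
}

/* New ordering: check first, then decrement, then do the work. */
static unsigned long walk_new(unsigned long budget, unsigned long items)
{
        unsigned long done = 0;

        while (items--) {
                if (!budget)
                        break;
                budget--;
                done++;                 /* isolate() runs for the last item too */
        }
        return done;
}

int main(void)
{
        printf("budget 3: old walks %lu, new walks %lu\n",
               walk_old(3, 10), walk_new(3, 10));       /* 2 vs 3 */
        printf("budget 1: old walks %lu, new walks %lu\n",
               walk_old(1, 10), walk_new(1, 10));       /* 0 vs 1 */
        return 0;
}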
diff --git a/mm/madvise.c b/mm/madvise.c
index 6975bc812542..539eeb96b323 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -343,10 +343,11 @@ static long madvise_remove(struct vm_area_struct *vma,
343 */ 343 */
344static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) 344static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
345{ 345{
346 struct page *p;
346 if (!capable(CAP_SYS_ADMIN)) 347 if (!capable(CAP_SYS_ADMIN))
347 return -EPERM; 348 return -EPERM;
348 for (; start < end; start += PAGE_SIZE) { 349 for (; start < end; start += PAGE_SIZE <<
349 struct page *p; 350 compound_order(compound_head(p))) {
350 int ret; 351 int ret;
351 352
352 ret = get_user_pages_fast(start, 1, 0, &p); 353 ret = get_user_pages_fast(start, 1, 0, &p);
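With struct page *p hoisted out of the loop, madvise_hwpoison() can now advance start by the full size of the compound page it just handled, PAGE_SIZE << compound_order(compound_head(p)), instead of stepping one base page at a time. The stride itself is plain arithmetic, sketched below (values assume 4 KB base pages; order 9 corresponds to an x86 2 MB hugepage):

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Advance by the whole compound page: order 0 for a base page, order 9 for
 * a 512-page hugepage, and so on. */
static unsigned long next_start(unsigned long start, unsigned int order)
{
        return start + (PAGE_SIZE << order);
}

int main(void)
{
        printf("base page: next poison target at %#lx\n", next_start(0x100000, 0));
        printf("2MB page:  next poison target at %#lx\n", next_start(0x100000, 9));
        return 0;
}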
diff --git a/mm/memblock.c b/mm/memblock.c
index 0ac412a0a7ee..53e477bb5558 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,6 +20,8 @@
20#include <linux/seq_file.h> 20#include <linux/seq_file.h>
21#include <linux/memblock.h> 21#include <linux/memblock.h>
22 22
23#include <asm-generic/sections.h>
24
23static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 25static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
24static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; 26static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
25 27
@@ -32,6 +34,7 @@ struct memblock memblock __initdata_memblock = {
32 .reserved.cnt = 1, /* empty dummy entry */ 34 .reserved.cnt = 1, /* empty dummy entry */
33 .reserved.max = INIT_MEMBLOCK_REGIONS, 35 .reserved.max = INIT_MEMBLOCK_REGIONS,
34 36
37 .bottom_up = false,
35 .current_limit = MEMBLOCK_ALLOC_ANYWHERE, 38 .current_limit = MEMBLOCK_ALLOC_ANYWHERE,
36}; 39};
37 40
@@ -82,6 +85,73 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
82 return (i < type->cnt) ? i : -1; 85 return (i < type->cnt) ? i : -1;
83} 86}
84 87
88/*
89 * __memblock_find_range_bottom_up - find free area utility in bottom-up
90 * @start: start of candidate range
91 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
92 * @size: size of free area to find
93 * @align: alignment of free area to find
94 * @nid: nid of the free area to find, %MAX_NUMNODES for any node
95 *
96 * Utility called from memblock_find_in_range_node(), find free area bottom-up.
97 *
98 * RETURNS:
99 * Found address on success, 0 on failure.
100 */
101static phys_addr_t __init_memblock
102__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
103 phys_addr_t size, phys_addr_t align, int nid)
104{
105 phys_addr_t this_start, this_end, cand;
106 u64 i;
107
108 for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
109 this_start = clamp(this_start, start, end);
110 this_end = clamp(this_end, start, end);
111
112 cand = round_up(this_start, align);
113 if (cand < this_end && this_end - cand >= size)
114 return cand;
115 }
116
117 return 0;
118}
119
120/**
121 * __memblock_find_range_top_down - find free area utility, in top-down
122 * @start: start of candidate range
123 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
124 * @size: size of free area to find
125 * @align: alignment of free area to find
126 * @nid: nid of the free area to find, %MAX_NUMNODES for any node
127 *
128 * Utility called from memblock_find_in_range_node(), find free area top-down.
129 *
130 * RETURNS:
131 * Found address on success, 0 on failure.
132 */
133static phys_addr_t __init_memblock
134__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
135 phys_addr_t size, phys_addr_t align, int nid)
136{
137 phys_addr_t this_start, this_end, cand;
138 u64 i;
139
140 for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
141 this_start = clamp(this_start, start, end);
142 this_end = clamp(this_end, start, end);
143
144 if (this_end < size)
145 continue;
146
147 cand = round_down(this_end - size, align);
148 if (cand >= this_start)
149 return cand;
150 }
151
152 return 0;
153}
154
85/** 155/**
86 * memblock_find_in_range_node - find free area in given range and node 156 * memblock_find_in_range_node - find free area in given range and node
87 * @start: start of candidate range 157 * @start: start of candidate range
@@ -92,15 +162,23 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
92 * 162 *
93 * Find @size free area aligned to @align in the specified range and node. 163 * Find @size free area aligned to @align in the specified range and node.
94 * 164 *
165 * When allocation direction is bottom-up, the @start should be greater
166 * than the end of the kernel image. Otherwise, it will be trimmed. The
167 * reason is that we want the bottom-up allocation just near the kernel
168 * image so it is highly likely that the allocated memory and the kernel
169 * will reside in the same node.
170 *
171 * If bottom-up allocation fails, we will try to allocate memory top-down.
172 *
95 * RETURNS: 173 * RETURNS:
96 * Found address on success, %0 on failure. 174 * Found address on success, 0 on failure.
97 */ 175 */
98phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, 176phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
99 phys_addr_t end, phys_addr_t size, 177 phys_addr_t end, phys_addr_t size,
100 phys_addr_t align, int nid) 178 phys_addr_t align, int nid)
101{ 179{
102 phys_addr_t this_start, this_end, cand; 180 int ret;
103 u64 i; 181 phys_addr_t kernel_end;
104 182
105 /* pump up @end */ 183 /* pump up @end */
106 if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 184 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
@@ -109,19 +187,39 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
109 /* avoid allocating the first page */ 187 /* avoid allocating the first page */
110 start = max_t(phys_addr_t, start, PAGE_SIZE); 188 start = max_t(phys_addr_t, start, PAGE_SIZE);
111 end = max(start, end); 189 end = max(start, end);
190 kernel_end = __pa_symbol(_end);
112 191
113 for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { 192 /*
114 this_start = clamp(this_start, start, end); 193 * try bottom-up allocation only when bottom-up mode
115 this_end = clamp(this_end, start, end); 194 * is set and @end is above the kernel image.
195 */
196 if (memblock_bottom_up() && end > kernel_end) {
197 phys_addr_t bottom_up_start;
116 198
117 if (this_end < size) 199 /* make sure we will allocate above the kernel */
118 continue; 200 bottom_up_start = max(start, kernel_end);
119 201
120 cand = round_down(this_end - size, align); 202 /* ok, try bottom-up allocation first */
121 if (cand >= this_start) 203 ret = __memblock_find_range_bottom_up(bottom_up_start, end,
122 return cand; 204 size, align, nid);
205 if (ret)
206 return ret;
207
208 /*
209 * we always limit bottom-up allocation above the kernel,
210 * but top-down allocation doesn't have the limit, so
211 * retrying top-down allocation may succeed when bottom-up
212 * allocation failed.
213 *
214 * bottom-up allocation is expected to fail very rarely,
215 * so we use WARN_ONCE() here to see the stack trace if
216 * that happens.
217 */
218 WARN_ONCE(1, "memblock: bottom-up allocation failed, "
219 "memory hotunplug may be affected\n");
123 } 220 }
124 return 0; 221
222 return __memblock_find_range_top_down(start, end, size, align, nid);
125} 223}
126 224
127/** 225/**
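memblock_find_in_range_node() now tries a bottom-up search first when memblock_bottom_up() reports bottom-up mode and the range extends above the kernel image, clamping the start to _end so the allocation lands just above (and therefore usually on the same node as) the kernel, and falls back to the original top-down search otherwise. A simplified userspace sketch of that fallback policy over a fixed list of free ranges (alignment and NUMA-node filtering are omitted; all names and addresses are illustrative):

#include <stdio.h>

#define NR_RANGES 3

/* Free physical ranges, lowest first, standing in for memblock's list. */
static const struct { unsigned long start, end; } free_ranges[NR_RANGES] = {
        { 0x00100000, 0x00800000 },
        { 0x01000000, 0x02000000 },
        { 0x40000000, 0x48000000 },
};

static unsigned long find_bottom_up(unsigned long lo, unsigned long hi,
                                    unsigned long size)
{
        int i;

        for (i = 0; i < NR_RANGES; i++) {
                unsigned long s = free_ranges[i].start > lo ? free_ranges[i].start : lo;
                unsigned long e = free_ranges[i].end < hi ? free_ranges[i].end : hi;

                if (s < e && e - s >= size)
                        return s;               /* lowest suitable address */
        }
        return 0;
}

static unsigned long find_top_down(unsigned long lo, unsigned long hi,
                                   unsigned long size)
{
        int i;

        for (i = NR_RANGES - 1; i >= 0; i--) {
                unsigned long s = free_ranges[i].start > lo ? free_ranges[i].start : lo;
                unsigned long e = free_ranges[i].end < hi ? free_ranges[i].end : hi;

                if (s < e && e - s >= size)
                        return e - size;        /* highest suitable address */
        }
        return 0;
}

/* The policy the hunk adds: try bottom-up just above the kernel image,
 * fall back to the old top-down search if that finds nothing. */
static unsigned long find_range(unsigned long lo, unsigned long hi,
                                unsigned long size, int bottom_up,
                                unsigned long kernel_end)
{
        if (bottom_up && hi > kernel_end) {
                unsigned long start = lo > kernel_end ? lo : kernel_end;
                unsigned long ret = find_bottom_up(start, hi, size);

                if (ret)
                        return ret;
        }
        return find_top_down(lo, hi, size);
}

int main(void)
{
        unsigned long kernel_end = 0x00c00000;  /* pretend _end is at 12 MB */

        printf("top-down:  %#lx\n", find_range(0, ~0UL, 0x100000, 0, kernel_end));
        printf("bottom-up: %#lx\n", find_range(0, ~0UL, 0x100000, 1, kernel_end));
        return 0;
}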
@@ -134,7 +232,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
134 * Find @size free area aligned to @align in the specified range. 232 * Find @size free area aligned to @align in the specified range.
135 * 233 *
136 * RETURNS: 234 * RETURNS:
137 * Found address on success, %0 on failure. 235 * Found address on success, 0 on failure.
138 */ 236 */
139phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, 237phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
140 phys_addr_t end, phys_addr_t size, 238 phys_addr_t end, phys_addr_t size,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5ff3ce13029..f1a0ae6e11b8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -39,6 +39,7 @@
39#include <linux/limits.h> 39#include <linux/limits.h>
40#include <linux/export.h> 40#include <linux/export.h>
41#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/rbtree.h>
42#include <linux/slab.h> 43#include <linux/slab.h>
43#include <linux/swap.h> 44#include <linux/swap.h>
44#include <linux/swapops.h> 45#include <linux/swapops.h>
@@ -53,10 +54,12 @@
53#include <linux/page_cgroup.h> 54#include <linux/page_cgroup.h>
54#include <linux/cpu.h> 55#include <linux/cpu.h>
55#include <linux/oom.h> 56#include <linux/oom.h>
57#include <linux/lockdep.h>
56#include "internal.h" 58#include "internal.h"
57#include <net/sock.h> 59#include <net/sock.h>
58#include <net/ip.h> 60#include <net/ip.h>
59#include <net/tcp_memcontrol.h> 61#include <net/tcp_memcontrol.h>
62#include "slab.h"
60 63
61#include <asm/uaccess.h> 64#include <asm/uaccess.h>
62 65
@@ -160,6 +163,10 @@ struct mem_cgroup_per_zone {
160 163
161 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 164 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
162 165
166 struct rb_node tree_node; /* RB tree node */
167 unsigned long long usage_in_excess;/* Set to the value by which */
168 /* the soft limit is exceeded*/
169 bool on_tree;
163 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 170 struct mem_cgroup *memcg; /* Back pointer, we cannot */
164 /* use container_of */ 171 /* use container_of */
165}; 172};
@@ -168,6 +175,26 @@ struct mem_cgroup_per_node {
168 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 175 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
169}; 176};
170 177
178/*
179 * Cgroups above their limits are maintained in a RB-Tree, independent of
180 * their hierarchy representation
181 */
182
183struct mem_cgroup_tree_per_zone {
184 struct rb_root rb_root;
185 spinlock_t lock;
186};
187
188struct mem_cgroup_tree_per_node {
189 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
190};
191
192struct mem_cgroup_tree {
193 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
194};
195
196static struct mem_cgroup_tree soft_limit_tree __read_mostly;
197
171struct mem_cgroup_threshold { 198struct mem_cgroup_threshold {
172 struct eventfd_ctx *eventfd; 199 struct eventfd_ctx *eventfd;
173 u64 threshold; 200 u64 threshold;
@@ -286,7 +313,7 @@ struct mem_cgroup {
286 313
287 atomic_t dead_count; 314 atomic_t dead_count;
288#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 315#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
289 struct tcp_memcontrol tcp_mem; 316 struct cg_proto tcp_mem;
290#endif 317#endif
291#if defined(CONFIG_MEMCG_KMEM) 318#if defined(CONFIG_MEMCG_KMEM)
292 /* analogous to slab_common's slab_caches list. per-memcg */ 319 /* analogous to slab_common's slab_caches list. per-memcg */
@@ -303,22 +330,6 @@ struct mem_cgroup {
303 atomic_t numainfo_events; 330 atomic_t numainfo_events;
304 atomic_t numainfo_updating; 331 atomic_t numainfo_updating;
305#endif 332#endif
306 /*
307 * Protects soft_contributed transitions.
308 * See mem_cgroup_update_soft_limit
309 */
310 spinlock_t soft_lock;
311
312 /*
313 * If true then this group has increased parents' children_in_excess
314 * when it got over the soft limit.
315 * When a group falls bellow the soft limit, parents' children_in_excess
316 * is decreased and soft_contributed changed to false.
317 */
318 bool soft_contributed;
319
320 /* Number of children that are in soft limit excess */
321 atomic_t children_in_excess;
322 333
323 struct mem_cgroup_per_node *nodeinfo[0]; 334 struct mem_cgroup_per_node *nodeinfo[0];
324 /* WARNING: nodeinfo must be the last member here */ 335 /* WARNING: nodeinfo must be the last member here */
@@ -422,6 +433,7 @@ static bool move_file(void)
422 * limit reclaim to prevent infinite loops, if they ever occur. 433 * limit reclaim to prevent infinite loops, if they ever occur.
423 */ 434 */
424#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 435#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
436#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
425 437
426enum charge_type { 438enum charge_type {
427 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 439 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -488,6 +500,29 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
488 return (memcg == root_mem_cgroup); 500 return (memcg == root_mem_cgroup);
489} 501}
490 502
503/*
504 * We restrict the id in the range of [1, 65535], so it can fit into
505 * an unsigned short.
506 */
507#define MEM_CGROUP_ID_MAX USHRT_MAX
508
509static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
510{
511 /*
512 * The ID of the root cgroup is 0, but memcg treats 0 as an
513 * invalid ID, so we return (cgroup_id + 1).
514 */
515 return memcg->css.cgroup->id + 1;
516}
517
518static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
519{
520 struct cgroup_subsys_state *css;
521
522 css = css_from_id(id - 1, &mem_cgroup_subsys);
523 return mem_cgroup_from_css(css);
524}
525
491/* Writing them here to avoid exposing memcg's inner layout */ 526/* Writing them here to avoid exposing memcg's inner layout */
492#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) 527#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
493 528
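mem_cgroup_id() maps a memcg to an unsigned short by returning its cgroup id plus one, so that 0 stays available as the "no memcg" value and ids never exceed MEM_CGROUP_ID_MAX (USHRT_MAX); mem_cgroup_from_id() undoes the offset via css_from_id(). A minimal sketch of just that encode/decode convention (the struct layout and helper names are illustrative):

#include <assert.h>
#include <stdio.h>

#define MEM_CGROUP_ID_MAX 65535         /* USHRT_MAX: ids must fit a u16 */

struct mem_cgroup { int cgroup_id; };

/* cgroup ids start at 0 (the root), but 0 is reserved for "no memcg",
 * so the stored id is cgroup_id + 1. */
static unsigned short memcg_to_id(const struct mem_cgroup *memcg)
{
        return (unsigned short)(memcg->cgroup_id + 1);
}

static int id_to_cgroup(unsigned short id)
{
        assert(id != 0);                /* 0 is the "invalid" encoding */
        return id - 1;
}

int main(void)
{
        struct mem_cgroup root = { .cgroup_id = 0 };

        printf("root encodes as %u, decodes back to cgroup %d\n",
               (unsigned)memcg_to_id(&root), id_to_cgroup(memcg_to_id(&root)));
        return 0;
}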
@@ -540,13 +575,13 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
540 if (!memcg || mem_cgroup_is_root(memcg)) 575 if (!memcg || mem_cgroup_is_root(memcg))
541 return NULL; 576 return NULL;
542 577
543 return &memcg->tcp_mem.cg_proto; 578 return &memcg->tcp_mem;
544} 579}
545EXPORT_SYMBOL(tcp_proto_cgroup); 580EXPORT_SYMBOL(tcp_proto_cgroup);
546 581
547static void disarm_sock_keys(struct mem_cgroup *memcg) 582static void disarm_sock_keys(struct mem_cgroup *memcg)
548{ 583{
549 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) 584 if (!memcg_proto_activated(&memcg->tcp_mem))
550 return; 585 return;
551 static_key_slow_dec(&memcg_socket_limit_enabled); 586 static_key_slow_dec(&memcg_socket_limit_enabled);
552} 587}
@@ -559,16 +594,11 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
559#ifdef CONFIG_MEMCG_KMEM 594#ifdef CONFIG_MEMCG_KMEM
560/* 595/*
561 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 596 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
562 * There are two main reasons for not using the css_id for this: 597 * The main reason for not using cgroup id for this:
563 * 1) this works better in sparse environments, where we have a lot of memcgs, 598 * this works better in sparse environments, where we have a lot of memcgs,
564 * but only a few kmem-limited. Or also, if we have, for instance, 200 599 * but only a few kmem-limited. Or also, if we have, for instance, 200
565 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 600 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
566 * 200 entry array for that. 601 * 200 entry array for that.
567 *
568 * 2) In order not to violate the cgroup API, we would like to do all memory
569 * allocation in ->create(). At that point, we haven't yet allocated the
570 * css_id. Having a separate index prevents us from messing with the cgroup
571 * core for this
572 * 602 *
573 * The current size of the caches array is stored in 603 * The current size of the caches array is stored in
574 * memcg_limited_groups_array_size. It will double each time we have to 604 * memcg_limited_groups_array_size. It will double each time we have to
@@ -583,14 +613,14 @@ int memcg_limited_groups_array_size;
583 * cgroups is a reasonable guess. In the future, it could be a parameter or 613 * cgroups is a reasonable guess. In the future, it could be a parameter or
584 * tunable, but that is strictly not necessary. 614 * tunable, but that is strictly not necessary.
585 * 615 *
586 * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get 616 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
587 * this constant directly from cgroup, but it is understandable that this is 617 * this constant directly from cgroup, but it is understandable that this is
588 * better kept as an internal representation in cgroup.c. In any case, the 618 * better kept as an internal representation in cgroup.c. In any case, the
589 * css_id space is not getting any smaller, and we don't have to necessarily 619 * cgrp_id space is not getting any smaller, and we don't have to necessarily
590 * increase ours as well if it increases. 620 * increase ours as well if it increases.
591 */ 621 */
592#define MEMCG_CACHES_MIN_SIZE 4 622#define MEMCG_CACHES_MIN_SIZE 4
593#define MEMCG_CACHES_MAX_SIZE 65535 623#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
594 624
595/* 625/*
596 * A lot of the calls to the cache allocation functions are expected to be 626 * A lot of the calls to the cache allocation functions are expected to be
@@ -648,6 +678,164 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
648 return mem_cgroup_zoneinfo(memcg, nid, zid); 678 return mem_cgroup_zoneinfo(memcg, nid, zid);
649} 679}
650 680
681static struct mem_cgroup_tree_per_zone *
682soft_limit_tree_node_zone(int nid, int zid)
683{
684 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
685}
686
687static struct mem_cgroup_tree_per_zone *
688soft_limit_tree_from_page(struct page *page)
689{
690 int nid = page_to_nid(page);
691 int zid = page_zonenum(page);
692
693 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
694}
695
696static void
697__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
698 struct mem_cgroup_per_zone *mz,
699 struct mem_cgroup_tree_per_zone *mctz,
700 unsigned long long new_usage_in_excess)
701{
702 struct rb_node **p = &mctz->rb_root.rb_node;
703 struct rb_node *parent = NULL;
704 struct mem_cgroup_per_zone *mz_node;
705
706 if (mz->on_tree)
707 return;
708
709 mz->usage_in_excess = new_usage_in_excess;
710 if (!mz->usage_in_excess)
711 return;
712 while (*p) {
713 parent = *p;
714 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
715 tree_node);
716 if (mz->usage_in_excess < mz_node->usage_in_excess)
717 p = &(*p)->rb_left;
718 /*
719 * We can't avoid mem cgroups that are over their soft
720 * limit by the same amount
721 */
722 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
723 p = &(*p)->rb_right;
724 }
725 rb_link_node(&mz->tree_node, parent, p);
726 rb_insert_color(&mz->tree_node, &mctz->rb_root);
727 mz->on_tree = true;
728}
729
730static void
731__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
732 struct mem_cgroup_per_zone *mz,
733 struct mem_cgroup_tree_per_zone *mctz)
734{
735 if (!mz->on_tree)
736 return;
737 rb_erase(&mz->tree_node, &mctz->rb_root);
738 mz->on_tree = false;
739}
740
741static void
742mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
743 struct mem_cgroup_per_zone *mz,
744 struct mem_cgroup_tree_per_zone *mctz)
745{
746 spin_lock(&mctz->lock);
747 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
748 spin_unlock(&mctz->lock);
749}
750
751
752static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
753{
754 unsigned long long excess;
755 struct mem_cgroup_per_zone *mz;
756 struct mem_cgroup_tree_per_zone *mctz;
757 int nid = page_to_nid(page);
758 int zid = page_zonenum(page);
759 mctz = soft_limit_tree_from_page(page);
760
761 /*
762 * Necessary to update all ancestors when hierarchy is used,
763 * because their event counter is not touched.
764 */
765 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
766 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
767 excess = res_counter_soft_limit_excess(&memcg->res);
768 /*
769 * We have to update the tree if mz is on RB-tree or
770 * mem is over its softlimit.
771 */
772 if (excess || mz->on_tree) {
773 spin_lock(&mctz->lock);
774 /* if on-tree, remove it */
775 if (mz->on_tree)
776 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
777 /*
778 * Insert again. mz->usage_in_excess will be updated.
779 * If excess is 0, no tree ops.
780 */
781 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
782 spin_unlock(&mctz->lock);
783 }
784 }
785}
786
787static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
788{
789 int node, zone;
790 struct mem_cgroup_per_zone *mz;
791 struct mem_cgroup_tree_per_zone *mctz;
792
793 for_each_node(node) {
794 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
795 mz = mem_cgroup_zoneinfo(memcg, node, zone);
796 mctz = soft_limit_tree_node_zone(node, zone);
797 mem_cgroup_remove_exceeded(memcg, mz, mctz);
798 }
799 }
800}
801
802static struct mem_cgroup_per_zone *
803__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
804{
805 struct rb_node *rightmost = NULL;
806 struct mem_cgroup_per_zone *mz;
807
808retry:
809 mz = NULL;
810 rightmost = rb_last(&mctz->rb_root);
811 if (!rightmost)
812 goto done; /* Nothing to reclaim from */
813
814 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
815 /*
816 * Remove the node now but someone else can add it back,
817 * we will add it back at the end of reclaim to its correct
818 * position in the tree.
819 */
820 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
821 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
822 !css_tryget(&mz->memcg->css))
823 goto retry;
824done:
825 return mz;
826}
827
828static struct mem_cgroup_per_zone *
829mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
830{
831 struct mem_cgroup_per_zone *mz;
832
833 spin_lock(&mctz->lock);
834 mz = __mem_cgroup_largest_soft_limit_node(mctz);
835 spin_unlock(&mctz->lock);
836 return mz;
837}
838
651/* 839/*
652 * Implementation Note: reading percpu statistics for memcg. 840 * Implementation Note: reading percpu statistics for memcg.
653 * 841 *
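The block of additions above restores the per-node/zone soft-limit tree: every group over its soft limit sits in an rbtree ordered by usage_in_excess, mem_cgroup_update_tree() re-inserts a group whenever its excess changes, and soft-limit reclaim repeatedly pulls the rightmost (largest-excess) entry and drops it from the tree while it is being reclaimed. The sketch below keeps only that ordering policy, using a flat array instead of an rbtree and ignoring the mctz->lock locking (all names are illustrative):

#include <stdio.h>

#define MAX_GROUPS 16

/* One entry per group that is over its soft limit, keyed by how far over it
 * is ("usage_in_excess" in the hunk).  The kernel keeps these in an rbtree
 * per node/zone; a flat array is enough to show the policy. */
static struct { int id; unsigned long long excess; } tree[MAX_GROUPS];
static int nr_on_tree;

/* Mirror of mem_cgroup_update_tree(): drop any stale entry, then insert
 * again only if the group is still in excess. */
static void update_tree(int id, unsigned long long excess)
{
        int i;

        for (i = 0; i < nr_on_tree; i++)
                if (tree[i].id == id) {
                        tree[i] = tree[--nr_on_tree];
                        break;
                }
        if (excess) {
                tree[nr_on_tree].id = id;
                tree[nr_on_tree].excess = excess;
                nr_on_tree++;
        }
}

/* Mirror of mem_cgroup_largest_soft_limit_node(): soft-limit reclaim always
 * starts with the worst offender and removes it from the tree; it is
 * reinserted later with its updated excess. */
static int pop_largest(void)
{
        int i, best = -1;

        for (i = 0; i < nr_on_tree; i++)
                if (best < 0 || tree[i].excess > tree[best].excess)
                        best = i;
        if (best < 0)
                return -1;
        i = tree[best].id;
        tree[best] = tree[--nr_on_tree];
        return i;
}

int main(void)
{
        update_tree(1, 4096);
        update_tree(2, 1 << 20);
        update_tree(3, 0);              /* not in excess: stays off the tree */
        printf("reclaim from group %d first\n", pop_largest());        /* 2 */
        printf("then from group %d\n", pop_largest());                 /* 1 */
        return 0;
}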
@@ -698,6 +886,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
698 unsigned long val = 0; 886 unsigned long val = 0;
699 int cpu; 887 int cpu;
700 888
889 get_online_cpus();
701 for_each_online_cpu(cpu) 890 for_each_online_cpu(cpu)
702 val += per_cpu(memcg->stat->events[idx], cpu); 891 val += per_cpu(memcg->stat->events[idx], cpu);
703#ifdef CONFIG_HOTPLUG_CPU 892#ifdef CONFIG_HOTPLUG_CPU
@@ -705,6 +894,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
705 val += memcg->nocpu_base.events[idx]; 894 val += memcg->nocpu_base.events[idx];
706 spin_unlock(&memcg->pcp_counter_lock); 895 spin_unlock(&memcg->pcp_counter_lock);
707#endif 896#endif
897 put_online_cpus();
708 return val; 898 return val;
709} 899}
710 900
@@ -822,48 +1012,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
822} 1012}
823 1013
824/* 1014/*
825 * Called from rate-limited memcg_check_events when enough
826 * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
827 * that all the parents up the hierarchy will be notified that this group
828 * is in excess or that it is not in excess anymore. mmecg->soft_contributed
829 * makes the transition a single action whenever the state flips from one to
830 * the other.
831 */
832static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
833{
834 unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
835 struct mem_cgroup *parent = memcg;
836 int delta = 0;
837
838 spin_lock(&memcg->soft_lock);
839 if (excess) {
840 if (!memcg->soft_contributed) {
841 delta = 1;
842 memcg->soft_contributed = true;
843 }
844 } else {
845 if (memcg->soft_contributed) {
846 delta = -1;
847 memcg->soft_contributed = false;
848 }
849 }
850
851 /*
852 * Necessary to update all ancestors when hierarchy is used
853 * because their event counter is not touched.
854 * We track children even outside the hierarchy for the root
855 * cgroup because tree walk starting at root should visit
856 * all cgroups and we want to prevent from pointless tree
857 * walk if no children is below the limit.
858 */
859 while (delta && (parent = parent_mem_cgroup(parent)))
860 atomic_add(delta, &parent->children_in_excess);
861 if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
862 atomic_add(delta, &root_mem_cgroup->children_in_excess);
863 spin_unlock(&memcg->soft_lock);
864}
865
866/*
867 * Check events in order. 1015 * Check events in order.
868 * 1016 *
869 */ 1017 */
@@ -886,7 +1034,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
886 1034
887 mem_cgroup_threshold(memcg); 1035 mem_cgroup_threshold(memcg);
888 if (unlikely(do_softlimit)) 1036 if (unlikely(do_softlimit))
889 mem_cgroup_update_soft_limit(memcg); 1037 mem_cgroup_update_tree(memcg, page);
890#if MAX_NUMNODES > 1 1038#if MAX_NUMNODES > 1
891 if (unlikely(do_numainfo)) 1039 if (unlikely(do_numainfo))
892 atomic_inc(&memcg->numainfo_events); 1040 atomic_inc(&memcg->numainfo_events);
@@ -929,15 +1077,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
929 return memcg; 1077 return memcg;
930} 1078}
931 1079
932static enum mem_cgroup_filter_t
933mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
934 mem_cgroup_iter_filter cond)
935{
936 if (!cond)
937 return VISIT;
938 return cond(memcg, root);
939}
940
941/* 1080/*
942 * Returns a next (in a pre-order walk) alive memcg (with elevated css 1081 * Returns a next (in a pre-order walk) alive memcg (with elevated css
943 * ref. count) or NULL if the whole root's subtree has been visited. 1082 * ref. count) or NULL if the whole root's subtree has been visited.
@@ -945,7 +1084,7 @@ mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
945 * helper function to be used by mem_cgroup_iter 1084 * helper function to be used by mem_cgroup_iter
946 */ 1085 */
947static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, 1086static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
948 struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond) 1087 struct mem_cgroup *last_visited)
949{ 1088{
950 struct cgroup_subsys_state *prev_css, *next_css; 1089 struct cgroup_subsys_state *prev_css, *next_css;
951 1090
@@ -963,31 +1102,11 @@ skip_node:
963 if (next_css) { 1102 if (next_css) {
964 struct mem_cgroup *mem = mem_cgroup_from_css(next_css); 1103 struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
965 1104
966 switch (mem_cgroup_filter(mem, root, cond)) { 1105 if (css_tryget(&mem->css))
967 case SKIP: 1106 return mem;
1107 else {
968 prev_css = next_css; 1108 prev_css = next_css;
969 goto skip_node; 1109 goto skip_node;
970 case SKIP_TREE:
971 if (mem == root)
972 return NULL;
973 /*
974 * css_rightmost_descendant is not an optimal way to
975 * skip through a subtree (especially for imbalanced
976 * trees leaning to right) but that's what we have right
977 * now. More effective solution would be traversing
978 * right-up for first non-NULL without calling
979 * css_next_descendant_pre afterwards.
980 */
981 prev_css = css_rightmost_descendant(next_css);
982 goto skip_node;
983 case VISIT:
984 if (css_tryget(&mem->css))
985 return mem;
986 else {
987 prev_css = next_css;
988 goto skip_node;
989 }
990 break;
991 } 1110 }
992 } 1111 }
993 1112
@@ -1051,7 +1170,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1051 * @root: hierarchy root 1170 * @root: hierarchy root
1052 * @prev: previously returned memcg, NULL on first invocation 1171 * @prev: previously returned memcg, NULL on first invocation
1053 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1172 * @reclaim: cookie for shared reclaim walks, NULL for full walks
1054 * @cond: filter for visited nodes, NULL for no filter
1055 * 1173 *
1056 * Returns references to children of the hierarchy below @root, or 1174 * Returns references to children of the hierarchy below @root, or
1057 * @root itself, or %NULL after a full round-trip. 1175 * @root itself, or %NULL after a full round-trip.
@@ -1064,18 +1182,15 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1064 * divide up the memcgs in the hierarchy among all concurrent 1182 * divide up the memcgs in the hierarchy among all concurrent
1065 * reclaimers operating on the same zone and priority. 1183 * reclaimers operating on the same zone and priority.
1066 */ 1184 */
1067struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root, 1185struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1068 struct mem_cgroup *prev, 1186 struct mem_cgroup *prev,
1069 struct mem_cgroup_reclaim_cookie *reclaim, 1187 struct mem_cgroup_reclaim_cookie *reclaim)
1070 mem_cgroup_iter_filter cond)
1071{ 1188{
1072 struct mem_cgroup *memcg = NULL; 1189 struct mem_cgroup *memcg = NULL;
1073 struct mem_cgroup *last_visited = NULL; 1190 struct mem_cgroup *last_visited = NULL;
1074 1191
1075 if (mem_cgroup_disabled()) { 1192 if (mem_cgroup_disabled())
1076 /* first call must return non-NULL, second return NULL */ 1193 return NULL;
1077 return (struct mem_cgroup *)(unsigned long)!prev;
1078 }
1079 1194
1080 if (!root) 1195 if (!root)
1081 root = root_mem_cgroup; 1196 root = root_mem_cgroup;
@@ -1086,9 +1201,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
1086 if (!root->use_hierarchy && root != root_mem_cgroup) { 1201 if (!root->use_hierarchy && root != root_mem_cgroup) {
1087 if (prev) 1202 if (prev)
1088 goto out_css_put; 1203 goto out_css_put;
1089 if (mem_cgroup_filter(root, root, cond) == VISIT) 1204 return root;
1090 return root;
1091 return NULL;
1092 } 1205 }
1093 1206
1094 rcu_read_lock(); 1207 rcu_read_lock();
@@ -1111,7 +1224,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
1111 last_visited = mem_cgroup_iter_load(iter, root, &seq); 1224 last_visited = mem_cgroup_iter_load(iter, root, &seq);
1112 } 1225 }
1113 1226
1114 memcg = __mem_cgroup_iter_next(root, last_visited, cond); 1227 memcg = __mem_cgroup_iter_next(root, last_visited);
1115 1228
1116 if (reclaim) { 1229 if (reclaim) {
1117 mem_cgroup_iter_update(iter, last_visited, memcg, seq); 1230 mem_cgroup_iter_update(iter, last_visited, memcg, seq);
@@ -1122,11 +1235,7 @@ struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
1122 reclaim->generation = iter->generation; 1235 reclaim->generation = iter->generation;
1123 } 1236 }
1124 1237
1125 /* 1238 if (prev && !memcg)
1126 * We have finished the whole tree walk or no group has been
1127 * visited because filter told us to skip the root node.
1128 */
1129 if (!memcg && (prev || (cond && !last_visited)))
1130 goto out_unlock; 1239 goto out_unlock;
1131 } 1240 }
1132out_unlock: 1241out_unlock:
@@ -1318,7 +1427,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1318 return true; 1427 return true;
1319 if (!root_memcg->use_hierarchy || !memcg) 1428 if (!root_memcg->use_hierarchy || !memcg)
1320 return false; 1429 return false;
1321 return css_is_ancestor(&memcg->css, &root_memcg->css); 1430 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
1322} 1431}
1323 1432
1324static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1433static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
@@ -1767,7 +1876,6 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1767 return total; 1876 return total;
1768} 1877}
1769 1878
1770#if MAX_NUMNODES > 1
1771/** 1879/**
1772 * test_mem_cgroup_node_reclaimable 1880 * test_mem_cgroup_node_reclaimable
1773 * @memcg: the target memcg 1881 * @memcg: the target memcg
@@ -1790,6 +1898,7 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1790 return false; 1898 return false;
1791 1899
1792} 1900}
1901#if MAX_NUMNODES > 1
1793 1902
1794/* 1903/*
1795 * Always updating the nodemask is not very good - even if we have an empty 1904 * Always updating the nodemask is not very good - even if we have an empty
@@ -1857,52 +1966,112 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1857 return node; 1966 return node;
1858} 1967}
1859 1968
1969/*
1970 * Check all nodes whether they contain reclaimable pages or not.
1971 * For a quick scan, we make use of scan_nodes. This will allow us to skip
1972 * unused nodes. But scan_nodes is lazily updated and may not contain
1973 * enough new information. We need to double check.
1974 */
1975static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1976{
1977 int nid;
1978
1979 /*
1980 * quick check... making use of scan_nodes.
1981 * We can skip unused nodes.
1982 */
1983 if (!nodes_empty(memcg->scan_nodes)) {
1984 for (nid = first_node(memcg->scan_nodes);
1985 nid < MAX_NUMNODES;
1986 nid = next_node(nid, memcg->scan_nodes)) {
1987
1988 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1989 return true;
1990 }
1991 }
1992 /*
1993 * Check rest of nodes.
1994 */
1995 for_each_node_state(nid, N_MEMORY) {
1996 if (node_isset(nid, memcg->scan_nodes))
1997 continue;
1998 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1999 return true;
2000 }
2001 return false;
2002}
2003
1860#else 2004#else
1861int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 2005int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1862{ 2006{
1863 return 0; 2007 return 0;
1864} 2008}
1865 2009
1866#endif 2010static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1867
1868/*
1869 * A group is eligible for the soft limit reclaim under the given root
1870 * hierarchy if
1871 * a) it is over its soft limit
1872 * b) any parent up the hierarchy is over its soft limit
1873 *
1874 * If the given group doesn't have any children over the limit then it
1875 * doesn't make any sense to iterate its subtree.
1876 */
1877enum mem_cgroup_filter_t
1878mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
1879 struct mem_cgroup *root)
1880{ 2011{
1881 struct mem_cgroup *parent; 2012 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1882 2013}
1883 if (!memcg) 2014#endif
1884 memcg = root_mem_cgroup;
1885 parent = memcg;
1886
1887 if (res_counter_soft_limit_excess(&memcg->res))
1888 return VISIT;
1889 2015
1890 /* 2016static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1891 * If any parent up to the root in the hierarchy is over its soft limit 2017 struct zone *zone,
1892 * then we have to obey and reclaim from this group as well. 2018 gfp_t gfp_mask,
1893 */ 2019 unsigned long *total_scanned)
1894 while ((parent = parent_mem_cgroup(parent))) { 2020{
1895 if (res_counter_soft_limit_excess(&parent->res)) 2021 struct mem_cgroup *victim = NULL;
1896 return VISIT; 2022 int total = 0;
1897 if (parent == root) 2023 int loop = 0;
2024 unsigned long excess;
2025 unsigned long nr_scanned;
2026 struct mem_cgroup_reclaim_cookie reclaim = {
2027 .zone = zone,
2028 .priority = 0,
2029 };
2030
2031 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
2032
2033 while (1) {
2034 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
2035 if (!victim) {
2036 loop++;
2037 if (loop >= 2) {
2038 /*
2039 * If we have not been able to reclaim
2040 * anything, it might be because there are
2041 * no reclaimable pages under this hierarchy
2042 */
2043 if (!total)
2044 break;
2045 /*
2046 * We want to do more targeted reclaim.
2047 * excess >> 2 is not too excessive, so we neither
2048 * reclaim too much nor so little that we keep
2049 * coming back to reclaim from this cgroup
2050 */
2051 if (total >= (excess >> 2) ||
2052 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
2053 break;
2054 }
2055 continue;
2056 }
2057 if (!mem_cgroup_reclaimable(victim, false))
2058 continue;
2059 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
2060 zone, &nr_scanned);
2061 *total_scanned += nr_scanned;
2062 if (!res_counter_soft_limit_excess(&root_memcg->res))
1898 break; 2063 break;
1899 } 2064 }
1900 2065 mem_cgroup_iter_break(root_memcg, victim);
1901 if (!atomic_read(&memcg->children_in_excess)) 2066 return total;
1902 return SKIP_TREE;
1903 return SKIP;
1904} 2067}
1905 2068
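The reclaim budget used by mem_cgroup_soft_reclaim() above is easy to misread, so a quick worked example may help (the figures are invented; only the arithmetic follows from the code): excess is the root group's soft-limit overage converted to pages, and the walk stops once roughly a quarter of it has been reclaimed or the loop limit is hit. For a hierarchy that is 32 MB over its soft limit with 4 KB pages:

	excess      = 32 MB / 4 KB   = 8192 pages
	stop target = excess >> 2    = 2048 pages reclaimed
	              (or loop > MEM_CGROUP_MAX_RECLAIM_LOOPS, whichever comes first)

so a single soft-limit pass reclaims only part of the overage rather than all of it at once.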
2069#ifdef CONFIG_LOCKDEP
2070static struct lockdep_map memcg_oom_lock_dep_map = {
2071 .name = "memcg_oom_lock",
2072};
2073#endif
2074
1906static DEFINE_SPINLOCK(memcg_oom_lock); 2075static DEFINE_SPINLOCK(memcg_oom_lock);
1907 2076
1908/* 2077/*
@@ -1940,7 +2109,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1940 } 2109 }
1941 iter->oom_lock = false; 2110 iter->oom_lock = false;
1942 } 2111 }
1943 } 2112 } else
2113 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1944 2114
1945 spin_unlock(&memcg_oom_lock); 2115 spin_unlock(&memcg_oom_lock);
1946 2116
@@ -1952,6 +2122,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1952 struct mem_cgroup *iter; 2122 struct mem_cgroup *iter;
1953 2123
1954 spin_lock(&memcg_oom_lock); 2124 spin_lock(&memcg_oom_lock);
2125 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1955 for_each_mem_cgroup_tree(iter, memcg) 2126 for_each_mem_cgroup_tree(iter, memcg)
1956 iter->oom_lock = false; 2127 iter->oom_lock = false;
1957 spin_unlock(&memcg_oom_lock); 2128 spin_unlock(&memcg_oom_lock);
@@ -2018,110 +2189,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
2018 memcg_wakeup_oom(memcg); 2189 memcg_wakeup_oom(memcg);
2019} 2190}
2020 2191
2021/*
2022 * try to call OOM killer
2023 */
2024static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 2192static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
2025{ 2193{
2026 bool locked;
2027 int wakeups;
2028
2029 if (!current->memcg_oom.may_oom) 2194 if (!current->memcg_oom.may_oom)
2030 return; 2195 return;
2031
2032 current->memcg_oom.in_memcg_oom = 1;
2033
2034 /* 2196 /*
2035 * As with any blocking lock, a contender needs to start 2197 * We are in the middle of the charge context here, so we
2036 * listening for wakeups before attempting the trylock, 2198 * don't want to block when potentially sitting on a callstack
2037 * otherwise it can miss the wakeup from the unlock and sleep 2199 * that holds all kinds of filesystem and mm locks.
2038 * indefinitely. This is just open-coded because our locking 2200 *
2039 * is so particular to memcg hierarchies. 2201 * Also, the caller may handle a failed allocation gracefully
2202 * (like optional page cache readahead) and so an OOM killer
2203 * invocation might not even be necessary.
2204 *
2205 * That's why we don't do anything here except remember the
2206 * OOM context and then deal with it at the end of the page
2207 * fault when the stack is unwound, the locks are released,
2208 * and when we know whether the fault was overall successful.
2040 */ 2209 */
2041 wakeups = atomic_read(&memcg->oom_wakeups); 2210 css_get(&memcg->css);
2042 mem_cgroup_mark_under_oom(memcg); 2211 current->memcg_oom.memcg = memcg;
2043 2212 current->memcg_oom.gfp_mask = mask;
2044 locked = mem_cgroup_oom_trylock(memcg); 2213 current->memcg_oom.order = order;
2045
2046 if (locked)
2047 mem_cgroup_oom_notify(memcg);
2048
2049 if (locked && !memcg->oom_kill_disable) {
2050 mem_cgroup_unmark_under_oom(memcg);
2051 mem_cgroup_out_of_memory(memcg, mask, order);
2052 mem_cgroup_oom_unlock(memcg);
2053 /*
2054 * There is no guarantee that an OOM-lock contender
2055 * sees the wakeups triggered by the OOM kill
2056 * uncharges. Wake any sleepers explicitely.
2057 */
2058 memcg_oom_recover(memcg);
2059 } else {
2060 /*
2061 * A system call can just return -ENOMEM, but if this
2062 * is a page fault and somebody else is handling the
2063 * OOM already, we need to sleep on the OOM waitqueue
2064 * for this memcg until the situation is resolved.
2065 * Which can take some time because it might be
2066 * handled by a userspace task.
2067 *
2068 * However, this is the charge context, which means
2069 * that we may sit on a large call stack and hold
2070 * various filesystem locks, the mmap_sem etc. and we
2071 * don't want the OOM handler to deadlock on them
2072 * while we sit here and wait. Store the current OOM
2073 * context in the task_struct, then return -ENOMEM.
2074 * At the end of the page fault handler, with the
2075 * stack unwound, pagefault_out_of_memory() will check
2076 * back with us by calling
2077 * mem_cgroup_oom_synchronize(), possibly putting the
2078 * task to sleep.
2079 */
2080 current->memcg_oom.oom_locked = locked;
2081 current->memcg_oom.wakeups = wakeups;
2082 css_get(&memcg->css);
2083 current->memcg_oom.wait_on_memcg = memcg;
2084 }
2085} 2214}
2086 2215
2087/** 2216/**
2088 * mem_cgroup_oom_synchronize - complete memcg OOM handling 2217 * mem_cgroup_oom_synchronize - complete memcg OOM handling
2218 * @handle: actually kill/wait or just clean up the OOM state
2089 * 2219 *
2090 * This has to be called at the end of a page fault if the the memcg 2220 * This has to be called at the end of a page fault if the memcg OOM
2091 * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. 2221 * handler was enabled.
2092 * 2222 *
2093 * Memcg supports userspace OOM handling, so failed allocations must 2223 * Memcg supports userspace OOM handling where failed allocations must
2094 * sleep on a waitqueue until the userspace task resolves the 2224 * sleep on a waitqueue until the userspace task resolves the
2095 * situation. Sleeping directly in the charge context with all kinds 2225 * situation. Sleeping directly in the charge context with all kinds
2096 * of locks held is not a good idea, instead we remember an OOM state 2226 * of locks held is not a good idea, instead we remember an OOM state
2097 * in the task and mem_cgroup_oom_synchronize() has to be called at 2227 * in the task and mem_cgroup_oom_synchronize() has to be called at
2098 * the end of the page fault to put the task to sleep and clean up the 2228 * the end of the page fault to complete the OOM handling.
2099 * OOM state.
2100 * 2229 *
2101 * Returns %true if an ongoing memcg OOM situation was detected and 2230 * Returns %true if an ongoing memcg OOM situation was detected and
2102 * finalized, %false otherwise. 2231 * completed, %false otherwise.
2103 */ 2232 */
2104bool mem_cgroup_oom_synchronize(void) 2233bool mem_cgroup_oom_synchronize(bool handle)
2105{ 2234{
2235 struct mem_cgroup *memcg = current->memcg_oom.memcg;
2106 struct oom_wait_info owait; 2236 struct oom_wait_info owait;
2107 struct mem_cgroup *memcg; 2237 bool locked;
2108 2238
2109 /* OOM is global, do not handle */ 2239 /* OOM is global, do not handle */
2110 if (!current->memcg_oom.in_memcg_oom)
2111 return false;
2112
2113 /*
2114 * We invoked the OOM killer but there is a chance that a kill
2115 * did not free up any charges. Everybody else might already
2116 * be sleeping, so restart the fault and keep the rampage
2117 * going until some charges are released.
2118 */
2119 memcg = current->memcg_oom.wait_on_memcg;
2120 if (!memcg) 2240 if (!memcg)
2121 goto out; 2241 return false;
2122 2242
2123 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 2243 if (!handle)
2124 goto out_memcg; 2244 goto cleanup;
2125 2245
2126 owait.memcg = memcg; 2246 owait.memcg = memcg;
2127 owait.wait.flags = 0; 2247 owait.wait.flags = 0;
@@ -2130,13 +2250,25 @@ bool mem_cgroup_oom_synchronize(void)
2130 INIT_LIST_HEAD(&owait.wait.task_list); 2250 INIT_LIST_HEAD(&owait.wait.task_list);
2131 2251
2132 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2252 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2133 /* Only sleep if we didn't miss any wakeups since OOM */ 2253 mem_cgroup_mark_under_oom(memcg);
2134 if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) 2254
2255 locked = mem_cgroup_oom_trylock(memcg);
2256
2257 if (locked)
2258 mem_cgroup_oom_notify(memcg);
2259
2260 if (locked && !memcg->oom_kill_disable) {
2261 mem_cgroup_unmark_under_oom(memcg);
2262 finish_wait(&memcg_oom_waitq, &owait.wait);
2263 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
2264 current->memcg_oom.order);
2265 } else {
2135 schedule(); 2266 schedule();
2136 finish_wait(&memcg_oom_waitq, &owait.wait); 2267 mem_cgroup_unmark_under_oom(memcg);
2137out_memcg: 2268 finish_wait(&memcg_oom_waitq, &owait.wait);
2138 mem_cgroup_unmark_under_oom(memcg); 2269 }
2139 if (current->memcg_oom.oom_locked) { 2270
2271 if (locked) {
2140 mem_cgroup_oom_unlock(memcg); 2272 mem_cgroup_oom_unlock(memcg);
2141 /* 2273 /*
2142 * There is no guarantee that an OOM-lock contender 2274 * There is no guarantee that an OOM-lock contender
@@ -2145,10 +2277,9 @@ out_memcg:
2145 */ 2277 */
2146 memcg_oom_recover(memcg); 2278 memcg_oom_recover(memcg);
2147 } 2279 }
2280cleanup:
2281 current->memcg_oom.memcg = NULL;
2148 css_put(&memcg->css); 2282 css_put(&memcg->css);
2149 current->memcg_oom.wait_on_memcg = NULL;
2150out:
2151 current->memcg_oom.in_memcg_oom = 0;
2152 return true; 2283 return true;
2153} 2284}
2154 2285
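To see how both halves of the reworked memcg OOM scheme fit together, here is a simplified sketch of the intended call pattern; it is pieced together from the __mem_cgroup_try_charge() and handle_mm_fault() hunks elsewhere in this diff rather than quoted verbatim:

	/* charge path: only record the OOM situation, never block here */
	if (unlikely(task_in_memcg_oom(current)))
		goto bypass;
	...
	mem_cgroup_oom(memcg, gfp_mask, order);	/* stashes memcg/gfp/order in current */

	/* end of a userspace page fault */
	if (flags & FAULT_FLAG_USER) {
		mem_cgroup_oom_disable();
		/* fault handled gracefully: just discard the recorded state */
		if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
			mem_cgroup_oom_synchronize(false);
	}
	/*
	 * On VM_FAULT_OOM, pagefault_out_of_memory() is expected to call
	 * mem_cgroup_oom_synchronize(true) to actually kill or wait.
	 */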
@@ -2562,6 +2693,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2562 || fatal_signal_pending(current))) 2693 || fatal_signal_pending(current)))
2563 goto bypass; 2694 goto bypass;
2564 2695
2696 if (unlikely(task_in_memcg_oom(current)))
2697 goto bypass;
2698
2565 /* 2699 /*
2566 * We always charge the cgroup the mm_struct belongs to. 2700 * We always charge the cgroup the mm_struct belongs to.
2567 * The mm_struct's mem_cgroup changes on task migration if the 2701 * The mm_struct's mem_cgroup changes on task migration if the
@@ -2659,8 +2793,10 @@ done:
2659 *ptr = memcg; 2793 *ptr = memcg;
2660 return 0; 2794 return 0;
2661nomem: 2795nomem:
2662 *ptr = NULL; 2796 if (!(gfp_mask & __GFP_NOFAIL)) {
2663 return -ENOMEM; 2797 *ptr = NULL;
2798 return -ENOMEM;
2799 }
2664bypass: 2800bypass:
2665 *ptr = root_mem_cgroup; 2801 *ptr = root_mem_cgroup;
2666 return -EINTR; 2802 return -EINTR;
@@ -2709,15 +2845,10 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2709 */ 2845 */
2710static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2846static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2711{ 2847{
2712 struct cgroup_subsys_state *css;
2713
2714 /* ID 0 is unused ID */ 2848 /* ID 0 is unused ID */
2715 if (!id) 2849 if (!id)
2716 return NULL; 2850 return NULL;
2717 css = css_lookup(&mem_cgroup_subsys, id); 2851 return mem_cgroup_from_id(id);
2718 if (!css)
2719 return NULL;
2720 return mem_cgroup_from_css(css);
2721} 2852}
2722 2853
2723struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2854struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2812,7 +2943,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2812 unlock_page_cgroup(pc); 2943 unlock_page_cgroup(pc);
2813 2944
2814 /* 2945 /*
2815 * "charge_statistics" updated event counter. 2946 * "charge_statistics" updated event counter. Then, check it.
2947 * Insert the ancestor (and the ancestor's ancestors) into the softlimit RB-tree
2948 * if they exceed the softlimit.
2816 */ 2949 */
2817 memcg_check_events(memcg, page); 2950 memcg_check_events(memcg, page);
2818} 2951}
@@ -2836,7 +2969,7 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2836 2969
2837 VM_BUG_ON(p->is_root_cache); 2970 VM_BUG_ON(p->is_root_cache);
2838 cachep = p->root_cache; 2971 cachep = p->root_cache;
2839 return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; 2972 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
2840} 2973}
2841 2974
2842#ifdef CONFIG_SLABINFO 2975#ifdef CONFIG_SLABINFO
@@ -2865,21 +2998,14 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2865 struct res_counter *fail_res; 2998 struct res_counter *fail_res;
2866 struct mem_cgroup *_memcg; 2999 struct mem_cgroup *_memcg;
2867 int ret = 0; 3000 int ret = 0;
2868 bool may_oom;
2869 3001
2870 ret = res_counter_charge(&memcg->kmem, size, &fail_res); 3002 ret = res_counter_charge(&memcg->kmem, size, &fail_res);
2871 if (ret) 3003 if (ret)
2872 return ret; 3004 return ret;
2873 3005
2874 /*
2875 * Conditions under which we can wait for the oom_killer. Those are
2876 * the same conditions tested by the core page allocator
2877 */
2878 may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
2879
2880 _memcg = memcg; 3006 _memcg = memcg;
2881 ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, 3007 ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
2882 &_memcg, may_oom); 3008 &_memcg, oom_gfp_allowed(gfp));
2883 3009
2884 if (ret == -EINTR) { 3010 if (ret == -EINTR) {
2885 /* 3011 /*
@@ -3019,7 +3145,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3019{ 3145{
3020 struct memcg_cache_params *cur_params = s->memcg_params; 3146 struct memcg_cache_params *cur_params = s->memcg_params;
3021 3147
3022 VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); 3148 VM_BUG_ON(!is_root_cache(s));
3023 3149
3024 if (num_groups > memcg_limited_groups_array_size) { 3150 if (num_groups > memcg_limited_groups_array_size) {
3025 int i; 3151 int i;
@@ -3280,7 +3406,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3280 idx = memcg_cache_id(memcg); 3406 idx = memcg_cache_id(memcg);
3281 3407
3282 mutex_lock(&memcg_cache_mutex); 3408 mutex_lock(&memcg_cache_mutex);
3283 new_cachep = cachep->memcg_params->memcg_caches[idx]; 3409 new_cachep = cache_from_memcg_idx(cachep, idx);
3284 if (new_cachep) { 3410 if (new_cachep) {
3285 css_put(&memcg->css); 3411 css_put(&memcg->css);
3286 goto out; 3412 goto out;
@@ -3326,8 +3452,8 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3326 * we'll take the set_limit_mutex to protect ourselves against this. 3452 * we'll take the set_limit_mutex to protect ourselves against this.
3327 */ 3453 */
3328 mutex_lock(&set_limit_mutex); 3454 mutex_lock(&set_limit_mutex);
3329 for (i = 0; i < memcg_limited_groups_array_size; i++) { 3455 for_each_memcg_cache_index(i) {
3330 c = s->memcg_params->memcg_caches[i]; 3456 c = cache_from_memcg_idx(s, i);
3331 if (!c) 3457 if (!c)
3332 continue; 3458 continue;
3333 3459
@@ -3460,8 +3586,8 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3460 * code updating memcg_caches will issue a write barrier to match this. 3586 * code updating memcg_caches will issue a write barrier to match this.
3461 */ 3587 */
3462 read_barrier_depends(); 3588 read_barrier_depends();
3463 if (likely(cachep->memcg_params->memcg_caches[idx])) { 3589 if (likely(cache_from_memcg_idx(cachep, idx))) {
3464 cachep = cachep->memcg_params->memcg_caches[idx]; 3590 cachep = cache_from_memcg_idx(cachep, idx);
3465 goto out; 3591 goto out;
3466 } 3592 }
3467 3593
@@ -3663,8 +3789,7 @@ void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
3663{ 3789{
3664 /* Update stat data for mem_cgroup */ 3790 /* Update stat data for mem_cgroup */
3665 preempt_disable(); 3791 preempt_disable();
3666 WARN_ON_ONCE(from->stat->count[idx] < nr_pages); 3792 __this_cpu_sub(from->stat->count[idx], nr_pages);
3667 __this_cpu_add(from->stat->count[idx], -nr_pages);
3668 __this_cpu_add(to->stat->count[idx], nr_pages); 3793 __this_cpu_add(to->stat->count[idx], nr_pages);
3669 preempt_enable(); 3794 preempt_enable();
3670} 3795}
@@ -4232,7 +4357,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
4232 * css_get() was called in uncharge(). 4357 * css_get() was called in uncharge().
4233 */ 4358 */
4234 if (do_swap_account && swapout && memcg) 4359 if (do_swap_account && swapout && memcg)
4235 swap_cgroup_record(ent, css_id(&memcg->css)); 4360 swap_cgroup_record(ent, mem_cgroup_id(memcg));
4236} 4361}
4237#endif 4362#endif
4238 4363
@@ -4284,8 +4409,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
4284{ 4409{
4285 unsigned short old_id, new_id; 4410 unsigned short old_id, new_id;
4286 4411
4287 old_id = css_id(&from->css); 4412 old_id = mem_cgroup_id(from);
4288 new_id = css_id(&to->css); 4413 new_id = mem_cgroup_id(to);
4289 4414
4290 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 4415 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
4291 mem_cgroup_swap_statistics(from, false); 4416 mem_cgroup_swap_statistics(from, false);
@@ -4647,6 +4772,98 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
4647 return ret; 4772 return ret;
4648} 4773}
4649 4774
4775unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4776 gfp_t gfp_mask,
4777 unsigned long *total_scanned)
4778{
4779 unsigned long nr_reclaimed = 0;
4780 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
4781 unsigned long reclaimed;
4782 int loop = 0;
4783 struct mem_cgroup_tree_per_zone *mctz;
4784 unsigned long long excess;
4785 unsigned long nr_scanned;
4786
4787 if (order > 0)
4788 return 0;
4789
4790 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
4791 /*
4792 * This loop can run for a while, especially if mem_cgroups continuously
4793 * keep exceeding their soft limit and putting the system under
4794 * pressure.
4795 */
4796 do {
4797 if (next_mz)
4798 mz = next_mz;
4799 else
4800 mz = mem_cgroup_largest_soft_limit_node(mctz);
4801 if (!mz)
4802 break;
4803
4804 nr_scanned = 0;
4805 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
4806 gfp_mask, &nr_scanned);
4807 nr_reclaimed += reclaimed;
4808 *total_scanned += nr_scanned;
4809 spin_lock(&mctz->lock);
4810
4811 /*
4812 * If we failed to reclaim anything from this memory cgroup
4813 * it is time to move on to the next cgroup
4814 */
4815 next_mz = NULL;
4816 if (!reclaimed) {
4817 do {
4818 /*
4819 * Loop until we find yet another one.
4820 *
4821 * By the time we get the soft_limit lock
4822 * again, someone might have added the
4823 * group back on the RB tree. Iterate to
4824 * make sure we get a different mem.
4825 * mem_cgroup_largest_soft_limit_node returns
4826 * NULL if no other cgroup is present on
4827 * the tree
4828 */
4829 next_mz =
4830 __mem_cgroup_largest_soft_limit_node(mctz);
4831 if (next_mz == mz)
4832 css_put(&next_mz->memcg->css);
4833 else /* next_mz == NULL or other memcg */
4834 break;
4835 } while (1);
4836 }
4837 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
4838 excess = res_counter_soft_limit_excess(&mz->memcg->res);
4839 /*
4840 * One school of thought says that we should not add
4841 * back the node to the tree if reclaim returns 0.
4842 * But our reclaim could return 0 simply because, due
4843 * to priority, we are exposing a smaller subset of
4844 * memory to reclaim from. Consider this as a longer
4845 * term TODO.
4846 */
4847 /* If excess == 0, no tree ops */
4848 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
4849 spin_unlock(&mctz->lock);
4850 css_put(&mz->memcg->css);
4851 loop++;
4852 /*
4853 * Could not reclaim anything and there are no more
4854 * mem cgroups to try or we seem to be looping without
4855 * reclaiming anything.
4856 */
4857 if (!nr_reclaimed &&
4858 (next_mz == NULL ||
4859 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
4860 break;
4861 } while (!nr_reclaimed);
4862 if (next_mz)
4863 css_put(&next_mz->memcg->css);
4864 return nr_reclaimed;
4865}
4866
4650/** 4867/**
4651 * mem_cgroup_force_empty_list - clears LRU of a group 4868 * mem_cgroup_force_empty_list - clears LRU of a group
4652 * @memcg: group to clear 4869 * @memcg: group to clear
@@ -4748,31 +4965,18 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4748 } while (usage > 0); 4965 } while (usage > 0);
4749} 4966}
4750 4967
4751/*
4752 * This mainly exists for tests during the setting of set of use_hierarchy.
4753 * Since this is the very setting we are changing, the current hierarchy value
4754 * is meaningless
4755 */
4756static inline bool __memcg_has_children(struct mem_cgroup *memcg)
4757{
4758 struct cgroup_subsys_state *pos;
4759
4760 /* bounce at first found */
4761 css_for_each_child(pos, &memcg->css)
4762 return true;
4763 return false;
4764}
4765
4766/*
4767 * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
4768 * to be already dead (as in mem_cgroup_force_empty, for instance). This is
4769 * from mem_cgroup_count_children(), in the sense that we don't really care how
4770 * many children we have; we only need to know if we have any. It also counts
4771 * any memcg without hierarchy as infertile.
4772 */
4773static inline bool memcg_has_children(struct mem_cgroup *memcg) 4968static inline bool memcg_has_children(struct mem_cgroup *memcg)
4774{ 4969{
4775 return memcg->use_hierarchy && __memcg_has_children(memcg); 4970 lockdep_assert_held(&memcg_create_mutex);
4971 /*
4972 * The lock does not prevent addition to or deletion from the list
4973 * of children, but it prevents a new child from being
4974 * initialized based on this parent in css_online(), so it's
4975 * enough to decide whether hierarchically inherited
4976 * attributes can still be changed or not.
4977 */
4978 return memcg->use_hierarchy &&
4979 !list_empty(&memcg->css.cgroup->children);
4776} 4980}
4777 4981
4778/* 4982/*
@@ -4852,7 +5056,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
4852 */ 5056 */
4853 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 5057 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
4854 (val == 1 || val == 0)) { 5058 (val == 1 || val == 0)) {
4855 if (!__memcg_has_children(memcg)) 5059 if (list_empty(&memcg->css.cgroup->children))
4856 memcg->use_hierarchy = val; 5060 memcg->use_hierarchy = val;
4857 else 5061 else
4858 retval = -EBUSY; 5062 retval = -EBUSY;
@@ -5179,45 +5383,50 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5179static int memcg_numa_stat_show(struct cgroup_subsys_state *css, 5383static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
5180 struct cftype *cft, struct seq_file *m) 5384 struct cftype *cft, struct seq_file *m)
5181{ 5385{
5386 struct numa_stat {
5387 const char *name;
5388 unsigned int lru_mask;
5389 };
5390
5391 static const struct numa_stat stats[] = {
5392 { "total", LRU_ALL },
5393 { "file", LRU_ALL_FILE },
5394 { "anon", LRU_ALL_ANON },
5395 { "unevictable", BIT(LRU_UNEVICTABLE) },
5396 };
5397 const struct numa_stat *stat;
5182 int nid; 5398 int nid;
5183 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 5399 unsigned long nr;
5184 unsigned long node_nr;
5185 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5400 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5186 5401
5187 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 5402 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5188 seq_printf(m, "total=%lu", total_nr); 5403 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
5189 for_each_node_state(nid, N_MEMORY) { 5404 seq_printf(m, "%s=%lu", stat->name, nr);
5190 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); 5405 for_each_node_state(nid, N_MEMORY) {
5191 seq_printf(m, " N%d=%lu", nid, node_nr); 5406 nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5192 } 5407 stat->lru_mask);
5193 seq_putc(m, '\n'); 5408 seq_printf(m, " N%d=%lu", nid, nr);
5194 5409 }
5195 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); 5410 seq_putc(m, '\n');
5196 seq_printf(m, "file=%lu", file_nr);
5197 for_each_node_state(nid, N_MEMORY) {
5198 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5199 LRU_ALL_FILE);
5200 seq_printf(m, " N%d=%lu", nid, node_nr);
5201 } 5411 }
5202 seq_putc(m, '\n');
5203 5412
5204 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); 5413 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5205 seq_printf(m, "anon=%lu", anon_nr); 5414 struct mem_cgroup *iter;
5206 for_each_node_state(nid, N_MEMORY) { 5415
5207 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5416 nr = 0;
5208 LRU_ALL_ANON); 5417 for_each_mem_cgroup_tree(iter, memcg)
5209 seq_printf(m, " N%d=%lu", nid, node_nr); 5418 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
5419 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
5420 for_each_node_state(nid, N_MEMORY) {
5421 nr = 0;
5422 for_each_mem_cgroup_tree(iter, memcg)
5423 nr += mem_cgroup_node_nr_lru_pages(
5424 iter, nid, stat->lru_mask);
5425 seq_printf(m, " N%d=%lu", nid, nr);
5426 }
5427 seq_putc(m, '\n');
5210 } 5428 }
5211 seq_putc(m, '\n');
5212 5429
5213 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
5214 seq_printf(m, "unevictable=%lu", unevictable_nr);
5215 for_each_node_state(nid, N_MEMORY) {
5216 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5217 BIT(LRU_UNEVICTABLE));
5218 seq_printf(m, " N%d=%lu", nid, node_nr);
5219 }
5220 seq_putc(m, '\n');
5221 return 0; 5430 return 0;
5222} 5431}
5223#endif /* CONFIG_NUMA */ 5432#endif /* CONFIG_NUMA */
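With the table-driven rewrite of memcg_numa_stat_show() above, memory.numa_stat additionally reports hierarchical_* totals. On a hypothetical two-node machine the file would read roughly as follows; the numbers are made up, only the layout is implied by the seq_printf() calls:

	total=65536 N0=32768 N1=32768
	file=4096 N0=2048 N1=2048
	anon=61440 N0=30720 N1=30720
	unevictable=0 N0=0 N1=0
	hierarchical_total=98304 N0=49152 N1=49152
	hierarchical_file=8192 N0=4096 N1=4096
	hierarchical_anon=90112 N0=45056 N1=45056
	hierarchical_unevictable=0 N0=0 N1=0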
@@ -5911,6 +6120,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5911 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 6120 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5912 mz = &pn->zoneinfo[zone]; 6121 mz = &pn->zoneinfo[zone];
5913 lruvec_init(&mz->lruvec); 6122 lruvec_init(&mz->lruvec);
6123 mz->usage_in_excess = 0;
6124 mz->on_tree = false;
5914 mz->memcg = memcg; 6125 mz->memcg = memcg;
5915 } 6126 }
5916 memcg->nodeinfo[node] = pn; 6127 memcg->nodeinfo[node] = pn;
@@ -5966,7 +6177,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
5966 int node; 6177 int node;
5967 size_t size = memcg_size(); 6178 size_t size = memcg_size();
5968 6179
5969 free_css_id(&mem_cgroup_subsys, &memcg->css); 6180 mem_cgroup_remove_from_trees(memcg);
5970 6181
5971 for_each_node(node) 6182 for_each_node(node)
5972 free_mem_cgroup_per_zone_info(memcg, node); 6183 free_mem_cgroup_per_zone_info(memcg, node);
@@ -6002,6 +6213,29 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
6002} 6213}
6003EXPORT_SYMBOL(parent_mem_cgroup); 6214EXPORT_SYMBOL(parent_mem_cgroup);
6004 6215
6216static void __init mem_cgroup_soft_limit_tree_init(void)
6217{
6218 struct mem_cgroup_tree_per_node *rtpn;
6219 struct mem_cgroup_tree_per_zone *rtpz;
6220 int tmp, node, zone;
6221
6222 for_each_node(node) {
6223 tmp = node;
6224 if (!node_state(node, N_NORMAL_MEMORY))
6225 tmp = -1;
6226 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
6227 BUG_ON(!rtpn);
6228
6229 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6230
6231 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6232 rtpz = &rtpn->rb_tree_per_zone[zone];
6233 rtpz->rb_root = RB_ROOT;
6234 spin_lock_init(&rtpz->lock);
6235 }
6236 }
6237}
6238
6005static struct cgroup_subsys_state * __ref 6239static struct cgroup_subsys_state * __ref
6006mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 6240mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6007{ 6241{
@@ -6031,7 +6265,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6031 mutex_init(&memcg->thresholds_lock); 6265 mutex_init(&memcg->thresholds_lock);
6032 spin_lock_init(&memcg->move_lock); 6266 spin_lock_init(&memcg->move_lock);
6033 vmpressure_init(&memcg->vmpressure); 6267 vmpressure_init(&memcg->vmpressure);
6034 spin_lock_init(&memcg->soft_lock);
6035 6268
6036 return &memcg->css; 6269 return &memcg->css;
6037 6270
@@ -6047,6 +6280,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
6047 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); 6280 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
6048 int error = 0; 6281 int error = 0;
6049 6282
6283 if (css->cgroup->id > MEM_CGROUP_ID_MAX)
6284 return -ENOSPC;
6285
6050 if (!parent) 6286 if (!parent)
6051 return 0; 6287 return 0;
6052 6288
@@ -6109,13 +6345,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6109 6345
6110 mem_cgroup_invalidate_reclaim_iterators(memcg); 6346 mem_cgroup_invalidate_reclaim_iterators(memcg);
6111 mem_cgroup_reparent_charges(memcg); 6347 mem_cgroup_reparent_charges(memcg);
6112 if (memcg->soft_contributed) {
6113 while ((memcg = parent_mem_cgroup(memcg)))
6114 atomic_dec(&memcg->children_in_excess);
6115
6116 if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
6117 atomic_dec(&root_mem_cgroup->children_in_excess);
6118 }
6119 mem_cgroup_destroy_all_caches(memcg); 6348 mem_cgroup_destroy_all_caches(memcg);
6120 vmpressure_cleanup(&memcg->vmpressure); 6349 vmpressure_cleanup(&memcg->vmpressure);
6121} 6350}
@@ -6325,7 +6554,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
6325 } 6554 }
6326 /* There is a swap entry and a page doesn't exist or isn't charged */ 6555 /* There is a swap entry and a page doesn't exist or isn't charged */
6327 if (ent.val && !ret && 6556 if (ent.val && !ret &&
6328 css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { 6557 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
6329 ret = MC_TARGET_SWAP; 6558 ret = MC_TARGET_SWAP;
6330 if (target) 6559 if (target)
6331 target->ent = ent; 6560 target->ent = ent;
@@ -6376,10 +6605,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
6376 pte_t *pte; 6605 pte_t *pte;
6377 spinlock_t *ptl; 6606 spinlock_t *ptl;
6378 6607
6379 if (pmd_trans_huge_lock(pmd, vma) == 1) { 6608 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
6380 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 6609 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
6381 mc.precharge += HPAGE_PMD_NR; 6610 mc.precharge += HPAGE_PMD_NR;
6382 spin_unlock(&vma->vm_mm->page_table_lock); 6611 spin_unlock(ptl);
6383 return 0; 6612 return 0;
6384 } 6613 }
6385 6614
@@ -6568,9 +6797,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6568 * to be unlocked in __split_huge_page_splitting(), where the main 6797 * to be unlocked in __split_huge_page_splitting(), where the main
6569 * part of thp split is not executed yet. 6798 * part of thp split is not executed yet.
6570 */ 6799 */
6571 if (pmd_trans_huge_lock(pmd, vma) == 1) { 6800 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
6572 if (mc.precharge < HPAGE_PMD_NR) { 6801 if (mc.precharge < HPAGE_PMD_NR) {
6573 spin_unlock(&vma->vm_mm->page_table_lock); 6802 spin_unlock(ptl);
6574 return 0; 6803 return 0;
6575 } 6804 }
6576 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 6805 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
@@ -6587,7 +6816,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6587 } 6816 }
6588 put_page(page); 6817 put_page(page);
6589 } 6818 }
6590 spin_unlock(&vma->vm_mm->page_table_lock); 6819 spin_unlock(ptl);
6591 return 0; 6820 return 0;
6592 } 6821 }
6593 6822
@@ -6745,7 +6974,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
6745 .bind = mem_cgroup_bind, 6974 .bind = mem_cgroup_bind,
6746 .base_cftypes = mem_cgroup_files, 6975 .base_cftypes = mem_cgroup_files,
6747 .early_init = 0, 6976 .early_init = 0,
6748 .use_id = 1,
6749}; 6977};
6750 6978
6751#ifdef CONFIG_MEMCG_SWAP 6979#ifdef CONFIG_MEMCG_SWAP
@@ -6790,6 +7018,7 @@ static int __init mem_cgroup_init(void)
6790{ 7018{
6791 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 7019 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6792 enable_swap_cgroup(); 7020 enable_swap_cgroup();
7021 mem_cgroup_soft_limit_tree_init();
6793 memcg_stock_init(); 7022 memcg_stock_init();
6794 return 0; 7023 return 0;
6795} 7024}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 947ed5413279..b7c171602ba1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1114,8 +1114,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1114 * shake_page could have turned it free. 1114 * shake_page could have turned it free.
1115 */ 1115 */
1116 if (is_free_buddy_page(p)) { 1116 if (is_free_buddy_page(p)) {
1117 action_result(pfn, "free buddy, 2nd try", 1117 if (flags & MF_COUNT_INCREASED)
1118 DELAYED); 1118 action_result(pfn, "free buddy", DELAYED);
1119 else
1120 action_result(pfn, "free buddy, 2nd try", DELAYED);
1119 return 0; 1121 return 0;
1120 } 1122 }
1121 action_result(pfn, "non LRU", IGNORED); 1123 action_result(pfn, "non LRU", IGNORED);
@@ -1267,7 +1269,7 @@ void memory_failure_queue(unsigned long pfn, int trapno, int flags)
1267 1269
1268 mf_cpu = &get_cpu_var(memory_failure_cpu); 1270 mf_cpu = &get_cpu_var(memory_failure_cpu);
1269 spin_lock_irqsave(&mf_cpu->lock, proc_flags); 1271 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1270 if (kfifo_put(&mf_cpu->fifo, &entry)) 1272 if (kfifo_put(&mf_cpu->fifo, entry))
1271 schedule_work_on(smp_processor_id(), &mf_cpu->work); 1273 schedule_work_on(smp_processor_id(), &mf_cpu->work);
1272 else 1274 else
1273 pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", 1275 pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
@@ -1349,7 +1351,7 @@ int unpoison_memory(unsigned long pfn)
1349 * worked by memory_failure() and the page lock is not held yet. 1351 * worked by memory_failure() and the page lock is not held yet.
1350 * In such case, we yield to memory_failure() and make unpoison fail. 1352 * In such case, we yield to memory_failure() and make unpoison fail.
1351 */ 1353 */
1352 if (PageTransHuge(page)) { 1354 if (!PageHuge(page) && PageTransHuge(page)) {
1353 pr_info("MCE: Memory failure is now running on %#lx\n", pfn); 1355 pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
1354 return 0; 1356 return 0;
1355 } 1357 }
@@ -1421,19 +1423,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1421 return 1; 1423 return 1;
1422 1424
1423 /* 1425 /*
1424 * The lock_memory_hotplug prevents a race with memory hotplug.
1425 * This is a big hammer, a better would be nicer.
1426 */
1427 lock_memory_hotplug();
1428
1429 /*
1430 * Isolate the page, so that it doesn't get reallocated if it
1431 * was free. This flag should be kept set until the source page
1432 * is freed and PG_hwpoison on it is set.
1433 */
1434 if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE)
1435 set_migratetype_isolate(p, true);
1436 /*
1437 * When the target page is a free hugepage, just remove it 1426 * When the target page is a free hugepage, just remove it
1438 * from free hugepage list. 1427 * from free hugepage list.
1439 */ 1428 */
@@ -1453,7 +1442,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1453 /* Not a free page */ 1442 /* Not a free page */
1454 ret = 1; 1443 ret = 1;
1455 } 1444 }
1456 unlock_memory_hotplug();
1457 return ret; 1445 return ret;
1458} 1446}
1459 1447
@@ -1652,15 +1640,28 @@ int soft_offline_page(struct page *page, int flags)
1652 } 1640 }
1653 } 1641 }
1654 1642
1643 /*
1644 * The lock_memory_hotplug prevents a race with memory hotplug.
1645 * This is a big hammer; a better approach would be nicer.
1646 */
1647 lock_memory_hotplug();
1648
1649 /*
1650 * Isolate the page, so that it doesn't get reallocated if it
1651 * was free. This flag should be kept set until the source page
1652 * is freed and PG_hwpoison on it is set.
1653 */
1654 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
1655 set_migratetype_isolate(page, true);
1656
1655 ret = get_any_page(page, pfn, flags); 1657 ret = get_any_page(page, pfn, flags);
1656 if (ret < 0) 1658 unlock_memory_hotplug();
1657 goto unset; 1659 if (ret > 0) { /* for in-use pages */
1658 if (ret) { /* for in-use pages */
1659 if (PageHuge(page)) 1660 if (PageHuge(page))
1660 ret = soft_offline_huge_page(page, flags); 1661 ret = soft_offline_huge_page(page, flags);
1661 else 1662 else
1662 ret = __soft_offline_page(page, flags); 1663 ret = __soft_offline_page(page, flags);
1663 } else { /* for free pages */ 1664 } else if (ret == 0) { /* for free pages */
1664 if (PageHuge(page)) { 1665 if (PageHuge(page)) {
1665 set_page_hwpoison_huge_page(hpage); 1666 set_page_hwpoison_huge_page(hpage);
1666 dequeue_hwpoisoned_huge_page(hpage); 1667 dequeue_hwpoisoned_huge_page(hpage);
@@ -1671,7 +1672,6 @@ int soft_offline_page(struct page *page, int flags)
1671 atomic_long_inc(&num_poisoned_pages); 1672 atomic_long_inc(&num_poisoned_pages);
1672 } 1673 }
1673 } 1674 }
1674unset:
1675 unset_migratetype_isolate(page, MIGRATE_MOVABLE); 1675 unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1676 return ret; 1676 return ret;
1677} 1677}
diff --git a/mm/memory.c b/mm/memory.c
index ca0003947115..0409e8f43fa0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,8 +69,8 @@
69 69
70#include "internal.h" 70#include "internal.h"
71 71
72#ifdef LAST_NID_NOT_IN_PAGE_FLAGS 72#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. 73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
74#endif 74#endif
75 75
76#ifndef CONFIG_NEED_MULTIPLE_NODES 76#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -382,7 +382,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
382 pgtable_t token = pmd_pgtable(*pmd); 382 pgtable_t token = pmd_pgtable(*pmd);
383 pmd_clear(pmd); 383 pmd_clear(pmd);
384 pte_free_tlb(tlb, token, addr); 384 pte_free_tlb(tlb, token, addr);
385 tlb->mm->nr_ptes--; 385 atomic_long_dec(&tlb->mm->nr_ptes);
386} 386}
387 387
388static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 388static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -453,8 +453,6 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
453 453
454/* 454/*
455 * This function frees user-level page tables of a process. 455 * This function frees user-level page tables of a process.
456 *
457 * Must be called with pagetable lock held.
458 */ 456 */
459void free_pgd_range(struct mmu_gather *tlb, 457void free_pgd_range(struct mmu_gather *tlb,
460 unsigned long addr, unsigned long end, 458 unsigned long addr, unsigned long end,
@@ -552,6 +550,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
552int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 550int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
553 pmd_t *pmd, unsigned long address) 551 pmd_t *pmd, unsigned long address)
554{ 552{
553 spinlock_t *ptl;
555 pgtable_t new = pte_alloc_one(mm, address); 554 pgtable_t new = pte_alloc_one(mm, address);
556 int wait_split_huge_page; 555 int wait_split_huge_page;
557 if (!new) 556 if (!new)
@@ -572,15 +571,15 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
572 */ 571 */
573 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 572 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
574 573
575 spin_lock(&mm->page_table_lock); 574 ptl = pmd_lock(mm, pmd);
576 wait_split_huge_page = 0; 575 wait_split_huge_page = 0;
577 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 576 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
578 mm->nr_ptes++; 577 atomic_long_inc(&mm->nr_ptes);
579 pmd_populate(mm, pmd, new); 578 pmd_populate(mm, pmd, new);
580 new = NULL; 579 new = NULL;
581 } else if (unlikely(pmd_trans_splitting(*pmd))) 580 } else if (unlikely(pmd_trans_splitting(*pmd)))
582 wait_split_huge_page = 1; 581 wait_split_huge_page = 1;
583 spin_unlock(&mm->page_table_lock); 582 spin_unlock(ptl);
584 if (new) 583 if (new)
585 pte_free(mm, new); 584 pte_free(mm, new);
586 if (wait_split_huge_page) 585 if (wait_split_huge_page)
@@ -681,7 +680,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
681 if (vma->vm_ops) 680 if (vma->vm_ops)
682 printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", 681 printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
683 vma->vm_ops->fault); 682 vma->vm_ops->fault);
684 if (vma->vm_file && vma->vm_file->f_op) 683 if (vma->vm_file)
685 printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", 684 printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
686 vma->vm_file->f_op->mmap); 685 vma->vm_file->f_op->mmap);
687 dump_stack(); 686 dump_stack();
@@ -837,6 +836,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
837 */ 836 */
838 make_migration_entry_read(&entry); 837 make_migration_entry_read(&entry);
839 pte = swp_entry_to_pte(entry); 838 pte = swp_entry_to_pte(entry);
839 if (pte_swp_soft_dirty(*src_pte))
840 pte = pte_swp_mksoft_dirty(pte);
840 set_pte_at(src_mm, addr, src_pte, pte); 841 set_pte_at(src_mm, addr, src_pte, pte);
841 } 842 }
842 } 843 }
@@ -1516,20 +1517,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
1516 split_huge_page_pmd(vma, address, pmd); 1517 split_huge_page_pmd(vma, address, pmd);
1517 goto split_fallthrough; 1518 goto split_fallthrough;
1518 } 1519 }
1519 spin_lock(&mm->page_table_lock); 1520 ptl = pmd_lock(mm, pmd);
1520 if (likely(pmd_trans_huge(*pmd))) { 1521 if (likely(pmd_trans_huge(*pmd))) {
1521 if (unlikely(pmd_trans_splitting(*pmd))) { 1522 if (unlikely(pmd_trans_splitting(*pmd))) {
1522 spin_unlock(&mm->page_table_lock); 1523 spin_unlock(ptl);
1523 wait_split_huge_page(vma->anon_vma, pmd); 1524 wait_split_huge_page(vma->anon_vma, pmd);
1524 } else { 1525 } else {
1525 page = follow_trans_huge_pmd(vma, address, 1526 page = follow_trans_huge_pmd(vma, address,
1526 pmd, flags); 1527 pmd, flags);
1527 spin_unlock(&mm->page_table_lock); 1528 spin_unlock(ptl);
1528 *page_mask = HPAGE_PMD_NR - 1; 1529 *page_mask = HPAGE_PMD_NR - 1;
1529 goto out; 1530 goto out;
1530 } 1531 }
1531 } else 1532 } else
1532 spin_unlock(&mm->page_table_lock); 1533 spin_unlock(ptl);
1533 /* fall through */ 1534 /* fall through */
1534 } 1535 }
1535split_fallthrough: 1536split_fallthrough:
@@ -2719,6 +2720,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2719 get_page(dirty_page); 2720 get_page(dirty_page);
2720 2721
2721reuse: 2722reuse:
2723 /*
2724 * Clear the page's cpupid information as the existing
2725 * information potentially belongs to a now completely
2726 * unrelated process.
2727 */
2728 if (old_page)
2729 page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
2730
2722 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2731 flush_cache_page(vma, address, pte_pfn(orig_pte));
2723 entry = pte_mkyoung(orig_pte); 2732 entry = pte_mkyoung(orig_pte);
2724 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2733 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -3519,13 +3528,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3519} 3528}
3520 3529
3521int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 3530int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3522 unsigned long addr, int current_nid) 3531 unsigned long addr, int page_nid,
3532 int *flags)
3523{ 3533{
3524 get_page(page); 3534 get_page(page);
3525 3535
3526 count_vm_numa_event(NUMA_HINT_FAULTS); 3536 count_vm_numa_event(NUMA_HINT_FAULTS);
3527 if (current_nid == numa_node_id()) 3537 if (page_nid == numa_node_id()) {
3528 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); 3538 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
3539 *flags |= TNF_FAULT_LOCAL;
3540 }
3529 3541
3530 return mpol_misplaced(page, vma, addr); 3542 return mpol_misplaced(page, vma, addr);
3531} 3543}
@@ -3535,9 +3547,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3535{ 3547{
3536 struct page *page = NULL; 3548 struct page *page = NULL;
3537 spinlock_t *ptl; 3549 spinlock_t *ptl;
3538 int current_nid = -1; 3550 int page_nid = -1;
3551 int last_cpupid;
3539 int target_nid; 3552 int target_nid;
3540 bool migrated = false; 3553 bool migrated = false;
3554 int flags = 0;
3541 3555
3542 /* 3556 /*
3543 * The "pte" at this point cannot be used safely without 3557 * The "pte" at this point cannot be used safely without
@@ -3564,123 +3578,44 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3564 pte_unmap_unlock(ptep, ptl); 3578 pte_unmap_unlock(ptep, ptl);
3565 return 0; 3579 return 0;
3566 } 3580 }
3581 BUG_ON(is_zero_pfn(page_to_pfn(page)));
3567 3582
3568 current_nid = page_to_nid(page); 3583 /*
3569 target_nid = numa_migrate_prep(page, vma, addr, current_nid); 3584 * Avoid grouping on DSO/COW pages in particular and RO pages
3585 * in general; RO pages shouldn't hurt as much anyway since
3586 * they can be in shared cache state.
3587 */
3588 if (!pte_write(pte))
3589 flags |= TNF_NO_GROUP;
3590
3591 /*
3592 * Flag if the page is shared between multiple address spaces. This
3593 * is later used when determining whether to group tasks together
3594 */
3595 if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
3596 flags |= TNF_SHARED;
3597
3598 last_cpupid = page_cpupid_last(page);
3599 page_nid = page_to_nid(page);
3600 target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
3570 pte_unmap_unlock(ptep, ptl); 3601 pte_unmap_unlock(ptep, ptl);
3571 if (target_nid == -1) { 3602 if (target_nid == -1) {
3572 /*
3573 * Account for the fault against the current node if it not
3574 * being replaced regardless of where the page is located.
3575 */
3576 current_nid = numa_node_id();
3577 put_page(page); 3603 put_page(page);
3578 goto out; 3604 goto out;
3579 } 3605 }
3580 3606
3581 /* Migrate to the requested node */ 3607 /* Migrate to the requested node */
3582 migrated = migrate_misplaced_page(page, target_nid); 3608 migrated = migrate_misplaced_page(page, vma, target_nid);
3583 if (migrated) 3609 if (migrated) {
3584 current_nid = target_nid; 3610 page_nid = target_nid;
3585 3611 flags |= TNF_MIGRATED;
3586out:
3587 if (current_nid != -1)
3588 task_numa_fault(current_nid, 1, migrated);
3589 return 0;
3590}
3591
3592/* NUMA hinting page fault entry point for regular pmds */
3593#ifdef CONFIG_NUMA_BALANCING
3594static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3595 unsigned long addr, pmd_t *pmdp)
3596{
3597 pmd_t pmd;
3598 pte_t *pte, *orig_pte;
3599 unsigned long _addr = addr & PMD_MASK;
3600 unsigned long offset;
3601 spinlock_t *ptl;
3602 bool numa = false;
3603 int local_nid = numa_node_id();
3604
3605 spin_lock(&mm->page_table_lock);
3606 pmd = *pmdp;
3607 if (pmd_numa(pmd)) {
3608 set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
3609 numa = true;
3610 }
3611 spin_unlock(&mm->page_table_lock);
3612
3613 if (!numa)
3614 return 0;
3615
3616 /* we're in a page fault so some vma must be in the range */
3617 BUG_ON(!vma);
3618 BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
3619 offset = max(_addr, vma->vm_start) & ~PMD_MASK;
3620 VM_BUG_ON(offset >= PMD_SIZE);
3621 orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
3622 pte += offset >> PAGE_SHIFT;
3623 for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
3624 pte_t pteval = *pte;
3625 struct page *page;
3626 int curr_nid = local_nid;
3627 int target_nid;
3628 bool migrated;
3629 if (!pte_present(pteval))
3630 continue;
3631 if (!pte_numa(pteval))
3632 continue;
3633 if (addr >= vma->vm_end) {
3634 vma = find_vma(mm, addr);
3635 /* there's a pte present so there must be a vma */
3636 BUG_ON(!vma);
3637 BUG_ON(addr < vma->vm_start);
3638 }
3639 if (pte_numa(pteval)) {
3640 pteval = pte_mknonnuma(pteval);
3641 set_pte_at(mm, addr, pte, pteval);
3642 }
3643 page = vm_normal_page(vma, addr, pteval);
3644 if (unlikely(!page))
3645 continue;
3646 /* only check non-shared pages */
3647 if (unlikely(page_mapcount(page) != 1))
3648 continue;
3649
3650 /*
3651 * Note that the NUMA fault is later accounted to either
3652 * the node that is currently running or where the page is
3653 * migrated to.
3654 */
3655 curr_nid = local_nid;
3656 target_nid = numa_migrate_prep(page, vma, addr,
3657 page_to_nid(page));
3658 if (target_nid == -1) {
3659 put_page(page);
3660 continue;
3661 }
3662
3663 /* Migrate to the requested node */
3664 pte_unmap_unlock(pte, ptl);
3665 migrated = migrate_misplaced_page(page, target_nid);
3666 if (migrated)
3667 curr_nid = target_nid;
3668 task_numa_fault(curr_nid, 1, migrated);
3669
3670 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
3671 } 3612 }
3672 pte_unmap_unlock(orig_pte, ptl);
3673 3613
3614out:
3615 if (page_nid != -1)
3616 task_numa_fault(last_cpupid, page_nid, 1, flags);
3674 return 0; 3617 return 0;
3675} 3618}
3676#else
3677static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3678 unsigned long addr, pmd_t *pmdp)
3679{
3680 BUG();
3681 return 0;
3682}
3683#endif /* CONFIG_NUMA_BALANCING */
3684 3619
3685/* 3620/*
3686 * These routines also need to handle stuff like marking pages dirty 3621 * These routines also need to handle stuff like marking pages dirty
@@ -3820,8 +3755,8 @@ retry:
3820 } 3755 }
3821 } 3756 }
3822 3757
3823 if (pmd_numa(*pmd)) 3758 /* THP should already have been handled */
3824 return do_pmd_numa_page(mm, vma, address, pmd); 3759 BUG_ON(pmd_numa(*pmd));
3825 3760
3826 /* 3761 /*
3827 * Use __pte_alloc instead of pte_alloc_map, because we can't 3762 * Use __pte_alloc instead of pte_alloc_map, because we can't
@@ -3863,15 +3798,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3863 * space. Kernel faults are handled more gracefully. 3798 * space. Kernel faults are handled more gracefully.
3864 */ 3799 */
3865 if (flags & FAULT_FLAG_USER) 3800 if (flags & FAULT_FLAG_USER)
3866 mem_cgroup_enable_oom(); 3801 mem_cgroup_oom_enable();
3867 3802
3868 ret = __handle_mm_fault(mm, vma, address, flags); 3803 ret = __handle_mm_fault(mm, vma, address, flags);
3869 3804
3870 if (flags & FAULT_FLAG_USER) 3805 if (flags & FAULT_FLAG_USER) {
3871 mem_cgroup_disable_oom(); 3806 mem_cgroup_oom_disable();
3872 3807 /*
3873 if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) 3808 * The task may have entered a memcg OOM situation but
3874 mem_cgroup_oom_synchronize(); 3809 * if the allocation error was handled gracefully (no
3810 * VM_FAULT_OOM), there is no need to kill anything.
3811 * Just clean up the OOM state peacefully.
3812 */
3813 if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
3814 mem_cgroup_oom_synchronize(false);
3815 }
3875 3816
3876 return ret; 3817 return ret;
3877} 3818}
@@ -4329,3 +4270,28 @@ void copy_user_huge_page(struct page *dst, struct page *src,
4329 } 4270 }
4330} 4271}
4331#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 4272#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
4273
4274#if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS
4275static struct kmem_cache *page_ptl_cachep;
4276void __init ptlock_cache_init(void)
4277{
4278 page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
4279 SLAB_PANIC, NULL);
4280}
4281
4282bool ptlock_alloc(struct page *page)
4283{
4284 spinlock_t *ptl;
4285
4286 ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
4287 if (!ptl)
4288 return false;
4289 page->ptl = ptl;
4290 return true;
4291}
4292
4293void ptlock_free(struct page *page)
4294{
4295 kfree(page->ptl);
4296}
4297#endif
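
The ptlock helpers added at the end of mm/memory.c cover the configuration where the split page-table lock is too big to embed in struct page (USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS): the lock comes from its own slab and struct page only keeps a pointer to it. A minimal userspace sketch of that out-of-line-lock pattern, using pthreads and an invented struct fake_page in place of the kernel types:

/* Sketch only: per-object lock allocated out of line, as with page->ptl. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_page {                 /* stand-in for struct page */
	pthread_spinlock_t *ptl;   /* a pointer, not an embedded lock */
};

static bool ptlock_alloc(struct fake_page *page)
{
	pthread_spinlock_t *ptl = malloc(sizeof(*ptl));

	if (!ptl)
		return false;
	pthread_spin_init(ptl, PTHREAD_PROCESS_PRIVATE);
	page->ptl = ptl;
	return true;
}

static void ptlock_free(struct fake_page *page)
{
	pthread_spin_destroy(page->ptl);
	free(page->ptl);
	page->ptl = NULL;
}

int main(void)
{
	struct fake_page page;

	if (!ptlock_alloc(&page))
		return 1;
	pthread_spin_lock(page.ptl);    /* what a pte_lockptr() user would do */
	pthread_spin_unlock(page.ptl);
	ptlock_free(&page);
	printf("out-of-line lock allocated, used and freed\n");
	return 0;
}

The trade-off is one extra allocation and pointer chase per page-table page, in exchange for keeping struct page small when spinlocks are bloated by lock debugging.
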
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ed85fe3870e2..489f235502db 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -31,6 +31,7 @@
31#include <linux/firmware-map.h> 31#include <linux/firmware-map.h>
32#include <linux/stop_machine.h> 32#include <linux/stop_machine.h>
33#include <linux/hugetlb.h> 33#include <linux/hugetlb.h>
34#include <linux/memblock.h>
34 35
35#include <asm/tlbflush.h> 36#include <asm/tlbflush.h>
36 37
@@ -365,8 +366,7 @@ out_fail:
365static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 366static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
366 unsigned long end_pfn) 367 unsigned long end_pfn)
367{ 368{
368 unsigned long old_pgdat_end_pfn = 369 unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
369 pgdat->node_start_pfn + pgdat->node_spanned_pages;
370 370
371 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) 371 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
372 pgdat->node_start_pfn = start_pfn; 372 pgdat->node_start_pfn = start_pfn;
@@ -402,13 +402,12 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
402static int __meminit __add_section(int nid, struct zone *zone, 402static int __meminit __add_section(int nid, struct zone *zone,
403 unsigned long phys_start_pfn) 403 unsigned long phys_start_pfn)
404{ 404{
405 int nr_pages = PAGES_PER_SECTION;
406 int ret; 405 int ret;
407 406
408 if (pfn_valid(phys_start_pfn)) 407 if (pfn_valid(phys_start_pfn))
409 return -EEXIST; 408 return -EEXIST;
410 409
411 ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); 410 ret = sparse_add_one_section(zone, phys_start_pfn);
412 411
413 if (ret < 0) 412 if (ret < 0)
414 return ret; 413 return ret;
@@ -579,9 +578,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
579static void shrink_pgdat_span(struct pglist_data *pgdat, 578static void shrink_pgdat_span(struct pglist_data *pgdat,
580 unsigned long start_pfn, unsigned long end_pfn) 579 unsigned long start_pfn, unsigned long end_pfn)
581{ 580{
582 unsigned long pgdat_start_pfn = pgdat->node_start_pfn; 581 unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
 583 unsigned long pgdat_end_pfn = 582 unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
584 pgdat->node_start_pfn + pgdat->node_spanned_pages; 583 unsigned long pgdat_end_pfn = p;
585 unsigned long pfn; 584 unsigned long pfn;
586 struct mem_section *ms; 585 struct mem_section *ms;
587 int nid = pgdat->node_id; 586 int nid = pgdat->node_id;
@@ -935,7 +934,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
935 arg.nr_pages = nr_pages; 934 arg.nr_pages = nr_pages;
936 node_states_check_changes_online(nr_pages, zone, &arg); 935 node_states_check_changes_online(nr_pages, zone, &arg);
937 936
938 nid = page_to_nid(pfn_to_page(pfn)); 937 nid = pfn_to_nid(pfn);
939 938
940 ret = memory_notify(MEM_GOING_ONLINE, &arg); 939 ret = memory_notify(MEM_GOING_ONLINE, &arg);
941 ret = notifier_to_errno(ret); 940 ret = notifier_to_errno(ret);
@@ -1044,17 +1043,23 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
1044} 1043}
1045 1044
1046 1045
1047/* 1046/**
1047 * try_online_node - online a node if offlined
1048 *
1048 * called by cpu_up() to online a node without onlined memory. 1049 * called by cpu_up() to online a node without onlined memory.
1049 */ 1050 */
1050int mem_online_node(int nid) 1051int try_online_node(int nid)
1051{ 1052{
1052 pg_data_t *pgdat; 1053 pg_data_t *pgdat;
1053 int ret; 1054 int ret;
1054 1055
1056 if (node_online(nid))
1057 return 0;
1058
1055 lock_memory_hotplug(); 1059 lock_memory_hotplug();
1056 pgdat = hotadd_new_pgdat(nid, 0); 1060 pgdat = hotadd_new_pgdat(nid, 0);
1057 if (!pgdat) { 1061 if (!pgdat) {
1062 pr_err("Cannot online node %d due to NULL pgdat\n", nid);
1058 ret = -ENOMEM; 1063 ret = -ENOMEM;
1059 goto out; 1064 goto out;
1060 } 1065 }
@@ -1062,6 +1067,12 @@ int mem_online_node(int nid)
1062 ret = register_one_node(nid); 1067 ret = register_one_node(nid);
1063 BUG_ON(ret); 1068 BUG_ON(ret);
1064 1069
1070 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
1071 mutex_lock(&zonelists_mutex);
1072 build_all_zonelists(NULL, NULL);
1073 mutex_unlock(&zonelists_mutex);
1074 }
1075
1065out: 1076out:
1066 unlock_memory_hotplug(); 1077 unlock_memory_hotplug();
1067 return ret; 1078 return ret;
@@ -1412,6 +1423,36 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1412} 1423}
1413#endif /* CONFIG_MOVABLE_NODE */ 1424#endif /* CONFIG_MOVABLE_NODE */
1414 1425
1426static int __init cmdline_parse_movable_node(char *p)
1427{
1428#ifdef CONFIG_MOVABLE_NODE
1429 /*
1430 * Memory used by the kernel cannot be hot-removed because Linux
1431 * cannot migrate the kernel pages. When memory hotplug is
1432 * enabled, we should prevent memblock from allocating memory
1433 * for the kernel.
1434 *
1435 * ACPI SRAT records all hotpluggable memory ranges. But before
1436 * SRAT is parsed, we don't know about it.
1437 *
1438 * The kernel image is loaded into memory at very early time. We
1439 * cannot prevent this anyway. So on NUMA system, we set any
1440 * node the kernel resides in as un-hotpluggable.
1441 *
1442 * Since on modern servers, one node could have double-digit
1443 * gigabytes memory, we can assume the memory around the kernel
1444 * image is also un-hotpluggable. So before SRAT is parsed, just
1445 * allocate memory near the kernel image to try the best to keep
1446 * the kernel away from hotpluggable memory.
1447 */
1448 memblock_set_bottom_up(true);
1449#else
1450 pr_warn("movable_node option not supported\n");
1451#endif
1452 return 0;
1453}
1454early_param("movable_node", cmdline_parse_movable_node);
1455
1415/* check which state of node_states will be changed when offline memory */ 1456/* check which state of node_states will be changed when offline memory */
1416static void node_states_check_changes_offline(unsigned long nr_pages, 1457static void node_states_check_changes_offline(unsigned long nr_pages,
1417 struct zone *zone, struct memory_notify *arg) 1458 struct zone *zone, struct memory_notify *arg)
@@ -1702,7 +1743,7 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1702} 1743}
1703 1744
1704#ifdef CONFIG_MEMORY_HOTREMOVE 1745#ifdef CONFIG_MEMORY_HOTREMOVE
1705static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) 1746static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
1706{ 1747{
1707 int ret = !is_memblock_offlined(mem); 1748 int ret = !is_memblock_offlined(mem);
1708 1749
@@ -1854,7 +1895,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
1854 * if this is not the case. 1895 * if this is not the case.
1855 */ 1896 */
1856 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, 1897 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
1857 is_memblock_offlined_cb); 1898 check_memblock_offlined_cb);
1858 if (ret) { 1899 if (ret) {
1859 unlock_memory_hotplug(); 1900 unlock_memory_hotplug();
1860 BUG(); 1901 BUG();
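
cmdline_parse_movable_node() only flips memblock into bottom-up mode; the point is that early allocations then land near the (low, unmovable) kernel image instead of at the top of memory, which the not-yet-parsed SRAT may later mark hotpluggable. A toy sketch of what top-down versus bottom-up placement means inside one free region (addresses and sizes below are invented, fit checks omitted):

/* Sketch: top-down vs bottom-up placement inside one free region,
 * illustrating what memblock_set_bottom_up(true) changes. */
#include <stdint.h>
#include <stdio.h>

#define ALIGN_DOWN_(x, a) ((x) & ~((uint64_t)(a) - 1))
#define ALIGN_UP_(x, a)   ALIGN_DOWN_((x) + (a) - 1, a)

static uint64_t alloc_top_down(uint64_t base, uint64_t end,
			       uint64_t size, uint64_t align)
{
	return ALIGN_DOWN_(end - size, align);      /* highest fitting address */
}

static uint64_t alloc_bottom_up(uint64_t base, uint64_t end,
				uint64_t size, uint64_t align)
{
	(void)end;
	return ALIGN_UP_(base, align);              /* lowest fitting address */
}

int main(void)
{
	uint64_t base = 0x100000, end = 0x40000000; /* 1MiB .. 1GiB, made up */
	uint64_t size = 0x200000, align = 0x200000; /* one 2MiB block */

	printf("top-down : %#llx\n",
	       (unsigned long long)alloc_top_down(base, end, size, align));
	printf("bottom-up: %#llx\n",
	       (unsigned long long)alloc_bottom_up(base, end, size, align));
	return 0;
}
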
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 04729647f359..c4403cdf3433 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -525,8 +525,9 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
525#ifdef CONFIG_HUGETLB_PAGE 525#ifdef CONFIG_HUGETLB_PAGE
526 int nid; 526 int nid;
527 struct page *page; 527 struct page *page;
528 spinlock_t *ptl;
528 529
529 spin_lock(&vma->vm_mm->page_table_lock); 530 ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
530 page = pte_page(huge_ptep_get((pte_t *)pmd)); 531 page = pte_page(huge_ptep_get((pte_t *)pmd));
531 nid = page_to_nid(page); 532 nid = page_to_nid(page);
532 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 533 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -536,7 +537,7 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
536 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) 537 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
537 isolate_huge_page(page, private); 538 isolate_huge_page(page, private);
538unlock: 539unlock:
539 spin_unlock(&vma->vm_mm->page_table_lock); 540 spin_unlock(ptl);
540#else 541#else
541 BUG(); 542 BUG();
542#endif 543#endif
@@ -1125,7 +1126,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1125 tmp = *from; 1126 tmp = *from;
1126 while (!nodes_empty(tmp)) { 1127 while (!nodes_empty(tmp)) {
1127 int s,d; 1128 int s,d;
1128 int source = -1; 1129 int source = NUMA_NO_NODE;
1129 int dest = 0; 1130 int dest = 0;
1130 1131
1131 for_each_node_mask(s, tmp) { 1132 for_each_node_mask(s, tmp) {
@@ -1160,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1160 if (!node_isset(dest, tmp)) 1161 if (!node_isset(dest, tmp))
1161 break; 1162 break;
1162 } 1163 }
1163 if (source == -1) 1164 if (source == NUMA_NO_NODE)
1164 break; 1165 break;
1165 1166
1166 node_clear(source, tmp); 1167 node_clear(source, tmp);
@@ -1679,6 +1680,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1679 return pol; 1680 return pol;
1680} 1681}
1681 1682
1683bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1684{
1685 struct mempolicy *pol = get_task_policy(task);
1686 if (vma) {
1687 if (vma->vm_ops && vma->vm_ops->get_policy) {
1688 bool ret = false;
1689
1690 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1691 if (pol && (pol->flags & MPOL_F_MOF))
1692 ret = true;
1693 mpol_cond_put(pol);
1694
1695 return ret;
1696 } else if (vma->vm_policy) {
1697 pol = vma->vm_policy;
1698 }
1699 }
1700
1701 if (!pol)
1702 return default_policy.flags & MPOL_F_MOF;
1703
1704 return pol->flags & MPOL_F_MOF;
1705}
1706
1682static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) 1707static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1683{ 1708{
1684 enum zone_type dynamic_policy_zone = policy_zone; 1709 enum zone_type dynamic_policy_zone = policy_zone;
@@ -1811,7 +1836,7 @@ static unsigned offset_il_node(struct mempolicy *pol,
1811 unsigned nnodes = nodes_weight(pol->v.nodes); 1836 unsigned nnodes = nodes_weight(pol->v.nodes);
1812 unsigned target; 1837 unsigned target;
1813 int c; 1838 int c;
1814 int nid = -1; 1839 int nid = NUMA_NO_NODE;
1815 1840
1816 if (!nnodes) 1841 if (!nnodes)
1817 return numa_node_id(); 1842 return numa_node_id();
@@ -1848,11 +1873,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
1848 1873
1849/* 1874/*
1850 * Return the bit number of a random bit set in the nodemask. 1875 * Return the bit number of a random bit set in the nodemask.
1851 * (returns -1 if nodemask is empty) 1876 * (returns NUMA_NO_NODE if nodemask is empty)
1852 */ 1877 */
1853int node_random(const nodemask_t *maskp) 1878int node_random(const nodemask_t *maskp)
1854{ 1879{
1855 int w, bit = -1; 1880 int w, bit = NUMA_NO_NODE;
1856 1881
1857 w = nodes_weight(*maskp); 1882 w = nodes_weight(*maskp);
1858 if (w) 1883 if (w)
@@ -2277,6 +2302,35 @@ static void sp_free(struct sp_node *n)
2277 kmem_cache_free(sn_cache, n); 2302 kmem_cache_free(sn_cache, n);
2278} 2303}
2279 2304
2305#ifdef CONFIG_NUMA_BALANCING
2306static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2307{
2308 /* Never defer a private fault */
2309 if (cpupid_match_pid(p, last_cpupid))
2310 return false;
2311
2312 if (p->numa_migrate_deferred) {
2313 p->numa_migrate_deferred--;
2314 return true;
2315 }
2316 return false;
2317}
2318
2319static inline void defer_numa_migrate(struct task_struct *p)
2320{
2321 p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
2322}
2323#else
2324static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2325{
2326 return false;
2327}
2328
2329static inline void defer_numa_migrate(struct task_struct *p)
2330{
2331}
2332#endif /* CONFIG_NUMA_BALANCING */
2333
2280/** 2334/**
2281 * mpol_misplaced - check whether current page node is valid in policy 2335 * mpol_misplaced - check whether current page node is valid in policy
2282 * 2336 *
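
The new numa_migrate_deferred()/defer_numa_migrate() pair is a per-task countdown: once a shared page's migration has been filtered out, the next sysctl_numa_balancing_migrate_deferred shared-page candidates are skipped as well, and private faults are never deferred. A standalone sketch of the counter (the task struct and the value 16 are stand-ins for the demo):

/* Sketch of the deferral counter added above: after one skipped
 * migration, the next N shared-page candidates are skipped too. */
#include <stdbool.h>
#include <stdio.h>

static int sysctl_numa_balancing_migrate_deferred = 16; /* demo value */

struct task {
	int numa_migrate_deferred;
};

static void defer_numa_migrate(struct task *p)
{
	p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
}

static bool numa_migrate_deferred(struct task *p, bool private_fault)
{
	if (private_fault)              /* never defer a private fault */
		return false;
	if (p->numa_migrate_deferred) {
		p->numa_migrate_deferred--;
		return true;
	}
	return false;
}

int main(void)
{
	struct task t = { 0 };
	int skipped = 0;

	defer_numa_migrate(&t);         /* first shared fault got filtered */
	for (int i = 0; i < 20; i++)
		if (numa_migrate_deferred(&t, false))
			skipped++;
	printf("skipped %d of 20 shared-page candidates\n", skipped); /* 16 */
	return 0;
}
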
@@ -2300,6 +2354,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2300 struct zone *zone; 2354 struct zone *zone;
2301 int curnid = page_to_nid(page); 2355 int curnid = page_to_nid(page);
2302 unsigned long pgoff; 2356 unsigned long pgoff;
2357 int thiscpu = raw_smp_processor_id();
2358 int thisnid = cpu_to_node(thiscpu);
2303 int polnid = -1; 2359 int polnid = -1;
2304 int ret = -1; 2360 int ret = -1;
2305 2361
@@ -2348,9 +2404,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2348 2404
2349 /* Migrate the page towards the node whose CPU is referencing it */ 2405 /* Migrate the page towards the node whose CPU is referencing it */
2350 if (pol->flags & MPOL_F_MORON) { 2406 if (pol->flags & MPOL_F_MORON) {
2351 int last_nid; 2407 int last_cpupid;
2408 int this_cpupid;
2352 2409
2353 polnid = numa_node_id(); 2410 polnid = thisnid;
2411 this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
2354 2412
2355 /* 2413 /*
2356 * Multi-stage node selection is used in conjunction 2414 * Multi-stage node selection is used in conjunction
@@ -2373,8 +2431,25 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2373 * it less likely we act on an unlikely task<->page 2431 * it less likely we act on an unlikely task<->page
2374 * relation. 2432 * relation.
2375 */ 2433 */
2376 last_nid = page_nid_xchg_last(page, polnid); 2434 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
2377 if (last_nid != polnid) 2435 if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
2436
2437 /* See sysctl_numa_balancing_migrate_deferred comment */
2438 if (!cpupid_match_pid(current, last_cpupid))
2439 defer_numa_migrate(current);
2440
2441 goto out;
2442 }
2443
2444 /*
2445 * The quadratic filter above reduces extraneous migration
2446 * of shared pages somewhat. This code reduces it even more,
2447 * reducing the overhead of page migrations of shared pages.
2448 * This makes workloads with shared pages rely more on
2449 * "move task near its memory", and less on "move memory
2450 * towards its task", which is exactly what we want.
2451 */
2452 if (numa_migrate_deferred(current, last_cpupid))
2378 goto out; 2453 goto out;
2379 } 2454 }
2380 2455
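
The MPOL_F_MORON branch above migrates a page toward the faulting node only when the previously recorded cpupid already came from that node; a first fault from a new node merely records it and bails out. A reduced sketch of that two-stage filter, with the cpupid collapsed to a bare node id for clarity:

/* Sketch of the two-stage filter: migrate only when two consecutive
 * hinting faults on the page come from the same node. The "page"
 * below only tracks the last faulting node. */
#include <stdbool.h>
#include <stdio.h>

struct fake_page {
	int last_nid;   /* last node that faulted on this page, -1 = none */
};

static bool should_migrate(struct fake_page *page, int this_nid)
{
	int last = page->last_nid;

	page->last_nid = this_nid;      /* always record the new accessor */
	return last == this_nid;        /* migrate only on a repeat access */
}

int main(void)
{
	struct fake_page page = { .last_nid = -1 };
	int faults[] = { 0, 1, 1, 0, 0, 0 };

	for (unsigned i = 0; i < sizeof(faults) / sizeof(faults[0]); i++)
		printf("fault from node %d -> %s\n", faults[i],
		       should_migrate(&page, faults[i]) ? "migrate" : "skip");
	return 0;
}
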
@@ -2840,62 +2915,45 @@ out:
2840 * @maxlen: length of @buffer 2915 * @maxlen: length of @buffer
2841 * @pol: pointer to mempolicy to be formatted 2916 * @pol: pointer to mempolicy to be formatted
2842 * 2917 *
2843 * Convert a mempolicy into a string. 2918 * Convert @pol into a string. If @buffer is too short, truncate the string.
2844 * Returns the number of characters in buffer (if positive) 2919 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2845 * or an error (negative) 2920 * longest flag, "relative", and to display at least a few node ids.
2846 */ 2921 */
2847int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) 2922void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2848{ 2923{
2849 char *p = buffer; 2924 char *p = buffer;
2850 int l; 2925 nodemask_t nodes = NODE_MASK_NONE;
2851 nodemask_t nodes; 2926 unsigned short mode = MPOL_DEFAULT;
2852 unsigned short mode; 2927 unsigned short flags = 0;
2853 unsigned short flags = pol ? pol->flags : 0;
2854 2928
2855 /* 2929 if (pol && pol != &default_policy) {
2856 * Sanity check: room for longest mode, flag and some nodes
2857 */
2858 VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2859
2860 if (!pol || pol == &default_policy)
2861 mode = MPOL_DEFAULT;
2862 else
2863 mode = pol->mode; 2930 mode = pol->mode;
2931 flags = pol->flags;
2932 }
2864 2933
2865 switch (mode) { 2934 switch (mode) {
2866 case MPOL_DEFAULT: 2935 case MPOL_DEFAULT:
2867 nodes_clear(nodes);
2868 break; 2936 break;
2869
2870 case MPOL_PREFERRED: 2937 case MPOL_PREFERRED:
2871 nodes_clear(nodes);
2872 if (flags & MPOL_F_LOCAL) 2938 if (flags & MPOL_F_LOCAL)
2873 mode = MPOL_LOCAL; 2939 mode = MPOL_LOCAL;
2874 else 2940 else
2875 node_set(pol->v.preferred_node, nodes); 2941 node_set(pol->v.preferred_node, nodes);
2876 break; 2942 break;
2877
2878 case MPOL_BIND: 2943 case MPOL_BIND:
2879 /* Fall through */
2880 case MPOL_INTERLEAVE: 2944 case MPOL_INTERLEAVE:
2881 nodes = pol->v.nodes; 2945 nodes = pol->v.nodes;
2882 break; 2946 break;
2883
2884 default: 2947 default:
2885 return -EINVAL; 2948 WARN_ON_ONCE(1);
2949 snprintf(p, maxlen, "unknown");
2950 return;
2886 } 2951 }
2887 2952
2888 l = strlen(policy_modes[mode]); 2953 p += snprintf(p, maxlen, policy_modes[mode]);
2889 if (buffer + maxlen < p + l + 1)
2890 return -ENOSPC;
2891
2892 strcpy(p, policy_modes[mode]);
2893 p += l;
2894 2954
2895 if (flags & MPOL_MODE_FLAGS) { 2955 if (flags & MPOL_MODE_FLAGS) {
2896 if (buffer + maxlen < p + 2) 2956 p += snprintf(p, buffer + maxlen - p, "=");
2897 return -ENOSPC;
2898 *p++ = '=';
2899 2957
2900 /* 2958 /*
2901 * Currently, the only defined flags are mutually exclusive 2959 * Currently, the only defined flags are mutually exclusive
@@ -2907,10 +2965,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2907 } 2965 }
2908 2966
2909 if (!nodes_empty(nodes)) { 2967 if (!nodes_empty(nodes)) {
2910 if (buffer + maxlen < p + 2) 2968 p += snprintf(p, buffer + maxlen - p, ":");
2911 return -ENOSPC;
2912 *p++ = ':';
2913 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); 2969 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2914 } 2970 }
2915 return p - buffer;
2916} 2971}
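
mpol_to_str() now returns void and silently truncates with snprintf instead of reporting -ENOSPC, since callers are expected to pass a buffer of at least roughly 32 bytes anyway. A userspace sketch of the same append-and-truncate pattern (policy names and the node list are hard-coded for the demo):

/* Sketch of building a "mode=flag:nodelist" string with snprintf and
 * silent truncation, in the spirit of the new void mpol_to_str(). */
#include <stdio.h>

static void policy_to_str(char *buf, size_t maxlen,
			  const char *mode, const char *flag,
			  const char *nodes)
{
	size_t off = 0;

	off += snprintf(buf + off, maxlen - off, "%s", mode);
	if (flag && off < maxlen)
		off += snprintf(buf + off, maxlen - off, "=%s", flag);
	if (nodes && off < maxlen)
		off += snprintf(buf + off, maxlen - off, ":%s", nodes);
	/* on overflow snprintf truncates; there is no -ENOSPC to propagate */
}

int main(void)
{
	char big[64], small[8];

	policy_to_str(big, sizeof(big), "interleave", "relative", "0-3");
	policy_to_str(small, sizeof(small), "interleave", "relative", "0-3");
	printf("big  : %s\n", big);    /* interleave=relative:0-3 */
	printf("small: %s\n", small);  /* truncated to "interle" */
	return 0;
}
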
diff --git a/mm/migrate.c b/mm/migrate.c
index 9c8d5f59d30b..316e720a2023 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -107,7 +107,7 @@ void putback_movable_pages(struct list_head *l)
107 list_del(&page->lru); 107 list_del(&page->lru);
108 dec_zone_page_state(page, NR_ISOLATED_ANON + 108 dec_zone_page_state(page, NR_ISOLATED_ANON +
109 page_is_file_cache(page)); 109 page_is_file_cache(page));
110 if (unlikely(balloon_page_movable(page))) 110 if (unlikely(isolated_balloon_page(page)))
111 balloon_page_putback(page); 111 balloon_page_putback(page);
112 else 112 else
113 putback_lru_page(page); 113 putback_lru_page(page);
@@ -130,7 +130,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
130 ptep = huge_pte_offset(mm, addr); 130 ptep = huge_pte_offset(mm, addr);
131 if (!ptep) 131 if (!ptep)
132 goto out; 132 goto out;
133 ptl = &mm->page_table_lock; 133 ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
134 } else { 134 } else {
135 pmd = mm_find_pmd(mm, addr); 135 pmd = mm_find_pmd(mm, addr);
136 if (!pmd) 136 if (!pmd)
@@ -161,6 +161,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
161 161
162 get_page(new); 162 get_page(new);
163 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 163 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
164 if (pte_swp_soft_dirty(*ptep))
165 pte = pte_mksoft_dirty(pte);
164 if (is_write_migration_entry(entry)) 166 if (is_write_migration_entry(entry))
165 pte = pte_mkwrite(pte); 167 pte = pte_mkwrite(pte);
166#ifdef CONFIG_HUGETLB_PAGE 168#ifdef CONFIG_HUGETLB_PAGE
@@ -247,9 +249,10 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
247 __migration_entry_wait(mm, ptep, ptl); 249 __migration_entry_wait(mm, ptep, ptl);
248} 250}
249 251
250void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) 252void migration_entry_wait_huge(struct vm_area_struct *vma,
253 struct mm_struct *mm, pte_t *pte)
251{ 254{
252 spinlock_t *ptl = &(mm)->page_table_lock; 255 spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
253 __migration_entry_wait(mm, pte, ptl); 256 __migration_entry_wait(mm, pte, ptl);
254} 257}
255 258
@@ -443,6 +446,8 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
443 */ 446 */
444void migrate_page_copy(struct page *newpage, struct page *page) 447void migrate_page_copy(struct page *newpage, struct page *page)
445{ 448{
449 int cpupid;
450
446 if (PageHuge(page) || PageTransHuge(page)) 451 if (PageHuge(page) || PageTransHuge(page))
447 copy_huge_page(newpage, page); 452 copy_huge_page(newpage, page);
448 else 453 else
@@ -479,6 +484,13 @@ void migrate_page_copy(struct page *newpage, struct page *page)
479 __set_page_dirty_nobuffers(newpage); 484 __set_page_dirty_nobuffers(newpage);
480 } 485 }
481 486
487 /*
488 * Copy NUMA information to the new page, to prevent over-eager
489 * future migrations of this same page.
490 */
491 cpupid = page_cpupid_xchg_last(page, -1);
492 page_cpupid_xchg_last(newpage, cpupid);
493
482 mlock_migrate_page(newpage, page); 494 mlock_migrate_page(newpage, page);
483 ksm_migrate_page(newpage, page); 495 ksm_migrate_page(newpage, page);
484 /* 496 /*
@@ -1498,7 +1510,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
1498 __GFP_NOWARN) & 1510 __GFP_NOWARN) &
1499 ~GFP_IOFS, 0); 1511 ~GFP_IOFS, 0);
1500 if (newpage) 1512 if (newpage)
1501 page_nid_xchg_last(newpage, page_nid_last(page)); 1513 page_cpupid_xchg_last(newpage, page_cpupid_last(page));
1502 1514
1503 return newpage; 1515 return newpage;
1504} 1516}
@@ -1599,7 +1611,8 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1599 * node. Caller is expected to have an elevated reference count on 1611 * node. Caller is expected to have an elevated reference count on
1600 * the page that will be dropped by this function before returning. 1612 * the page that will be dropped by this function before returning.
1601 */ 1613 */
1602int migrate_misplaced_page(struct page *page, int node) 1614int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
1615 int node)
1603{ 1616{
1604 pg_data_t *pgdat = NODE_DATA(node); 1617 pg_data_t *pgdat = NODE_DATA(node);
1605 int isolated; 1618 int isolated;
@@ -1607,10 +1620,11 @@ int migrate_misplaced_page(struct page *page, int node)
1607 LIST_HEAD(migratepages); 1620 LIST_HEAD(migratepages);
1608 1621
1609 /* 1622 /*
1610 * Don't migrate pages that are mapped in multiple processes. 1623 * Don't migrate file pages that are mapped in multiple processes
1611 * TODO: Handle false sharing detection instead of this hammer 1624 * with execute permissions as they are probably shared libraries.
1612 */ 1625 */
1613 if (page_mapcount(page) != 1) 1626 if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
1627 (vma->vm_flags & VM_EXEC))
1614 goto out; 1628 goto out;
1615 1629
1616 /* 1630 /*
@@ -1653,6 +1667,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1653 unsigned long address, 1667 unsigned long address,
1654 struct page *page, int node) 1668 struct page *page, int node)
1655{ 1669{
1670 spinlock_t *ptl;
1656 unsigned long haddr = address & HPAGE_PMD_MASK; 1671 unsigned long haddr = address & HPAGE_PMD_MASK;
1657 pg_data_t *pgdat = NODE_DATA(node); 1672 pg_data_t *pgdat = NODE_DATA(node);
1658 int isolated = 0; 1673 int isolated = 0;
@@ -1661,13 +1676,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1661 int page_lru = page_is_file_cache(page); 1676 int page_lru = page_is_file_cache(page);
1662 1677
1663 /* 1678 /*
1664 * Don't migrate pages that are mapped in multiple processes.
1665 * TODO: Handle false sharing detection instead of this hammer
1666 */
1667 if (page_mapcount(page) != 1)
1668 goto out_dropref;
1669
1670 /*
1671 * Rate-limit the amount of data that is being migrated to a node. 1679 * Rate-limit the amount of data that is being migrated to a node.
1672 * Optimal placement is no good if the memory bus is saturated and 1680 * Optimal placement is no good if the memory bus is saturated and
1673 * all the time is being spent migrating! 1681 * all the time is being spent migrating!
@@ -1680,7 +1688,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1680 if (!new_page) 1688 if (!new_page)
1681 goto out_fail; 1689 goto out_fail;
1682 1690
1683 page_nid_xchg_last(new_page, page_nid_last(page)); 1691 page_cpupid_xchg_last(new_page, page_cpupid_last(page));
1684 1692
1685 isolated = numamigrate_isolate_page(pgdat, page); 1693 isolated = numamigrate_isolate_page(pgdat, page);
1686 if (!isolated) { 1694 if (!isolated) {
@@ -1699,9 +1707,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1699 WARN_ON(PageLRU(new_page)); 1707 WARN_ON(PageLRU(new_page));
1700 1708
1701 /* Recheck the target PMD */ 1709 /* Recheck the target PMD */
1702 spin_lock(&mm->page_table_lock); 1710 ptl = pmd_lock(mm, pmd);
1703 if (unlikely(!pmd_same(*pmd, entry))) { 1711 if (unlikely(!pmd_same(*pmd, entry))) {
1704 spin_unlock(&mm->page_table_lock); 1712 spin_unlock(ptl);
1705 1713
1706 /* Reverse changes made by migrate_page_copy() */ 1714 /* Reverse changes made by migrate_page_copy() */
1707 if (TestClearPageActive(new_page)) 1715 if (TestClearPageActive(new_page))
@@ -1713,12 +1721,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1713 unlock_page(new_page); 1721 unlock_page(new_page);
1714 put_page(new_page); /* Free it */ 1722 put_page(new_page); /* Free it */
1715 1723
1716 unlock_page(page); 1724 /* Retake the callers reference and putback on LRU */
1725 get_page(page);
1717 putback_lru_page(page); 1726 putback_lru_page(page);
1718 1727 mod_zone_page_state(page_zone(page),
1719 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 1728 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
1720 isolated = 0; 1729 goto out_fail;
1721 goto out;
1722 } 1730 }
1723 1731
1724 /* 1732 /*
@@ -1735,9 +1743,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1735 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1743 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1736 entry = pmd_mkhuge(entry); 1744 entry = pmd_mkhuge(entry);
1737 1745
1738 page_add_new_anon_rmap(new_page, vma, haddr); 1746 pmdp_clear_flush(vma, haddr, pmd);
1739
1740 set_pmd_at(mm, haddr, pmd, entry); 1747 set_pmd_at(mm, haddr, pmd, entry);
1748 page_add_new_anon_rmap(new_page, vma, haddr);
1741 update_mmu_cache_pmd(vma, address, &entry); 1749 update_mmu_cache_pmd(vma, address, &entry);
1742 page_remove_rmap(page); 1750 page_remove_rmap(page);
1743 /* 1751 /*
@@ -1746,7 +1754,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1746 * before it's fully transferred to the new page. 1754 * before it's fully transferred to the new page.
1747 */ 1755 */
1748 mem_cgroup_end_migration(memcg, page, new_page, true); 1756 mem_cgroup_end_migration(memcg, page, new_page, true);
1749 spin_unlock(&mm->page_table_lock); 1757 spin_unlock(ptl);
1750 1758
1751 unlock_page(new_page); 1759 unlock_page(new_page);
1752 unlock_page(page); 1760 unlock_page(page);
@@ -1756,7 +1764,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1756 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); 1764 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
1757 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); 1765 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
1758 1766
1759out:
1760 mod_zone_page_state(page_zone(page), 1767 mod_zone_page_state(page_zone(page),
1761 NR_ISOLATED_ANON + page_lru, 1768 NR_ISOLATED_ANON + page_lru,
1762 -HPAGE_PMD_NR); 1769 -HPAGE_PMD_NR);
@@ -1765,6 +1772,10 @@ out:
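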
1765out_fail: 1772out_fail:
1766 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 1773 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1767out_dropref: 1774out_dropref:
1775 entry = pmd_mknonnuma(entry);
1776 set_pmd_at(mm, haddr, pmd, entry);
1777 update_mmu_cache_pmd(vma, address, &entry);
1778
1768 unlock_page(page); 1779 unlock_page(page);
1769 put_page(page); 1780 put_page(page);
1770 return 0; 1781 return 0;
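
Several of the migrate.c changes revolve around the new cpupid value: migrate_page_copy() now carries the last-accessor cpupid over to the new page so the copy does not immediately look untouched and bounce straight back, and the allocation paths seed new pages the same way via page_cpupid_xchg_last(). A sketch of packing a cpu number and the low pid bits into one integer, with invented field widths (the kernel derives the real widths from the spare page-flags bits):

/* Sketch of packing a cpu and the low bits of a pid into one integer,
 * in the spirit of the cpupid values used here. The 8/16 bit split is
 * made up for the demo. */
#include <stdio.h>

#define DEMO_CPU_BITS  8
#define DEMO_PID_BITS  16
#define DEMO_CPU_MASK  ((1u << DEMO_CPU_BITS) - 1)
#define DEMO_PID_MASK  ((1u << DEMO_PID_BITS) - 1)

static unsigned int make_cpupid(unsigned int cpu, unsigned int pid)
{
	return ((pid & DEMO_PID_MASK) << DEMO_CPU_BITS) | (cpu & DEMO_CPU_MASK);
}

static unsigned int cpupid_to_cpu(unsigned int cpupid)
{
	return cpupid & DEMO_CPU_MASK;
}

static unsigned int cpupid_to_pid(unsigned int cpupid)
{
	return (cpupid >> DEMO_CPU_BITS) & DEMO_PID_MASK;
}

int main(void)
{
	unsigned int old_page_cpupid = make_cpupid(3, 4242);
	unsigned int new_page_cpupid;

	/* migrate_page_copy(): carry the access history over to the copy */
	new_page_cpupid = old_page_cpupid;

	printf("cpu %u, pid-low-bits %u survive the migration\n",
	       cpupid_to_cpu(new_page_cpupid), cpupid_to_pid(new_page_cpupid));
	return 0;
}
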
diff --git a/mm/mlock.c b/mm/mlock.c
index d63802663242..d480cd6fc475 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -379,10 +379,14 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
379 379
380 /* 380 /*
381 * Initialize pte walk starting at the already pinned page where we 381 * Initialize pte walk starting at the already pinned page where we
382 * are sure that there is a pte. 382 * are sure that there is a pte, as it was pinned under the same
383 * mmap_sem write op.
383 */ 384 */
384 pte = get_locked_pte(vma->vm_mm, start, &ptl); 385 pte = get_locked_pte(vma->vm_mm, start, &ptl);
385 end = min(end, pmd_addr_end(start, end)); 386 /* Make sure we do not cross the page table boundary */
387 end = pgd_addr_end(start, end);
388 end = pud_addr_end(start, end);
389 end = pmd_addr_end(start, end);
386 390
387 /* The page next to the pinned page is the first we will try to get */ 391 /* The page next to the pinned page is the first we will try to get */
388 start += PAGE_SIZE; 392 start += PAGE_SIZE;
@@ -736,6 +740,7 @@ static int do_mlockall(int flags)
736 740
737 /* Ignore errors */ 741 /* Ignore errors */
738 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); 742 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
743 cond_resched();
739 } 744 }
740out: 745out:
741 return 0; 746 return 0;
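
The mlock fix clamps the munlock pte walk to the next pgd, pud and pmd boundary so the pagevec fill never wanders out of the page-table page pinned by the initial get_locked_pte(). The clamping itself is plain arithmetic; a sketch with an invented 2MiB table span:

/* Sketch of clamping a walk's end address to the next table boundary,
 * as done with pgd/pud/pmd_addr_end() above. The boundary size here is
 * illustrative, not a real paging-level constant. */
#include <stdint.h>
#include <stdio.h>

static uint64_t addr_end(uint64_t addr, uint64_t end, uint64_t table_size)
{
	uint64_t boundary = (addr & ~(table_size - 1)) + table_size;

	return boundary < end ? boundary : end;     /* never cross the table */
}

int main(void)
{
	uint64_t start = 0x2ff000, end = 0x600000;
	uint64_t pmd_size = 0x200000;               /* 2MiB worth of ptes */

	end = addr_end(start, end, pmd_size);
	printf("walk limited to [%#llx, %#llx)\n",
	       (unsigned long long)start, (unsigned long long)end);
	/* -> [0x2ff000, 0x400000): stays inside one pte table */
	return 0;
}
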
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 633c08863fd8..68562e92d50c 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void)
71 unsigned long or_mask, add_mask; 71 unsigned long or_mask, add_mask;
72 72
73 shift = 8 * sizeof(unsigned long); 73 shift = 8 * sizeof(unsigned long);
74 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT; 74 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT;
75 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", 75 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
76 "Section %d Node %d Zone %d Lastnid %d Flags %d\n", 76 "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n",
77 SECTIONS_WIDTH, 77 SECTIONS_WIDTH,
78 NODES_WIDTH, 78 NODES_WIDTH,
79 ZONES_WIDTH, 79 ZONES_WIDTH,
80 LAST_NID_WIDTH, 80 LAST_CPUPID_WIDTH,
81 NR_PAGEFLAGS); 81 NR_PAGEFLAGS);
82 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", 82 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
83 "Section %d Node %d Zone %d Lastnid %d\n", 83 "Section %d Node %d Zone %d Lastcpupid %d\n",
84 SECTIONS_SHIFT, 84 SECTIONS_SHIFT,
85 NODES_SHIFT, 85 NODES_SHIFT,
86 ZONES_SHIFT, 86 ZONES_SHIFT,
87 LAST_NID_SHIFT); 87 LAST_CPUPID_SHIFT);
88 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", 88 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
89 "Section %lu Node %lu Zone %lu Lastnid %lu\n", 89 "Section %lu Node %lu Zone %lu Lastcpupid %lu\n",
90 (unsigned long)SECTIONS_PGSHIFT, 90 (unsigned long)SECTIONS_PGSHIFT,
91 (unsigned long)NODES_PGSHIFT, 91 (unsigned long)NODES_PGSHIFT,
92 (unsigned long)ZONES_PGSHIFT, 92 (unsigned long)ZONES_PGSHIFT,
93 (unsigned long)LAST_NID_PGSHIFT); 93 (unsigned long)LAST_CPUPID_PGSHIFT);
94 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", 94 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
95 "Node/Zone ID: %lu -> %lu\n", 95 "Node/Zone ID: %lu -> %lu\n",
96 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), 96 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
@@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void)
102 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", 102 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
103 "Node not in page flags"); 103 "Node not in page flags");
104#endif 104#endif
105#ifdef LAST_NID_NOT_IN_PAGE_FLAGS 105#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
106 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", 106 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
107 "Last nid not in page flags"); 107 "Last cpupid not in page flags");
108#endif 108#endif
109 109
110 if (SECTIONS_WIDTH) { 110 if (SECTIONS_WIDTH) {
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d548512ff8a..834b2d785f1e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -179,14 +179,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
179 goto error; 179 goto error;
180 } 180 }
181 181
182 allowed = (totalram_pages - hugetlb_total_pages()) 182 allowed = vm_commit_limit();
183 * sysctl_overcommit_ratio / 100;
184 /* 183 /*
185 * Reserve some for root 184 * Reserve some for root
186 */ 185 */
187 if (!cap_sys_admin) 186 if (!cap_sys_admin)
188 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 187 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
189 allowed += total_swap_pages;
190 188
191 /* 189 /*
192 * Don't let a single process grow so big a user can't recover 190 * Don't let a single process grow so big a user can't recover
@@ -1299,7 +1297,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1299 vm_flags &= ~VM_MAYEXEC; 1297 vm_flags &= ~VM_MAYEXEC;
1300 } 1298 }
1301 1299
1302 if (!file->f_op || !file->f_op->mmap) 1300 if (!file->f_op->mmap)
1303 return -ENODEV; 1301 return -ENODEV;
1304 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) 1302 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1305 return -EINVAL; 1303 return -EINVAL;
@@ -1856,7 +1854,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1856 struct vm_area_struct *vma; 1854 struct vm_area_struct *vma;
1857 struct vm_unmapped_area_info info; 1855 struct vm_unmapped_area_info info;
1858 1856
1859 if (len > TASK_SIZE) 1857 if (len > TASK_SIZE - mmap_min_addr)
1860 return -ENOMEM; 1858 return -ENOMEM;
1861 1859
1862 if (flags & MAP_FIXED) 1860 if (flags & MAP_FIXED)
@@ -1865,14 +1863,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1865 if (addr) { 1863 if (addr) {
1866 addr = PAGE_ALIGN(addr); 1864 addr = PAGE_ALIGN(addr);
1867 vma = find_vma(mm, addr); 1865 vma = find_vma(mm, addr);
1868 if (TASK_SIZE - len >= addr && 1866 if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
1869 (!vma || addr + len <= vma->vm_start)) 1867 (!vma || addr + len <= vma->vm_start))
1870 return addr; 1868 return addr;
1871 } 1869 }
1872 1870
1873 info.flags = 0; 1871 info.flags = 0;
1874 info.length = len; 1872 info.length = len;
1875 info.low_limit = TASK_UNMAPPED_BASE; 1873 info.low_limit = mm->mmap_base;
1876 info.high_limit = TASK_SIZE; 1874 info.high_limit = TASK_SIZE;
1877 info.align_mask = 0; 1875 info.align_mask = 0;
1878 return vm_unmapped_area(&info); 1876 return vm_unmapped_area(&info);
@@ -1895,7 +1893,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1895 struct vm_unmapped_area_info info; 1893 struct vm_unmapped_area_info info;
1896 1894
1897 /* requested length too big for entire address space */ 1895 /* requested length too big for entire address space */
1898 if (len > TASK_SIZE) 1896 if (len > TASK_SIZE - mmap_min_addr)
1899 return -ENOMEM; 1897 return -ENOMEM;
1900 1898
1901 if (flags & MAP_FIXED) 1899 if (flags & MAP_FIXED)
@@ -1905,14 +1903,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1905 if (addr) { 1903 if (addr) {
1906 addr = PAGE_ALIGN(addr); 1904 addr = PAGE_ALIGN(addr);
1907 vma = find_vma(mm, addr); 1905 vma = find_vma(mm, addr);
1908 if (TASK_SIZE - len >= addr && 1906 if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
1909 (!vma || addr + len <= vma->vm_start)) 1907 (!vma || addr + len <= vma->vm_start))
1910 return addr; 1908 return addr;
1911 } 1909 }
1912 1910
1913 info.flags = VM_UNMAPPED_AREA_TOPDOWN; 1911 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
1914 info.length = len; 1912 info.length = len;
1915 info.low_limit = PAGE_SIZE; 1913 info.low_limit = max(PAGE_SIZE, mmap_min_addr);
1916 info.high_limit = mm->mmap_base; 1914 info.high_limit = mm->mmap_base;
1917 info.align_mask = 0; 1915 info.align_mask = 0;
1918 addr = vm_unmapped_area(&info); 1916 addr = vm_unmapped_area(&info);
@@ -1951,7 +1949,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1951 return -ENOMEM; 1949 return -ENOMEM;
1952 1950
1953 get_area = current->mm->get_unmapped_area; 1951 get_area = current->mm->get_unmapped_area;
1954 if (file && file->f_op && file->f_op->get_unmapped_area) 1952 if (file && file->f_op->get_unmapped_area)
1955 get_area = file->f_op->get_unmapped_area; 1953 get_area = file->f_op->get_unmapped_area;
1956 addr = get_area(file, addr, len, pgoff, flags); 1954 addr = get_area(file, addr, len, pgoff, flags);
1957 if (IS_ERR_VALUE(addr)) 1955 if (IS_ERR_VALUE(addr))
@@ -2726,7 +2724,8 @@ void exit_mmap(struct mm_struct *mm)
2726 } 2724 }
2727 vm_unacct_memory(nr_accounted); 2725 vm_unacct_memory(nr_accounted);
2728 2726
2729 WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); 2727 WARN_ON(atomic_long_read(&mm->nr_ptes) >
2728 (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2730} 2729}
2731 2730
2732/* Insert vm structure into process list sorted by address 2731/* Insert vm structure into process list sorted by address
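
Both __vm_enough_memory() implementations (here and in mm/nommu.c below) now call a shared vm_commit_limit() helper; judging by the removed open-coded version and the dropped "allowed += total_swap_pages" line, the helper computes (totalram - hugetlb) * overcommit_ratio / 100 + swap in one place. A sketch of that arithmetic with made-up machine sizes:

/* Sketch of the commit-limit formula now centralised in vm_commit_limit().
 * All page counts below are invented for the demo. */
#include <stdio.h>

static unsigned long vm_commit_limit(unsigned long totalram_pages,
				     unsigned long hugetlb_pages,
				     unsigned long total_swap_pages,
				     unsigned long overcommit_ratio)
{
	return (totalram_pages - hugetlb_pages) * overcommit_ratio / 100
		+ total_swap_pages;
}

int main(void)
{
	unsigned long totalram = 4UL << 20;   /* 4M pages = 16GiB of 4KiB pages */
	unsigned long hugetlb  = 256UL << 10; /* 1GiB reserved as huge pages */
	unsigned long swap     = 1UL << 20;   /* 4GiB of swap */

	printf("commit limit: %lu pages\n",
	       vm_commit_limit(totalram, hugetlb, swap, 50));
	return 0;
}
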
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 2ac0afbd68f3..bf34fb8556db 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec)
97 INIT_LIST_HEAD(&lruvec->lists[lru]); 97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98} 98}
99 99
100#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) 100#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
101int page_nid_xchg_last(struct page *page, int nid) 101int page_cpupid_xchg_last(struct page *page, int cpupid)
102{ 102{
103 unsigned long old_flags, flags; 103 unsigned long old_flags, flags;
104 int last_nid; 104 int last_cpupid;
105 105
106 do { 106 do {
107 old_flags = flags = page->flags; 107 old_flags = flags = page->flags;
108 last_nid = page_nid_last(page); 108 last_cpupid = page_cpupid_last(page);
109 109
110 flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); 110 flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
111 flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; 111 flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
112 } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); 112 } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
113 113
114 return last_nid; 114 return last_cpupid;
115} 115}
116#endif 116#endif
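
page_cpupid_xchg_last() updates the cpupid bits inside page->flags without taking a lock: it rebuilds the flags word and retries a compare-and-swap until no concurrent flags update raced with it. A userspace sketch of the same retry loop with C11 atomics (mask and shift values are demo-only):

/* Sketch of the lockless read-modify-write used by page_cpupid_xchg_last():
 * splice new bits into a shared flags word with compare_exchange, retrying
 * if another update raced with us. */
#include <stdatomic.h>
#include <stdio.h>

#define CPUPID_SHIFT 8
#define CPUPID_MASK  0xffffUL

static _Atomic unsigned long page_flags;

static unsigned long cpupid_xchg_last(unsigned long cpupid)
{
	unsigned long old_flags, new_flags, last;

	do {
		old_flags = atomic_load(&page_flags);
		last = (old_flags >> CPUPID_SHIFT) & CPUPID_MASK;

		new_flags = old_flags & ~(CPUPID_MASK << CPUPID_SHIFT);
		new_flags |= (cpupid & CPUPID_MASK) << CPUPID_SHIFT;
	} while (!atomic_compare_exchange_weak(&page_flags, &old_flags,
					       new_flags));

	return last;
}

int main(void)
{
	atomic_store(&page_flags, 0x5UL);              /* unrelated flag bits */
	printf("previous cpupid %#lx\n", cpupid_xchg_last(0x1234));
	printf("previous cpupid %#lx\n", cpupid_xchg_last(0x4321));
	printf("flags now %#lx\n", atomic_load(&page_flags));
	return 0;
}
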
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94722a4d6b43..26667971c824 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -37,14 +37,12 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
37 37
38static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 38static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
39 unsigned long addr, unsigned long end, pgprot_t newprot, 39 unsigned long addr, unsigned long end, pgprot_t newprot,
40 int dirty_accountable, int prot_numa, bool *ret_all_same_node) 40 int dirty_accountable, int prot_numa)
41{ 41{
42 struct mm_struct *mm = vma->vm_mm; 42 struct mm_struct *mm = vma->vm_mm;
43 pte_t *pte, oldpte; 43 pte_t *pte, oldpte;
44 spinlock_t *ptl; 44 spinlock_t *ptl;
45 unsigned long pages = 0; 45 unsigned long pages = 0;
46 bool all_same_node = true;
47 int last_nid = -1;
48 46
49 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 47 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
50 arch_enter_lazy_mmu_mode(); 48 arch_enter_lazy_mmu_mode();
@@ -63,15 +61,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
63 61
64 page = vm_normal_page(vma, addr, oldpte); 62 page = vm_normal_page(vma, addr, oldpte);
65 if (page) { 63 if (page) {
66 int this_nid = page_to_nid(page); 64 if (!pte_numa(oldpte)) {
67 if (last_nid == -1)
68 last_nid = this_nid;
69 if (last_nid != this_nid)
70 all_same_node = false;
71
72 /* only check non-shared pages */
73 if (!pte_numa(oldpte) &&
74 page_mapcount(page) == 1) {
75 ptent = pte_mknuma(ptent); 65 ptent = pte_mknuma(ptent);
76 updated = true; 66 updated = true;
77 } 67 }
@@ -94,40 +84,27 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
94 swp_entry_t entry = pte_to_swp_entry(oldpte); 84 swp_entry_t entry = pte_to_swp_entry(oldpte);
95 85
96 if (is_write_migration_entry(entry)) { 86 if (is_write_migration_entry(entry)) {
87 pte_t newpte;
97 /* 88 /*
98 * A protection check is difficult so 89 * A protection check is difficult so
99 * just be safe and disable write 90 * just be safe and disable write
100 */ 91 */
101 make_migration_entry_read(&entry); 92 make_migration_entry_read(&entry);
102 set_pte_at(mm, addr, pte, 93 newpte = swp_entry_to_pte(entry);
103 swp_entry_to_pte(entry)); 94 if (pte_swp_soft_dirty(oldpte))
95 newpte = pte_swp_mksoft_dirty(newpte);
96 set_pte_at(mm, addr, pte, newpte);
97
98 pages++;
104 } 99 }
105 pages++;
106 } 100 }
107 } while (pte++, addr += PAGE_SIZE, addr != end); 101 } while (pte++, addr += PAGE_SIZE, addr != end);
108 arch_leave_lazy_mmu_mode(); 102 arch_leave_lazy_mmu_mode();
109 pte_unmap_unlock(pte - 1, ptl); 103 pte_unmap_unlock(pte - 1, ptl);
110 104
111 *ret_all_same_node = all_same_node;
112 return pages; 105 return pages;
113} 106}
114 107
115#ifdef CONFIG_NUMA_BALANCING
116static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
117 pmd_t *pmd)
118{
119 spin_lock(&mm->page_table_lock);
120 set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
121 spin_unlock(&mm->page_table_lock);
122}
123#else
124static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
125 pmd_t *pmd)
126{
127 BUG();
128}
129#endif /* CONFIG_NUMA_BALANCING */
130
131static inline unsigned long change_pmd_range(struct vm_area_struct *vma, 108static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
132 pud_t *pud, unsigned long addr, unsigned long end, 109 pud_t *pud, unsigned long addr, unsigned long end,
133 pgprot_t newprot, int dirty_accountable, int prot_numa) 110 pgprot_t newprot, int dirty_accountable, int prot_numa)
@@ -135,36 +112,39 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
135 pmd_t *pmd; 112 pmd_t *pmd;
136 unsigned long next; 113 unsigned long next;
137 unsigned long pages = 0; 114 unsigned long pages = 0;
138 bool all_same_node; 115 unsigned long nr_huge_updates = 0;
139 116
140 pmd = pmd_offset(pud, addr); 117 pmd = pmd_offset(pud, addr);
141 do { 118 do {
119 unsigned long this_pages;
120
142 next = pmd_addr_end(addr, end); 121 next = pmd_addr_end(addr, end);
143 if (pmd_trans_huge(*pmd)) { 122 if (pmd_trans_huge(*pmd)) {
144 if (next - addr != HPAGE_PMD_SIZE) 123 if (next - addr != HPAGE_PMD_SIZE)
145 split_huge_page_pmd(vma, addr, pmd); 124 split_huge_page_pmd(vma, addr, pmd);
146 else if (change_huge_pmd(vma, pmd, addr, newprot, 125 else {
147 prot_numa)) { 126 int nr_ptes = change_huge_pmd(vma, pmd, addr,
148 pages += HPAGE_PMD_NR; 127 newprot, prot_numa);
149 continue; 128
129 if (nr_ptes) {
130 if (nr_ptes == HPAGE_PMD_NR) {
131 pages += HPAGE_PMD_NR;
132 nr_huge_updates++;
133 }
134 continue;
135 }
150 } 136 }
151 /* fall through */ 137 /* fall through */
152 } 138 }
153 if (pmd_none_or_clear_bad(pmd)) 139 if (pmd_none_or_clear_bad(pmd))
154 continue; 140 continue;
155 pages += change_pte_range(vma, pmd, addr, next, newprot, 141 this_pages = change_pte_range(vma, pmd, addr, next, newprot,
156 dirty_accountable, prot_numa, &all_same_node); 142 dirty_accountable, prot_numa);
157 143 pages += this_pages;
158 /*
159 * If we are changing protections for NUMA hinting faults then
160 * set pmd_numa if the examined pages were all on the same
161 * node. This allows a regular PMD to be handled as one fault
162 * and effectively batches the taking of the PTL
163 */
164 if (prot_numa && all_same_node)
165 change_pmd_protnuma(vma->vm_mm, addr, pmd);
166 } while (pmd++, addr = next, addr != end); 144 } while (pmd++, addr = next, addr != end);
167 145
146 if (nr_huge_updates)
147 count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
168 return pages; 148 return pages;
169} 149}
170 150
diff --git a/mm/mremap.c b/mm/mremap.c
index 91b13d6a16d4..0843feb66f3d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -25,7 +25,6 @@
25#include <asm/uaccess.h> 25#include <asm/uaccess.h>
26#include <asm/cacheflush.h> 26#include <asm/cacheflush.h>
27#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
28#include <asm/pgalloc.h>
29 28
30#include "internal.h" 29#include "internal.h"
31 30
@@ -63,10 +62,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
63 return NULL; 62 return NULL;
64 63
65 pmd = pmd_alloc(mm, pud, addr); 64 pmd = pmd_alloc(mm, pud, addr);
66 if (!pmd) { 65 if (!pmd)
67 pud_free(mm, pud);
68 return NULL; 66 return NULL;
69 }
70 67
71 VM_BUG_ON(pmd_trans_huge(*pmd)); 68 VM_BUG_ON(pmd_trans_huge(*pmd));
72 69
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 61107cf55bb3..2c254d374655 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -82,27 +82,18 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
82 82
83static void __init __free_pages_memory(unsigned long start, unsigned long end) 83static void __init __free_pages_memory(unsigned long start, unsigned long end)
84{ 84{
85 unsigned long i, start_aligned, end_aligned; 85 int order;
86 int order = ilog2(BITS_PER_LONG);
87 86
88 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); 87 while (start < end) {
89 end_aligned = end & ~(BITS_PER_LONG - 1); 88 order = min(MAX_ORDER - 1UL, __ffs(start));
90 89
91 if (end_aligned <= start_aligned) { 90 while (start + (1UL << order) > end)
92 for (i = start; i < end; i++) 91 order--;
93 __free_pages_bootmem(pfn_to_page(i), 0);
94 92
95 return; 93 __free_pages_bootmem(pfn_to_page(start), order);
96 }
97
98 for (i = start; i < start_aligned; i++)
99 __free_pages_bootmem(pfn_to_page(i), 0);
100 94
101 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) 95 start += (1UL << order);
102 __free_pages_bootmem(pfn_to_page(i), order); 96 }
103
104 for (i = end_aligned; i < end; i++)
105 __free_pages_bootmem(pfn_to_page(i), 0);
106} 97}
107 98
108static unsigned long __init __free_memory_core(phys_addr_t start, 99static unsigned long __init __free_memory_core(phys_addr_t start,
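
The rewritten __free_pages_memory() walks an arbitrary pfn range and hands back the largest buddy-order chunks it can: the order is capped both by the alignment of the current start pfn (__ffs) and by how much of the range is left. A standalone sketch of that chunking, printing what would be freed:

/* Sketch of the chunking in the new __free_pages_memory(): pick the
 * biggest order allowed by the current pfn's alignment and by the
 * space left, free it, advance. MAX_ORDER_DEMO mirrors MAX_ORDER - 1. */
#include <stdio.h>

#define MAX_ORDER_DEMO 10UL   /* largest chunk: 1024 pages */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static void free_pages_range(unsigned long start, unsigned long end)
{
	unsigned long order;

	while (start < end) {
		/* __builtin_ctzl plays the role of __ffs(start) */
		order = min_ul(MAX_ORDER_DEMO,
			       start ? __builtin_ctzl(start) : MAX_ORDER_DEMO);

		while (start + (1UL << order) > end)
			order--;

		printf("free pfn %lu, order %lu (%lu pages)\n",
		       start, order, 1UL << order);
		start += 1UL << order;
	}
}

int main(void)
{
	free_pages_range(5, 1050);   /* deliberately unaligned range */
	return 0;
}
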
diff --git a/mm/nommu.c b/mm/nommu.c
index ecd1f158548e..fec093adad9c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -937,7 +937,7 @@ static int validate_mmap_request(struct file *file,
937 struct address_space *mapping; 937 struct address_space *mapping;
938 938
939 /* files must support mmap */ 939 /* files must support mmap */
940 if (!file->f_op || !file->f_op->mmap) 940 if (!file->f_op->mmap)
941 return -ENODEV; 941 return -ENODEV;
942 942
943 /* work out if what we've got could possibly be shared 943 /* work out if what we've got could possibly be shared
@@ -1948,13 +1948,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1948 goto error; 1948 goto error;
1949 } 1949 }
1950 1950
1951 allowed = totalram_pages * sysctl_overcommit_ratio / 100; 1951 allowed = vm_commit_limit();
1952 /* 1952 /*
1953 * Reserve some 3% for root 1953 * Reserve some 3% for root
1954 */ 1954 */
1955 if (!cap_sys_admin) 1955 if (!cap_sys_admin)
1956 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); 1956 allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
1957 allowed += total_swap_pages;
1958 1957
1959 /* 1958 /*
1960 * Don't let a single process grow so big a user can't recover 1959 * Don't let a single process grow so big a user can't recover
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 314e9d274381..1e4a600a6163 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -161,7 +161,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
161 * The baseline for the badness score is the proportion of RAM that each 161 * The baseline for the badness score is the proportion of RAM that each
162 * task's rss, pagetable and swap space use. 162 * task's rss, pagetable and swap space use.
163 */ 163 */
164 points = get_mm_rss(p->mm) + p->mm->nr_ptes + 164 points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
165 get_mm_counter(p->mm, MM_SWAPENTS); 165 get_mm_counter(p->mm, MM_SWAPENTS);
166 task_unlock(p); 166 task_unlock(p);
167 167
@@ -364,10 +364,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
364 continue; 364 continue;
365 } 365 }
366 366
367 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n", 367 pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n",
368 task->pid, from_kuid(&init_user_ns, task_uid(task)), 368 task->pid, from_kuid(&init_user_ns, task_uid(task)),
369 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 369 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
370 task->mm->nr_ptes, 370 atomic_long_read(&task->mm->nr_ptes),
371 get_mm_counter(task->mm, MM_SWAPENTS), 371 get_mm_counter(task->mm, MM_SWAPENTS),
372 task->signal->oom_score_adj, task->comm); 372 task->signal->oom_score_adj, task->comm);
373 task_unlock(task); 373 task_unlock(task);
@@ -680,7 +680,7 @@ void pagefault_out_of_memory(void)
680{ 680{
681 struct zonelist *zonelist; 681 struct zonelist *zonelist;
682 682
683 if (mem_cgroup_oom_synchronize()) 683 if (mem_cgroup_oom_synchronize(true))
684 return; 684 return;
685 685
686 zonelist = node_zonelist(first_online_node, GFP_KERNEL); 686 zonelist = node_zonelist(first_online_node, GFP_KERNEL);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f5236f804aa6..63807583d8e8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1210,11 +1210,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
1210 return 1; 1210 return 1;
1211} 1211}
1212 1212
1213static long bdi_max_pause(struct backing_dev_info *bdi, 1213static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
1214 unsigned long bdi_dirty) 1214 unsigned long bdi_dirty)
1215{ 1215{
1216 long bw = bdi->avg_write_bandwidth; 1216 unsigned long bw = bdi->avg_write_bandwidth;
1217 long t; 1217 unsigned long t;
1218 1218
1219 /* 1219 /*
1220 * Limit pause time for small memory systems. If sleeping for too long 1220 * Limit pause time for small memory systems. If sleeping for too long
@@ -1226,7 +1226,7 @@ static long bdi_max_pause(struct backing_dev_info *bdi,
1226 t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); 1226 t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
1227 t++; 1227 t++;
1228 1228
1229 return min_t(long, t, MAX_PAUSE); 1229 return min_t(unsigned long, t, MAX_PAUSE);
1230} 1230}
1231 1231
1232static long bdi_min_pause(struct backing_dev_info *bdi, 1232static long bdi_min_pause(struct backing_dev_info *bdi,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0ee638f76ebe..580a5f075ed0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -234,8 +234,8 @@ int page_group_by_mobility_disabled __read_mostly;
234 234
235void set_pageblock_migratetype(struct page *page, int migratetype) 235void set_pageblock_migratetype(struct page *page, int migratetype)
236{ 236{
237 237 if (unlikely(page_group_by_mobility_disabled &&
238 if (unlikely(page_group_by_mobility_disabled)) 238 migratetype < MIGRATE_PCPTYPES))
239 migratetype = MIGRATE_UNMOVABLE; 239 migratetype = MIGRATE_UNMOVABLE;
240 240
241 set_pageblock_flags_group(page, (unsigned long)migratetype, 241 set_pageblock_flags_group(page, (unsigned long)migratetype,
@@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page)
626 bad_page(page); 626 bad_page(page);
627 return 1; 627 return 1;
628 } 628 }
629 page_nid_reset_last(page); 629 page_cpupid_reset_last(page);
630 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 630 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
631 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 631 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
632 return 0; 632 return 0;
@@ -1027,6 +1027,10 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
1027{ 1027{
1028 int current_order = page_order(page); 1028 int current_order = page_order(page);
1029 1029
1030 /*
1031 * When borrowing from MIGRATE_CMA, we need to release the excess
1032 * buddy pages to CMA itself.
1033 */
1030 if (is_migrate_cma(fallback_type)) 1034 if (is_migrate_cma(fallback_type))
1031 return fallback_type; 1035 return fallback_type;
1032 1036
@@ -1091,21 +1095,11 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1091 list_del(&page->lru); 1095 list_del(&page->lru);
1092 rmv_page_order(page); 1096 rmv_page_order(page);
1093 1097
1094 /*
1095 * Borrow the excess buddy pages as well, irrespective
1096 * of whether we stole freepages, or took ownership of
1097 * the pageblock or not.
1098 *
1099 * Exception: When borrowing from MIGRATE_CMA, release
1100 * the excess buddy pages to CMA itself.
1101 */
1102 expand(zone, page, order, current_order, area, 1098 expand(zone, page, order, current_order, area,
1103 is_migrate_cma(migratetype) 1099 new_type);
1104 ? migratetype : start_migratetype);
1105 1100
1106 trace_mm_page_alloc_extfrag(page, order, 1101 trace_mm_page_alloc_extfrag(page, order, current_order,
1107 current_order, start_migratetype, migratetype, 1102 start_migratetype, migratetype, new_type);
1108 new_type == start_migratetype);
1109 1103
1110 return page; 1104 return page;
1111 } 1105 }
@@ -1711,7 +1705,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1711 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1705 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1712 * that have to skip over a lot of full or unallowed zones. 1706 * that have to skip over a lot of full or unallowed zones.
1713 * 1707 *
1714 * If the zonelist cache is present in the passed in zonelist, then 1708 * If the zonelist cache is present in the passed zonelist, then
1715 * returns a pointer to the allowed node mask (either the current 1709 * returns a pointer to the allowed node mask (either the current
1716 * tasks mems_allowed, or node_states[N_MEMORY].) 1710 * tasks mems_allowed, or node_states[N_MEMORY].)
1717 * 1711 *
@@ -2593,7 +2587,7 @@ rebalance:
2593 * running out of options and have to consider going OOM 2587 * running out of options and have to consider going OOM
2594 */ 2588 */
2595 if (!did_some_progress) { 2589 if (!did_some_progress) {
2596 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 2590 if (oom_gfp_allowed(gfp_mask)) {
2597 if (oom_killer_disabled) 2591 if (oom_killer_disabled)
2598 goto nopage; 2592 goto nopage;
2599 /* Coredumps can quickly deplete all memory reserves */ 2593 /* Coredumps can quickly deplete all memory reserves */
@@ -3881,8 +3875,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
3881 return ffz(~size); 3875 return ffz(~size);
3882} 3876}
3883 3877
3884#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3885
3886/* 3878/*
3887 * Check if a pageblock contains reserved pages 3879 * Check if a pageblock contains reserved pages
3888 */ 3880 */
@@ -4015,7 +4007,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
4015 mminit_verify_page_links(page, zone, nid, pfn); 4007 mminit_verify_page_links(page, zone, nid, pfn);
4016 init_page_count(page); 4008 init_page_count(page);
4017 page_mapcount_reset(page); 4009 page_mapcount_reset(page);
4018 page_nid_reset_last(page); 4010 page_cpupid_reset_last(page);
4019 SetPageReserved(page); 4011 SetPageReserved(page);
4020 /* 4012 /*
4021 * Mark the block movable so that blocks are reserved for 4013 * Mark the block movable so that blocks are reserved for
@@ -4266,7 +4258,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
4266 */ 4258 */
4267 zone->pageset = &boot_pageset; 4259 zone->pageset = &boot_pageset;
4268 4260
4269 if (zone->present_pages) 4261 if (populated_zone(zone))
4270 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", 4262 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
4271 zone->name, zone->present_pages, 4263 zone->name, zone->present_pages,
4272 zone_batchsize(zone)); 4264 zone_batchsize(zone));
@@ -5160,7 +5152,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
5160 5152
5161 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { 5153 for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
5162 struct zone *zone = &pgdat->node_zones[zone_type]; 5154 struct zone *zone = &pgdat->node_zones[zone_type];
5163 if (zone->present_pages) { 5155 if (populated_zone(zone)) {
5164 node_set_state(nid, N_HIGH_MEMORY); 5156 node_set_state(nid, N_HIGH_MEMORY);
5165 if (N_NORMAL_MEMORY != N_HIGH_MEMORY && 5157 if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
5166 zone_type <= ZONE_NORMAL) 5158 zone_type <= ZONE_NORMAL)
@@ -6366,10 +6358,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6366 list_del(&page->lru); 6358 list_del(&page->lru);
6367 rmv_page_order(page); 6359 rmv_page_order(page);
6368 zone->free_area[order].nr_free--; 6360 zone->free_area[order].nr_free--;
6369#ifdef CONFIG_HIGHMEM
6370 if (PageHighMem(page))
6371 totalhigh_pages -= 1 << order;
6372#endif
6373 for (i = 0; i < (1 << order); i++) 6361 for (i = 0; i < (1 << order); i++)
6374 SetPageReserved((page+i)); 6362 SetPageReserved((page+i));
6375 pfn += (1 << order); 6363 pfn += (1 << order);
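
The page_alloc.c hunks above mostly replace open-coded tests with helpers: the OOM-retry condition becomes oom_gfp_allowed() and the zone->present_pages checks become populated_zone(). The following is a minimal userspace sketch of the first helper only; the flag bit values and the demo harness are invented for the illustration and are not the kernel definitions.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical bit values, for the demo only. */
#define DEMO_GFP_FS      (1u << 0)
#define DEMO_GFP_NORETRY (1u << 1)

/* OOM killing only makes sense for fs-capable allocations that will retry. */
static bool oom_gfp_allowed(unsigned int gfp_mask)
{
        return (gfp_mask & DEMO_GFP_FS) && !(gfp_mask & DEMO_GFP_NORETRY);
}

int main(void)
{
        printf("FS only:      %d\n", oom_gfp_allowed(DEMO_GFP_FS));
        printf("FS | NORETRY: %d\n",
               oom_gfp_allowed(DEMO_GFP_FS | DEMO_GFP_NORETRY));
        return 0;
}
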
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 5da2cbcfdbb5..2beeabf502c5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -242,7 +242,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
242 if (err) 242 if (err)
243 break; 243 break;
244 pgd++; 244 pgd++;
245 } while (addr = next, addr != end); 245 } while (addr = next, addr < end);
246 246
247 return err; 247 return err;
248} 248}
diff --git a/mm/percpu.c b/mm/percpu.c
index 8c8e08f3a692..0d10defe951e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1706,8 +1706,9 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1706 1706
1707out_free_areas: 1707out_free_areas:
1708 for (group = 0; group < ai->nr_groups; group++) 1708 for (group = 0; group < ai->nr_groups; group++)
1709 free_fn(areas[group], 1709 if (areas[group])
1710 ai->groups[group].nr_units * ai->unit_size); 1710 free_fn(areas[group],
1711 ai->groups[group].nr_units * ai->unit_size);
1711out_free: 1712out_free:
1712 pcpu_free_alloc_info(ai); 1713 pcpu_free_alloc_info(ai);
1713 if (areas) 1714 if (areas)
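
The percpu.c error path above now frees only the group areas that were actually allocated before the failure. A small sketch of that guarded-cleanup pattern, with invented names and a plain malloc() standing in for the percpu allocator:

#include <stdlib.h>

/* Allocate one buffer per group; on failure, free only what succeeded. */
static void **alloc_groups(size_t nr_groups, size_t size)
{
        void **areas = calloc(nr_groups, sizeof(*areas));
        size_t i;

        if (!areas)
                return NULL;

        for (i = 0; i < nr_groups; i++) {
                areas[i] = malloc(size);
                if (!areas[i])
                        goto out_free_areas;
        }
        return areas;

out_free_areas:
        for (i = 0; i < nr_groups; i++)
                if (areas[i])           /* the guard added by the hunk above */
                        free(areas[i]);
        free(areas);
        return NULL;
}

int main(void)
{
        void **g = alloc_groups(4, 128);

        if (g) {
                for (size_t i = 0; i < 4; i++)
                        free(g[i]);
                free(g);
        }
        return 0;
}
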
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 3929a40bd6c0..cbb38545d9d6 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -151,14 +151,14 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
151void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 151void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
152 pgtable_t pgtable) 152 pgtable_t pgtable)
153{ 153{
154 assert_spin_locked(&mm->page_table_lock); 154 assert_spin_locked(pmd_lockptr(mm, pmdp));
155 155
156 /* FIFO */ 156 /* FIFO */
157 if (!mm->pmd_huge_pte) 157 if (!pmd_huge_pte(mm, pmdp))
158 INIT_LIST_HEAD(&pgtable->lru); 158 INIT_LIST_HEAD(&pgtable->lru);
159 else 159 else
160 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); 160 list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
161 mm->pmd_huge_pte = pgtable; 161 pmd_huge_pte(mm, pmdp) = pgtable;
162} 162}
163#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 163#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
164#endif 164#endif
@@ -170,14 +170,14 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
170{ 170{
171 pgtable_t pgtable; 171 pgtable_t pgtable;
172 172
173 assert_spin_locked(&mm->page_table_lock); 173 assert_spin_locked(pmd_lockptr(mm, pmdp));
174 174
175 /* FIFO */ 175 /* FIFO */
176 pgtable = mm->pmd_huge_pte; 176 pgtable = pmd_huge_pte(mm, pmdp);
177 if (list_empty(&pgtable->lru)) 177 if (list_empty(&pgtable->lru))
178 mm->pmd_huge_pte = NULL; 178 pmd_huge_pte(mm, pmdp) = NULL;
179 else { 179 else {
180 mm->pmd_huge_pte = list_entry(pgtable->lru.next, 180 pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next,
181 struct page, lru); 181 struct page, lru);
182 list_del(&pgtable->lru); 182 list_del(&pgtable->lru);
183 } 183 }
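
The pgtable-generic.c hunks stop assuming one mm-wide page_table_lock and instead assert and use the lock that pmd_lockptr() picks for the particular pmd, reaching the deposited page table through pmd_huge_pte(mm, pmdp). Below is a hedged userspace analogue of that split-lock idea — a lock chosen per object from a small pool — with every identifier invented for the sketch:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NR_LOCKS 16

static pthread_mutex_t lock_pool[NR_LOCKS];

/* Pick a lock based on the object's address, the way pmd_lockptr() picks a
 * lock for the pmd instead of using one lock for the whole mm. */
static pthread_mutex_t *obj_lockptr(const void *obj)
{
        return &lock_pool[((uintptr_t)obj >> 4) % NR_LOCKS];
}

struct counter {
        long value;
};

static void counter_inc(struct counter *c)
{
        pthread_mutex_t *ptl = obj_lockptr(c);

        pthread_mutex_lock(ptl);
        c->value++;                     /* protected by the per-object lock */
        pthread_mutex_unlock(ptl);
}

int main(void)
{
        struct counter a = { 0 }, b = { 0 };
        int i;

        for (i = 0; i < NR_LOCKS; i++)
                pthread_mutex_init(&lock_pool[i], NULL);

        counter_inc(&a);
        counter_inc(&b);        /* usually a different lock than 'a' uses */
        printf("%ld %ld\n", a.value, b.value);
        return 0;
}
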
diff --git a/mm/readahead.c b/mm/readahead.c
index e4ed04149785..7cdbb44aa90b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -401,6 +401,7 @@ ondemand_readahead(struct address_space *mapping,
401 unsigned long req_size) 401 unsigned long req_size)
402{ 402{
403 unsigned long max = max_sane_readahead(ra->ra_pages); 403 unsigned long max = max_sane_readahead(ra->ra_pages);
404 pgoff_t prev_offset;
404 405
405 /* 406 /*
406 * start of file 407 * start of file
@@ -452,8 +453,11 @@ ondemand_readahead(struct address_space *mapping,
452 453
453 /* 454 /*
454 * sequential cache miss 455 * sequential cache miss
456 * trivial case: (offset - prev_offset) == 1
457 * unaligned reads: (offset - prev_offset) == 0
455 */ 458 */
456 if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) 459 prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
460 if (offset - prev_offset <= 1UL)
457 goto initial_readahead; 461 goto initial_readahead;
458 462
459 /* 463 /*
@@ -569,7 +573,7 @@ static ssize_t
569do_readahead(struct address_space *mapping, struct file *filp, 573do_readahead(struct address_space *mapping, struct file *filp,
570 pgoff_t index, unsigned long nr) 574 pgoff_t index, unsigned long nr)
571{ 575{
572 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) 576 if (!mapping || !mapping->a_ops)
573 return -EINVAL; 577 return -EINVAL;
574 578
575 force_page_cache_readahead(mapping, filp, index, nr); 579 force_page_cache_readahead(mapping, filp, index, nr);
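
The readahead hunk converts the 64-bit byte position prev_pos into a page index through an explicitly widened shift and documents that a sequential cache miss covers both the aligned (delta 1) and unaligned (delta 0) continuation. A standalone sketch of that conversion and test, assuming a 4 KiB page size and invented names:

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT 12      /* 4 KiB pages, assumed for the example */

/* Convert a byte position into the index of the page that contains it. */
static unsigned long pos_to_index(uint64_t pos)
{
        return (unsigned long)(pos >> DEMO_PAGE_SHIFT);
}

/* Sequential miss: the new read starts on the same page as the previous
 * read ended (unaligned, delta 0) or on the very next page (delta 1). */
static int is_sequential(unsigned long offset, uint64_t prev_pos)
{
        unsigned long prev_offset = pos_to_index(prev_pos);

        return offset - prev_offset <= 1UL;
}

int main(void)
{
        printf("%d\n", is_sequential(3, 2 * 4096 + 100)); /* 1: next page  */
        printf("%d\n", is_sequential(2, 2 * 4096 + 100)); /* 1: same page  */
        printf("%d\n", is_sequential(9, 2 * 4096 + 100)); /* 0: a real seek */
        return 0;
}
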
diff --git a/mm/rmap.c b/mm/rmap.c
index fd3ee7a54a13..55c8b8dc9ffb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -601,7 +601,7 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
601 601
602 if (unlikely(PageHuge(page))) { 602 if (unlikely(PageHuge(page))) {
603 pte = huge_pte_offset(mm, address); 603 pte = huge_pte_offset(mm, address);
604 ptl = &mm->page_table_lock; 604 ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
605 goto check; 605 goto check;
606 } 606 }
607 607
@@ -665,25 +665,23 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
665 unsigned long *vm_flags) 665 unsigned long *vm_flags)
666{ 666{
667 struct mm_struct *mm = vma->vm_mm; 667 struct mm_struct *mm = vma->vm_mm;
668 spinlock_t *ptl;
668 int referenced = 0; 669 int referenced = 0;
669 670
670 if (unlikely(PageTransHuge(page))) { 671 if (unlikely(PageTransHuge(page))) {
671 pmd_t *pmd; 672 pmd_t *pmd;
672 673
673 spin_lock(&mm->page_table_lock);
674 /* 674 /*
675 * rmap might return false positives; we must filter 675 * rmap might return false positives; we must filter
676 * these out using page_check_address_pmd(). 676 * these out using page_check_address_pmd().
677 */ 677 */
678 pmd = page_check_address_pmd(page, mm, address, 678 pmd = page_check_address_pmd(page, mm, address,
679 PAGE_CHECK_ADDRESS_PMD_FLAG); 679 PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
680 if (!pmd) { 680 if (!pmd)
681 spin_unlock(&mm->page_table_lock);
682 goto out; 681 goto out;
683 }
684 682
685 if (vma->vm_flags & VM_LOCKED) { 683 if (vma->vm_flags & VM_LOCKED) {
686 spin_unlock(&mm->page_table_lock); 684 spin_unlock(ptl);
687 *mapcount = 0; /* break early from loop */ 685 *mapcount = 0; /* break early from loop */
688 *vm_flags |= VM_LOCKED; 686 *vm_flags |= VM_LOCKED;
689 goto out; 687 goto out;
@@ -692,10 +690,9 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
692 /* go ahead even if the pmd is pmd_trans_splitting() */ 690 /* go ahead even if the pmd is pmd_trans_splitting() */
693 if (pmdp_clear_flush_young_notify(vma, address, pmd)) 691 if (pmdp_clear_flush_young_notify(vma, address, pmd))
694 referenced++; 692 referenced++;
695 spin_unlock(&mm->page_table_lock); 693 spin_unlock(ptl);
696 } else { 694 } else {
697 pte_t *pte; 695 pte_t *pte;
698 spinlock_t *ptl;
699 696
700 /* 697 /*
701 * rmap might return false positives; we must filter 698 * rmap might return false positives; we must filter
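
In the rmap.c hunks, page_check_address_pmd() now reports, through the new ptl argument, which page-table lock it took, so the caller unlocks exactly that lock instead of assuming mm->page_table_lock. A userspace sketch of this "helper returns the lock it acquired" contract, with invented names:

#include <pthread.h>
#include <stdio.h>

struct record {
        pthread_mutex_t lock;
        int value;
};

/*
 * Look the record up and return it locked; *lockp tells the caller which
 * lock to drop (mirroring the ptl out-parameter in the hunk above).
 */
static struct record *record_find_lock(struct record *tbl, int idx,
                                       pthread_mutex_t **lockp)
{
        struct record *r = &tbl[idx];

        pthread_mutex_lock(&r->lock);
        *lockp = &r->lock;
        return r;
}

int main(void)
{
        struct record tbl[2] = {
                { PTHREAD_MUTEX_INITIALIZER, 10 },
                { PTHREAD_MUTEX_INITIALIZER, 20 },
        };
        pthread_mutex_t *ptl;
        struct record *r = record_find_lock(tbl, 1, &ptl);

        printf("%d\n", r->value);
        pthread_mutex_unlock(ptl);      /* unlock what the helper locked */
        return 0;
}
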
diff --git a/mm/slab.c b/mm/slab.c
index 2580db062df9..0c8967bb2018 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3982,7 +3982,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3982 3982
3983 VM_BUG_ON(!mutex_is_locked(&slab_mutex)); 3983 VM_BUG_ON(!mutex_is_locked(&slab_mutex));
3984 for_each_memcg_cache_index(i) { 3984 for_each_memcg_cache_index(i) {
3985 c = cache_from_memcg(cachep, i); 3985 c = cache_from_memcg_idx(cachep, i);
3986 if (c) 3986 if (c)
3987 /* return value determined by the parent cache only */ 3987 /* return value determined by the parent cache only */
3988 __do_tune_cpucache(c, limit, batchcount, shared, gfp); 3988 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
diff --git a/mm/slab.h b/mm/slab.h
index a535033f7e9a..0859c4241ba1 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -160,7 +160,8 @@ static inline const char *cache_name(struct kmem_cache *s)
160 return s->name; 160 return s->name;
161} 161}
162 162
163static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) 163static inline struct kmem_cache *
164cache_from_memcg_idx(struct kmem_cache *s, int idx)
164{ 165{
165 if (!s->memcg_params) 166 if (!s->memcg_params)
166 return NULL; 167 return NULL;
@@ -204,7 +205,8 @@ static inline const char *cache_name(struct kmem_cache *s)
204 return s->name; 205 return s->name;
205} 206}
206 207
207static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) 208static inline struct kmem_cache *
209cache_from_memcg_idx(struct kmem_cache *s, int idx)
208{ 210{
209 return NULL; 211 return NULL;
210} 212}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a3443278ce3a..0b7bb399b0e4 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -56,6 +56,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
56 continue; 56 continue;
57 } 57 }
58 58
59#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
59 /* 60 /*
60 * For simplicity, we won't check this in the list of memcg 61 * For simplicity, we won't check this in the list of memcg
61 * caches. We have control over memcg naming, and if there 62 * caches. We have control over memcg naming, and if there
@@ -69,6 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
69 s = NULL; 70 s = NULL;
70 return -EINVAL; 71 return -EINVAL;
71 } 72 }
73#endif
72 } 74 }
73 75
74 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 76 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
@@ -569,7 +571,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
569 return; 571 return;
570 572
571 for_each_memcg_cache_index(i) { 573 for_each_memcg_cache_index(i) {
572 c = cache_from_memcg(s, i); 574 c = cache_from_memcg_idx(s, i);
573 if (!c) 575 if (!c)
574 continue; 576 continue;
575 577
diff --git a/mm/slub.c b/mm/slub.c
index c3eb3d3ca835..7e8bd8d828bc 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -955,7 +955,7 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
955 kmemleak_free_recursive(x, s->flags); 955 kmemleak_free_recursive(x, s->flags);
956 956
957 /* 957 /*
958 * Trouble is that we may no longer disable interupts in the fast path 958 * Trouble is that we may no longer disable interrupts in the fast path
959 * So in order to make the debug calls that expect irqs to be 959 * So in order to make the debug calls that expect irqs to be
960 * disabled we need to disable interrupts temporarily. 960 * disabled we need to disable interrupts temporarily.
961 */ 961 */
@@ -4983,7 +4983,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
4983 * through the descendants with best-effort propagation. 4983 * through the descendants with best-effort propagation.
4984 */ 4984 */
4985 for_each_memcg_cache_index(i) { 4985 for_each_memcg_cache_index(i) {
4986 struct kmem_cache *c = cache_from_memcg(s, i); 4986 struct kmem_cache *c = cache_from_memcg_idx(s, i);
4987 if (c) 4987 if (c)
4988 attribute->store(c, buf, len); 4988 attribute->store(c, buf, len);
4989 } 4989 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 4ac1d7ef548f..8cc7be0e9590 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -590,33 +590,32 @@ void __init sparse_init(void)
590 590
591#ifdef CONFIG_MEMORY_HOTPLUG 591#ifdef CONFIG_MEMORY_HOTPLUG
592#ifdef CONFIG_SPARSEMEM_VMEMMAP 592#ifdef CONFIG_SPARSEMEM_VMEMMAP
593static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, 593static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
594 unsigned long nr_pages)
595{ 594{
596 /* This will make the necessary allocations eventually. */ 595 /* This will make the necessary allocations eventually. */
597 return sparse_mem_map_populate(pnum, nid); 596 return sparse_mem_map_populate(pnum, nid);
598} 597}
599static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) 598static void __kfree_section_memmap(struct page *memmap)
600{ 599{
601 unsigned long start = (unsigned long)memmap; 600 unsigned long start = (unsigned long)memmap;
602 unsigned long end = (unsigned long)(memmap + nr_pages); 601 unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
603 602
604 vmemmap_free(start, end); 603 vmemmap_free(start, end);
605} 604}
606#ifdef CONFIG_MEMORY_HOTREMOVE 605#ifdef CONFIG_MEMORY_HOTREMOVE
607static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) 606static void free_map_bootmem(struct page *memmap)
608{ 607{
609 unsigned long start = (unsigned long)memmap; 608 unsigned long start = (unsigned long)memmap;
610 unsigned long end = (unsigned long)(memmap + nr_pages); 609 unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
611 610
612 vmemmap_free(start, end); 611 vmemmap_free(start, end);
613} 612}
614#endif /* CONFIG_MEMORY_HOTREMOVE */ 613#endif /* CONFIG_MEMORY_HOTREMOVE */
615#else 614#else
616static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 615static struct page *__kmalloc_section_memmap(void)
617{ 616{
618 struct page *page, *ret; 617 struct page *page, *ret;
619 unsigned long memmap_size = sizeof(struct page) * nr_pages; 618 unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
620 619
621 page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); 620 page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
622 if (page) 621 if (page)
@@ -634,28 +633,30 @@ got_map_ptr:
634 return ret; 633 return ret;
635} 634}
636 635
637static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, 636static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
638 unsigned long nr_pages)
639{ 637{
640 return __kmalloc_section_memmap(nr_pages); 638 return __kmalloc_section_memmap();
641} 639}
642 640
643static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) 641static void __kfree_section_memmap(struct page *memmap)
644{ 642{
645 if (is_vmalloc_addr(memmap)) 643 if (is_vmalloc_addr(memmap))
646 vfree(memmap); 644 vfree(memmap);
647 else 645 else
648 free_pages((unsigned long)memmap, 646 free_pages((unsigned long)memmap,
649 get_order(sizeof(struct page) * nr_pages)); 647 get_order(sizeof(struct page) * PAGES_PER_SECTION));
650} 648}
651 649
652#ifdef CONFIG_MEMORY_HOTREMOVE 650#ifdef CONFIG_MEMORY_HOTREMOVE
653static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) 651static void free_map_bootmem(struct page *memmap)
654{ 652{
655 unsigned long maps_section_nr, removing_section_nr, i; 653 unsigned long maps_section_nr, removing_section_nr, i;
656 unsigned long magic; 654 unsigned long magic, nr_pages;
657 struct page *page = virt_to_page(memmap); 655 struct page *page = virt_to_page(memmap);
658 656
657 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
658 >> PAGE_SHIFT;
659
659 for (i = 0; i < nr_pages; i++, page++) { 660 for (i = 0; i < nr_pages; i++, page++) {
660 magic = (unsigned long) page->lru.next; 661 magic = (unsigned long) page->lru.next;
661 662
@@ -684,8 +685,7 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
684 * set. If this is <=0, then that means that the passed-in 685 * set. If this is <=0, then that means that the passed-in
685 * map was not consumed and must be freed. 686 * map was not consumed and must be freed.
686 */ 687 */
687int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, 688int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
688 int nr_pages)
689{ 689{
690 unsigned long section_nr = pfn_to_section_nr(start_pfn); 690 unsigned long section_nr = pfn_to_section_nr(start_pfn);
691 struct pglist_data *pgdat = zone->zone_pgdat; 691 struct pglist_data *pgdat = zone->zone_pgdat;
@@ -702,12 +702,12 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
702 ret = sparse_index_init(section_nr, pgdat->node_id); 702 ret = sparse_index_init(section_nr, pgdat->node_id);
703 if (ret < 0 && ret != -EEXIST) 703 if (ret < 0 && ret != -EEXIST)
704 return ret; 704 return ret;
705 memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); 705 memmap = kmalloc_section_memmap(section_nr, pgdat->node_id);
706 if (!memmap) 706 if (!memmap)
707 return -ENOMEM; 707 return -ENOMEM;
708 usemap = __kmalloc_section_usemap(); 708 usemap = __kmalloc_section_usemap();
709 if (!usemap) { 709 if (!usemap) {
710 __kfree_section_memmap(memmap, nr_pages); 710 __kfree_section_memmap(memmap);
711 return -ENOMEM; 711 return -ENOMEM;
712 } 712 }
713 713
@@ -719,7 +719,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
719 goto out; 719 goto out;
720 } 720 }
721 721
722 memset(memmap, 0, sizeof(struct page) * nr_pages); 722 memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION);
723 723
724 ms->section_mem_map |= SECTION_MARKED_PRESENT; 724 ms->section_mem_map |= SECTION_MARKED_PRESENT;
725 725
@@ -729,7 +729,7 @@ out:
729 pgdat_resize_unlock(pgdat, &flags); 729 pgdat_resize_unlock(pgdat, &flags);
730 if (ret <= 0) { 730 if (ret <= 0) {
731 kfree(usemap); 731 kfree(usemap);
732 __kfree_section_memmap(memmap, nr_pages); 732 __kfree_section_memmap(memmap);
733 } 733 }
734 return ret; 734 return ret;
735} 735}
@@ -759,7 +759,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
759static void free_section_usemap(struct page *memmap, unsigned long *usemap) 759static void free_section_usemap(struct page *memmap, unsigned long *usemap)
760{ 760{
761 struct page *usemap_page; 761 struct page *usemap_page;
762 unsigned long nr_pages;
763 762
764 if (!usemap) 763 if (!usemap)
765 return; 764 return;
@@ -771,7 +770,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
771 if (PageSlab(usemap_page) || PageCompound(usemap_page)) { 770 if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
772 kfree(usemap); 771 kfree(usemap);
773 if (memmap) 772 if (memmap)
774 __kfree_section_memmap(memmap, PAGES_PER_SECTION); 773 __kfree_section_memmap(memmap);
775 return; 774 return;
776 } 775 }
777 776
@@ -780,12 +779,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
780 * on the section which has pgdat at boot time. Just keep it as is now. 779 * on the section which has pgdat at boot time. Just keep it as is now.
781 */ 780 */
782 781
783 if (memmap) { 782 if (memmap)
784 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) 783 free_map_bootmem(memmap);
785 >> PAGE_SHIFT;
786
787 free_map_bootmem(memmap, nr_pages);
788 }
789} 784}
790 785
791void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 786void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
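
The sparse.c interfaces above lose their nr_pages arguments because hotplug always operates on whole sections, so the sizes can be derived from PAGES_PER_SECTION. The arithmetic that free_map_bootmem() now performs internally — how many backing pages the struct page array of one section needs — is sketched below with assumed example values (32768 pages per section, 64-byte struct page, 4 KiB pages); the real constants come from the architecture headers.

#include <stdio.h>

/* Example values only, not the kernel's. */
#define DEMO_PAGES_PER_SECTION  32768UL
#define DEMO_STRUCT_PAGE_SIZE   64UL
#define DEMO_PAGE_SIZE          4096UL

#define DEMO_PAGE_ALIGN(x) (((x) + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1))

int main(void)
{
        unsigned long memmap_bytes =
                DEMO_PAGES_PER_SECTION * DEMO_STRUCT_PAGE_SIZE;
        unsigned long memmap_pages =
                DEMO_PAGE_ALIGN(memmap_bytes) / DEMO_PAGE_SIZE;

        /* 32768 * 64 = 2 MiB of memmap, i.e. 512 backing pages to free. */
        printf("memmap for one section: %lu bytes (%lu pages)\n",
               memmap_bytes, memmap_pages);
        return 0;
}
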
diff --git a/mm/swap.c b/mm/swap.c
index 759c3caf44bd..7a9f80d451f5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -934,7 +934,8 @@ void __init swap_setup(void)
934#ifdef CONFIG_SWAP 934#ifdef CONFIG_SWAP
935 int i; 935 int i;
936 936
937 bdi_init(swapper_spaces[0].backing_dev_info); 937 if (bdi_init(swapper_spaces[0].backing_dev_info))
938 panic("Failed to init swap bdi");
938 for (i = 0; i < MAX_SWAPFILES; i++) { 939 for (i = 0; i < MAX_SWAPFILES; i++) {
939 spin_lock_init(&swapper_spaces[i].tree_lock); 940 spin_lock_init(&swapper_spaces[i].tree_lock);
940 INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); 941 INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 3963fc24fcc1..612a7c9795f6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -707,7 +707,7 @@ noswap:
707 return (swp_entry_t) {0}; 707 return (swp_entry_t) {0};
708} 708}
709 709
710/* The only caller of this function is now susupend routine */ 710/* The only caller of this function is now suspend routine */
711swp_entry_t get_swap_page_of_type(int type) 711swp_entry_t get_swap_page_of_type(int type)
712{ 712{
713 struct swap_info_struct *si; 713 struct swap_info_struct *si;
@@ -845,7 +845,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
845} 845}
846 846
847/* 847/*
848 * Caller has made sure that the swapdevice corresponding to entry 848 * Caller has made sure that the swap device corresponding to entry
849 * is still around or has not been recycled. 849 * is still around or has not been recycled.
850 */ 850 */
851void swap_free(swp_entry_t entry) 851void swap_free(swp_entry_t entry)
@@ -947,7 +947,7 @@ int try_to_free_swap(struct page *page)
947 * original page might be freed under memory pressure, then 947 * original page might be freed under memory pressure, then
948 * later read back in from swap, now with the wrong data. 948 * later read back in from swap, now with the wrong data.
949 * 949 *
950 * Hibration suspends storage while it is writing the image 950 * Hibernation suspends storage while it is writing the image
951 * to disk so check that here. 951 * to disk so check that here.
952 */ 952 */
953 if (pm_suspended_storage()) 953 if (pm_suspended_storage())
@@ -1179,7 +1179,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
1179 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse 1179 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
1180 * of unmatched parts which look like swp_pte, so unuse_pte must 1180 * of unmatched parts which look like swp_pte, so unuse_pte must
1181 * recheck under pte lock. Scanning without pte lock lets it be 1181 * recheck under pte lock. Scanning without pte lock lets it be
1182 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. 1182 * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
1183 */ 1183 */
1184 pte = pte_offset_map(pmd, addr); 1184 pte = pte_offset_map(pmd, addr);
1185 do { 1185 do {
@@ -1824,6 +1824,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1824 struct filename *pathname; 1824 struct filename *pathname;
1825 int i, type, prev; 1825 int i, type, prev;
1826 int err; 1826 int err;
1827 unsigned int old_block_size;
1827 1828
1828 if (!capable(CAP_SYS_ADMIN)) 1829 if (!capable(CAP_SYS_ADMIN))
1829 return -EPERM; 1830 return -EPERM;
@@ -1914,6 +1915,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1914 } 1915 }
1915 1916
1916 swap_file = p->swap_file; 1917 swap_file = p->swap_file;
1918 old_block_size = p->old_block_size;
1917 p->swap_file = NULL; 1919 p->swap_file = NULL;
1918 p->max = 0; 1920 p->max = 0;
1919 swap_map = p->swap_map; 1921 swap_map = p->swap_map;
@@ -1922,23 +1924,23 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1922 p->cluster_info = NULL; 1924 p->cluster_info = NULL;
1923 p->flags = 0; 1925 p->flags = 0;
1924 frontswap_map = frontswap_map_get(p); 1926 frontswap_map = frontswap_map_get(p);
1925 frontswap_map_set(p, NULL);
1926 spin_unlock(&p->lock); 1927 spin_unlock(&p->lock);
1927 spin_unlock(&swap_lock); 1928 spin_unlock(&swap_lock);
1928 frontswap_invalidate_area(type); 1929 frontswap_invalidate_area(type);
1930 frontswap_map_set(p, NULL);
1929 mutex_unlock(&swapon_mutex); 1931 mutex_unlock(&swapon_mutex);
1930 free_percpu(p->percpu_cluster); 1932 free_percpu(p->percpu_cluster);
1931 p->percpu_cluster = NULL; 1933 p->percpu_cluster = NULL;
1932 vfree(swap_map); 1934 vfree(swap_map);
1933 vfree(cluster_info); 1935 vfree(cluster_info);
1934 vfree(frontswap_map); 1936 vfree(frontswap_map);
1935 /* Destroy swap account informatin */ 1937 /* Destroy swap account information */
1936 swap_cgroup_swapoff(type); 1938 swap_cgroup_swapoff(type);
1937 1939
1938 inode = mapping->host; 1940 inode = mapping->host;
1939 if (S_ISBLK(inode->i_mode)) { 1941 if (S_ISBLK(inode->i_mode)) {
1940 struct block_device *bdev = I_BDEV(inode); 1942 struct block_device *bdev = I_BDEV(inode);
1941 set_blocksize(bdev, p->old_block_size); 1943 set_blocksize(bdev, old_block_size);
1942 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 1944 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1943 } else { 1945 } else {
1944 mutex_lock(&inode->i_mutex); 1946 mutex_lock(&inode->i_mutex);
@@ -2784,8 +2786,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2784 2786
2785 /* 2787 /*
2786 * We are fortunate that although vmalloc_to_page uses pte_offset_map, 2788 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
2787 * no architecture is using highmem pages for kernel pagetables: so it 2789 * no architecture is using highmem pages for kernel page tables: so it
2788 * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. 2790 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
2789 */ 2791 */
2790 head = vmalloc_to_page(si->swap_map + offset); 2792 head = vmalloc_to_page(si->swap_map + offset);
2791 offset &= ~PAGE_MASK; 2793 offset &= ~PAGE_MASK;
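
The swapoff hunks stash p->old_block_size in a local variable while the locks are still held, because the swap_info fields are cleared before the locks are dropped; set_blocksize() later uses the stashed copy. A sketch of that "copy what you still need before tearing the structure down" pattern, with invented names:

#include <pthread.h>
#include <stdio.h>

struct device_state {
        pthread_mutex_t lock;
        unsigned int block_size;
        int in_use;
};

static void demo_restore_block_size(unsigned int bs)
{
        printf("restoring block size %u\n", bs);
}

static void teardown(struct device_state *d)
{
        unsigned int old_block_size;

        pthread_mutex_lock(&d->lock);
        old_block_size = d->block_size; /* stash before the fields go away */
        d->block_size = 0;
        d->in_use = 0;
        pthread_mutex_unlock(&d->lock);

        /* Safe: uses the local copy, not the now-cleared structure. */
        demo_restore_block_size(old_block_size);
}

int main(void)
{
        struct device_state d = { PTHREAD_MUTEX_INITIALIZER, 4096, 1 };

        teardown(&d);
        return 0;
}
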
diff --git a/mm/util.c b/mm/util.c
index eaf63fc2c92f..f7bc2096071c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -7,6 +7,9 @@
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/swap.h> 8#include <linux/swap.h>
9#include <linux/swapops.h> 9#include <linux/swapops.h>
10#include <linux/mman.h>
11#include <linux/hugetlb.h>
12
10#include <asm/uaccess.h> 13#include <asm/uaccess.h>
11 14
12#include "internal.h" 15#include "internal.h"
@@ -398,6 +401,16 @@ struct address_space *page_mapping(struct page *page)
398 return mapping; 401 return mapping;
399} 402}
400 403
404/*
405 * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
406 */
407unsigned long vm_commit_limit(void)
408{
409 return ((totalram_pages - hugetlb_total_pages())
410 * sysctl_overcommit_ratio / 100) + total_swap_pages;
411}
412
413
401/* Tracepoints definitions. */ 414/* Tracepoints definitions. */
402EXPORT_TRACEPOINT_SYMBOL(kmalloc); 415EXPORT_TRACEPOINT_SYMBOL(kmalloc);
403EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 416EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
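
The new vm_commit_limit() helper centralizes the ceiling enforced under OVERCOMMIT_NEVER: non-hugetlb RAM scaled by sysctl_overcommit_ratio, plus swap. Below is a standalone restatement of the same formula on plain integers; the input values are invented for the demo.

#include <stdio.h>

/* Mirror of the formula in the hunk above, on plain unsigned longs. */
static unsigned long commit_limit(unsigned long total_pages,
                                  unsigned long hugetlb_pages,
                                  unsigned long overcommit_ratio,
                                  unsigned long swap_pages)
{
        return (total_pages - hugetlb_pages) * overcommit_ratio / 100
                + swap_pages;
}

int main(void)
{
        /* e.g. 4 GiB of 4 KiB pages, 512 MiB reserved for hugetlb,
         * ratio 50, 1 GiB of swap */
        unsigned long limit = commit_limit(1048576, 131072, 50, 262144);

        printf("commit limit: %lu pages\n", limit);     /* 720896 */
        return 0;
}
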
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 107454312d5e..0fdf96803c5b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -359,6 +359,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
359 if (unlikely(!va)) 359 if (unlikely(!va))
360 return ERR_PTR(-ENOMEM); 360 return ERR_PTR(-ENOMEM);
361 361
362 /*
363 * Only scan the relevant parts containing pointers to other objects
364 * to avoid false negatives.
365 */
366 kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
367
362retry: 368retry:
363 spin_lock(&vmap_area_lock); 369 spin_lock(&vmap_area_lock);
364 /* 370 /*
@@ -1546,7 +1552,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1546 gfp_t gfp_mask, pgprot_t prot, 1552 gfp_t gfp_mask, pgprot_t prot,
1547 int node, const void *caller); 1553 int node, const void *caller);
1548static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1554static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1549 pgprot_t prot, int node, const void *caller) 1555 pgprot_t prot, int node)
1550{ 1556{
1551 const int order = 0; 1557 const int order = 0;
1552 struct page **pages; 1558 struct page **pages;
@@ -1560,13 +1566,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1560 /* Please note that the recursion is strictly bounded. */ 1566 /* Please note that the recursion is strictly bounded. */
1561 if (array_size > PAGE_SIZE) { 1567 if (array_size > PAGE_SIZE) {
1562 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, 1568 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
1563 PAGE_KERNEL, node, caller); 1569 PAGE_KERNEL, node, area->caller);
1564 area->flags |= VM_VPAGES; 1570 area->flags |= VM_VPAGES;
1565 } else { 1571 } else {
1566 pages = kmalloc_node(array_size, nested_gfp, node); 1572 pages = kmalloc_node(array_size, nested_gfp, node);
1567 } 1573 }
1568 area->pages = pages; 1574 area->pages = pages;
1569 area->caller = caller;
1570 if (!area->pages) { 1575 if (!area->pages) {
1571 remove_vm_area(area->addr); 1576 remove_vm_area(area->addr);
1572 kfree(area); 1577 kfree(area);
@@ -1577,7 +1582,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1577 struct page *page; 1582 struct page *page;
1578 gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; 1583 gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
1579 1584
1580 if (node < 0) 1585 if (node == NUMA_NO_NODE)
1581 page = alloc_page(tmp_mask); 1586 page = alloc_page(tmp_mask);
1582 else 1587 else
1583 page = alloc_pages_node(node, tmp_mask, order); 1588 page = alloc_pages_node(node, tmp_mask, order);
@@ -1634,9 +1639,9 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1634 if (!area) 1639 if (!area)
1635 goto fail; 1640 goto fail;
1636 1641
1637 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1642 addr = __vmalloc_area_node(area, gfp_mask, prot, node);
1638 if (!addr) 1643 if (!addr)
1639 goto fail; 1644 return NULL;
1640 1645
1641 /* 1646 /*
1642 * In this function, newly allocated vm_struct has VM_UNINITIALIZED 1647 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -1646,11 +1651,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1646 clear_vm_uninitialized_flag(area); 1651 clear_vm_uninitialized_flag(area);
1647 1652
1648 /* 1653 /*
1649 * A ref_count = 3 is needed because the vm_struct and vmap_area 1654 * A ref_count = 2 is needed because vm_struct allocated in
1650 * structures allocated in the __get_vm_area_node() function contain 1655 * __get_vm_area_node() contains a reference to the virtual address of
1651 * references to the virtual address of the vmalloc'ed block. 1656 * the vmalloc'ed block.
1652 */ 1657 */
1653 kmemleak_alloc(addr, real_size, 3, gfp_mask); 1658 kmemleak_alloc(addr, real_size, 2, gfp_mask);
1654 1659
1655 return addr; 1660 return addr;
1656 1661
@@ -2563,6 +2568,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2563 if (!counters) 2568 if (!counters)
2564 return; 2569 return;
2565 2570
2571 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2572 smp_rmb();
2573 if (v->flags & VM_UNINITIALIZED)
2574 return;
2575
2566 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 2576 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
2567 2577
2568 for (nr = 0; nr < v->nr_pages; nr++) 2578 for (nr = 0; nr < v->nr_pages; nr++)
@@ -2579,23 +2589,15 @@ static int s_show(struct seq_file *m, void *p)
2579 struct vmap_area *va = p; 2589 struct vmap_area *va = p;
2580 struct vm_struct *v; 2590 struct vm_struct *v;
2581 2591
2582 if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING)) 2592 /*
2593 * s_show can encounter a race with remove_vm_area(); !VM_VM_AREA
2594 * means the vmap area is being torn down or is a vm_map_ram allocation.
2595 */
2596 if (!(va->flags & VM_VM_AREA))
2583 return 0; 2597 return 0;
2584 2598
2585 if (!(va->flags & VM_VM_AREA)) {
2586 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
2587 (void *)va->va_start, (void *)va->va_end,
2588 va->va_end - va->va_start);
2589 return 0;
2590 }
2591
2592 v = va->vm; 2599 v = va->vm;
2593 2600
2594 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2595 smp_rmb();
2596 if (v->flags & VM_UNINITIALIZED)
2597 return 0;
2598
2599 seq_printf(m, "0x%pK-0x%pK %7ld", 2601 seq_printf(m, "0x%pK-0x%pK %7ld",
2600 v->addr, v->addr + v->size, v->size); 2602 v->addr, v->addr + v->size, v->size);
2601 2603
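
Several vmalloc.c hunks revolve around publication ordering: VM_UNINITIALIZED is cleared only after the vm_struct is fully set up, and readers such as show_numa_info() pair an smp_rmb() with that smp_wmb() before trusting the fields. The sketch below is a C11-atomics analogue of that flag-publication pattern, not the kernel primitives themselves; all names are invented.

#include <stdatomic.h>
#include <stdio.h>

struct area {
        int nr_pages;                   /* payload filled in before publish */
        atomic_int uninitialized;       /* plays the role of VM_UNINITIALIZED */
};

static void publish(struct area *a, int nr_pages)
{
        a->nr_pages = nr_pages;
        /* Release: the payload store is visible before the flag clears. */
        atomic_store_explicit(&a->uninitialized, 0, memory_order_release);
}

static void show(struct area *a)
{
        /* Acquire pairs with the release above, like the smp_rmb()/smp_wmb()
         * pairing noted in the hunk; skip areas not fully set up yet. */
        if (atomic_load_explicit(&a->uninitialized, memory_order_acquire))
                return;
        printf("area with %d pages\n", a->nr_pages);
}

int main(void)
{
        struct area a = { 0, 1 };

        show(&a);               /* skipped: still uninitialized */
        publish(&a, 42);
        show(&a);               /* now printed */
        return 0;
}
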
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8ed1b775bdc9..eea668d9cff6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,7 @@
48#include <asm/div64.h> 48#include <asm/div64.h>
49 49
50#include <linux/swapops.h> 50#include <linux/swapops.h>
51#include <linux/balloon_compaction.h>
51 52
52#include "internal.h" 53#include "internal.h"
53 54
@@ -139,23 +140,11 @@ static bool global_reclaim(struct scan_control *sc)
139{ 140{
140 return !sc->target_mem_cgroup; 141 return !sc->target_mem_cgroup;
141} 142}
142
143static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
144{
145 struct mem_cgroup *root = sc->target_mem_cgroup;
146 return !mem_cgroup_disabled() &&
147 mem_cgroup_soft_reclaim_eligible(root, root) != SKIP_TREE;
148}
149#else 143#else
150static bool global_reclaim(struct scan_control *sc) 144static bool global_reclaim(struct scan_control *sc)
151{ 145{
152 return true; 146 return true;
153} 147}
154
155static bool mem_cgroup_should_soft_reclaim(struct scan_control *sc)
156{
157 return false;
158}
159#endif 148#endif
160 149
161unsigned long zone_reclaimable_pages(struct zone *zone) 150unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -222,6 +211,7 @@ void unregister_shrinker(struct shrinker *shrinker)
222 down_write(&shrinker_rwsem); 211 down_write(&shrinker_rwsem);
223 list_del(&shrinker->list); 212 list_del(&shrinker->list);
224 up_write(&shrinker_rwsem); 213 up_write(&shrinker_rwsem);
214 kfree(shrinker->nr_deferred);
225} 215}
226EXPORT_SYMBOL(unregister_shrinker); 216EXPORT_SYMBOL(unregister_shrinker);
227 217
@@ -1125,7 +1115,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1125 LIST_HEAD(clean_pages); 1115 LIST_HEAD(clean_pages);
1126 1116
1127 list_for_each_entry_safe(page, next, page_list, lru) { 1117 list_for_each_entry_safe(page, next, page_list, lru) {
1128 if (page_is_file_cache(page) && !PageDirty(page)) { 1118 if (page_is_file_cache(page) && !PageDirty(page) &&
1119 !isolated_balloon_page(page)) {
1129 ClearPageActive(page); 1120 ClearPageActive(page);
1130 list_move(&page->lru, &clean_pages); 1121 list_move(&page->lru, &clean_pages);
1131 } 1122 }
@@ -2176,11 +2167,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
2176 } 2167 }
2177} 2168}
2178 2169
2179static int 2170static void shrink_zone(struct zone *zone, struct scan_control *sc)
2180__shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
2181{ 2171{
2182 unsigned long nr_reclaimed, nr_scanned; 2172 unsigned long nr_reclaimed, nr_scanned;
2183 int groups_scanned = 0;
2184 2173
2185 do { 2174 do {
2186 struct mem_cgroup *root = sc->target_mem_cgroup; 2175 struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2188,17 +2177,15 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
2188 .zone = zone, 2177 .zone = zone,
2189 .priority = sc->priority, 2178 .priority = sc->priority,
2190 }; 2179 };
2191 struct mem_cgroup *memcg = NULL; 2180 struct mem_cgroup *memcg;
2192 mem_cgroup_iter_filter filter = (soft_reclaim) ?
2193 mem_cgroup_soft_reclaim_eligible : NULL;
2194 2181
2195 nr_reclaimed = sc->nr_reclaimed; 2182 nr_reclaimed = sc->nr_reclaimed;
2196 nr_scanned = sc->nr_scanned; 2183 nr_scanned = sc->nr_scanned;
2197 2184
2198 while ((memcg = mem_cgroup_iter_cond(root, memcg, &reclaim, filter))) { 2185 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2186 do {
2199 struct lruvec *lruvec; 2187 struct lruvec *lruvec;
2200 2188
2201 groups_scanned++;
2202 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2189 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2203 2190
2204 shrink_lruvec(lruvec, sc); 2191 shrink_lruvec(lruvec, sc);
@@ -2218,7 +2205,8 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
2218 mem_cgroup_iter_break(root, memcg); 2205 mem_cgroup_iter_break(root, memcg);
2219 break; 2206 break;
2220 } 2207 }
2221 } 2208 memcg = mem_cgroup_iter(root, memcg, &reclaim);
2209 } while (memcg);
2222 2210
2223 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, 2211 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
2224 sc->nr_scanned - nr_scanned, 2212 sc->nr_scanned - nr_scanned,
@@ -2226,37 +2214,6 @@ __shrink_zone(struct zone *zone, struct scan_control *sc, bool soft_reclaim)
2226 2214
2227 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, 2215 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
2228 sc->nr_scanned - nr_scanned, sc)); 2216 sc->nr_scanned - nr_scanned, sc));
2229
2230 return groups_scanned;
2231}
2232
2233
2234static void shrink_zone(struct zone *zone, struct scan_control *sc)
2235{
2236 bool do_soft_reclaim = mem_cgroup_should_soft_reclaim(sc);
2237 unsigned long nr_scanned = sc->nr_scanned;
2238 int scanned_groups;
2239
2240 scanned_groups = __shrink_zone(zone, sc, do_soft_reclaim);
2241 /*
2242 * memcg iterator might race with other reclaimer or start from
2243 * a incomplete tree walk so the tree walk in __shrink_zone
2244 * might have missed groups that are above the soft limit. Try
2245 * another loop to catch up with others. Do it just once to
2246 * prevent from reclaim latencies when other reclaimers always
2247 * preempt this one.
2248 */
2249 if (do_soft_reclaim && !scanned_groups)
2250 __shrink_zone(zone, sc, do_soft_reclaim);
2251
2252 /*
2253 * No group is over the soft limit or those that are do not have
2254 * pages in the zone we are reclaiming so we have to reclaim everybody
2255 */
2256 if (do_soft_reclaim && (sc->nr_scanned == nr_scanned)) {
2257 __shrink_zone(zone, sc, false);
2258 return;
2259 }
2260} 2217}
2261 2218
2262/* Returns true if compaction should go ahead for a high-order request */ 2219/* Returns true if compaction should go ahead for a high-order request */
@@ -2320,6 +2277,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2320{ 2277{
2321 struct zoneref *z; 2278 struct zoneref *z;
2322 struct zone *zone; 2279 struct zone *zone;
2280 unsigned long nr_soft_reclaimed;
2281 unsigned long nr_soft_scanned;
2323 bool aborted_reclaim = false; 2282 bool aborted_reclaim = false;
2324 2283
2325 /* 2284 /*
@@ -2359,6 +2318,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2359 continue; 2318 continue;
2360 } 2319 }
2361 } 2320 }
2321 /*
2322 * This steals pages from memory cgroups over softlimit
2323 * and returns the number of reclaimed pages and
2324 * scanned pages. This works for global memory pressure
2325 * and balancing, not for a memcg's limit.
2326 */
2327 nr_soft_scanned = 0;
2328 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2329 sc->order, sc->gfp_mask,
2330 &nr_soft_scanned);
2331 sc->nr_reclaimed += nr_soft_reclaimed;
2332 sc->nr_scanned += nr_soft_scanned;
2362 /* need some check for avoid more shrink_zone() */ 2333 /* need some check for avoid more shrink_zone() */
2363 } 2334 }
2364 2335
@@ -2952,6 +2923,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2952{ 2923{
2953 int i; 2924 int i;
2954 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2925 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2926 unsigned long nr_soft_reclaimed;
2927 unsigned long nr_soft_scanned;
2955 struct scan_control sc = { 2928 struct scan_control sc = {
2956 .gfp_mask = GFP_KERNEL, 2929 .gfp_mask = GFP_KERNEL,
2957 .priority = DEF_PRIORITY, 2930 .priority = DEF_PRIORITY,
@@ -3066,6 +3039,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3066 3039
3067 sc.nr_scanned = 0; 3040 sc.nr_scanned = 0;
3068 3041
3042 nr_soft_scanned = 0;
3043 /*
3044 * Call soft limit reclaim before calling shrink_zone.
3045 */
3046 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
3047 order, sc.gfp_mask,
3048 &nr_soft_scanned);
3049 sc.nr_reclaimed += nr_soft_reclaimed;
3050
3069 /* 3051 /*
3070 * There should be no need to raise the scanning 3052 * There should be no need to raise the scanning
3071 * priority if enough pages are already being scanned 3053 * priority if enough pages are already being scanned
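
The vmscan.c changes drop the soft-reclaim-filtered walk and return to iterating every memcg with the cursor-style mem_cgroup_iter() loop, while mem_cgroup_soft_limit_reclaim() is called separately before shrink_zone(). A userspace sketch of the cursor-style iterator loop shape (pass NULL to start, the previous element to continue), with invented names:

#include <stddef.h>
#include <stdio.h>

struct group {
        const char *name;
        struct group *next;
};

/* Cursor-style iterator: NULL starts the walk, the previous element
 * continues it, and NULL comes back when the walk is finished. */
static struct group *group_iter(struct group *root, struct group *prev)
{
        return prev ? prev->next : root;
}

int main(void)
{
        struct group c = { "child-b", NULL };
        struct group b = { "child-a", &c };
        struct group a = { "root", &b };
        struct group *g;

        g = group_iter(&a, NULL);
        do {
                printf("reclaiming from %s\n", g->name);
                g = group_iter(&a, g);
        } while (g);

        return 0;
}
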
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9bb314577911..72496140ac08 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -812,6 +812,7 @@ const char * const vmstat_text[] = {
812 812
813#ifdef CONFIG_NUMA_BALANCING 813#ifdef CONFIG_NUMA_BALANCING
814 "numa_pte_updates", 814 "numa_pte_updates",
815 "numa_huge_pte_updates",
815 "numa_hint_faults", 816 "numa_hint_faults",
816 "numa_hint_faults_local", 817 "numa_hint_faults_local",
817 "numa_pages_migrated", 818 "numa_pages_migrated",
@@ -1229,6 +1230,20 @@ static void start_cpu_timer(int cpu)
1229 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); 1230 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
1230} 1231}
1231 1232
1233static void vmstat_cpu_dead(int node)
1234{
1235 int cpu;
1236
1237 get_online_cpus();
1238 for_each_online_cpu(cpu)
1239 if (cpu_to_node(cpu) == node)
1240 goto end;
1241
1242 node_clear_state(node, N_CPU);
1243end:
1244 put_online_cpus();
1245}
1246
1232/* 1247/*
1233 * Use the cpu notifier to insure that the thresholds are recalculated 1248 * Use the cpu notifier to insure that the thresholds are recalculated
1234 * when necessary. 1249 * when necessary.
@@ -1258,6 +1273,7 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
1258 case CPU_DEAD: 1273 case CPU_DEAD:
1259 case CPU_DEAD_FROZEN: 1274 case CPU_DEAD_FROZEN:
1260 refresh_zone_stat_thresholds(); 1275 refresh_zone_stat_thresholds();
1276 vmstat_cpu_dead(cpu_to_node(cpu));
1261 break; 1277 break;
1262 default: 1278 default:
1263 break; 1279 break;
@@ -1276,8 +1292,12 @@ static int __init setup_vmstat(void)
1276 1292
1277 register_cpu_notifier(&vmstat_notifier); 1293 register_cpu_notifier(&vmstat_notifier);
1278 1294
1279 for_each_online_cpu(cpu) 1295 get_online_cpus();
1296 for_each_online_cpu(cpu) {
1280 start_cpu_timer(cpu); 1297 start_cpu_timer(cpu);
1298 node_set_state(cpu_to_node(cpu), N_CPU);
1299 }
1300 put_online_cpus();
1281#endif 1301#endif
1282#ifdef CONFIG_PROC_FS 1302#ifdef CONFIG_PROC_FS
1283 proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); 1303 proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
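
vmstat_cpu_dead() above clears a node's N_CPU state only when the CPU that died was the last online CPU on that node, and setup_vmstat() sets N_CPU for every node that currently has an online CPU. A small userspace model of that "clear the per-node flag only when its last CPU goes away" logic, with invented arrays standing in for the cpu/node masks:

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS  8
#define NR_NODES 2

static int  cpu_to_node_map[NR_CPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };
static bool cpu_online_map[NR_CPUS]  = { true, true, true, true,
                                         true, true, true, true };
static bool node_has_cpu[NR_NODES]   = { true, true };

/* Clear the node's "has CPUs" state only when its last CPU went away. */
static void cpu_dead(int cpu)
{
        int node = cpu_to_node_map[cpu];
        int i;

        cpu_online_map[cpu] = false;

        for (i = 0; i < NR_CPUS; i++)
                if (cpu_online_map[i] && cpu_to_node_map[i] == node)
                        return;         /* another CPU still serves the node */

        node_has_cpu[node] = false;
}

int main(void)
{
        int cpu;

        for (cpu = 4; cpu < 8; cpu++)
                cpu_dead(cpu);
        /* node 1 loses its flag only after its last CPU dies */
        printf("node 0: %d, node 1: %d\n", node_has_cpu[0], node_has_cpu[1]);
        return 0;
}
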
diff --git a/mm/zswap.c b/mm/zswap.c
index 841e35f1db22..5a63f78a5601 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -217,6 +217,7 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
217 if (!entry) 217 if (!entry)
218 return NULL; 218 return NULL;
219 entry->refcount = 1; 219 entry->refcount = 1;
220 RB_CLEAR_NODE(&entry->rbnode);
220 return entry; 221 return entry;
221} 222}
222 223
@@ -225,19 +226,6 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
225 kmem_cache_free(zswap_entry_cache, entry); 226 kmem_cache_free(zswap_entry_cache, entry);
226} 227}
227 228
228/* caller must hold the tree lock */
229static void zswap_entry_get(struct zswap_entry *entry)
230{
231 entry->refcount++;
232}
233
234/* caller must hold the tree lock */
235static int zswap_entry_put(struct zswap_entry *entry)
236{
237 entry->refcount--;
238 return entry->refcount;
239}
240
241/********************************* 229/*********************************
242* rbtree functions 230* rbtree functions
243**********************************/ 231**********************************/
@@ -285,6 +273,61 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
285 return 0; 273 return 0;
286} 274}
287 275
276static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
277{
278 if (!RB_EMPTY_NODE(&entry->rbnode)) {
279 rb_erase(&entry->rbnode, root);
280 RB_CLEAR_NODE(&entry->rbnode);
281 }
282}
283
284/*
285 * Carries out the common pattern of freeing an entry's zsmalloc allocation,
286 * freeing the entry itself, and decrementing the number of stored pages.
287 */
288static void zswap_free_entry(struct zswap_tree *tree,
289 struct zswap_entry *entry)
290{
291 zbud_free(tree->pool, entry->handle);
292 zswap_entry_cache_free(entry);
293 atomic_dec(&zswap_stored_pages);
294 zswap_pool_pages = zbud_get_pool_size(tree->pool);
295}
296
297/* caller must hold the tree lock */
298static void zswap_entry_get(struct zswap_entry *entry)
299{
300 entry->refcount++;
301}
302
303/* caller must hold the tree lock
304* remove from the tree and free it, if nobody references the entry
305*/
306static void zswap_entry_put(struct zswap_tree *tree,
307 struct zswap_entry *entry)
308{
309 int refcount = --entry->refcount;
310
311 BUG_ON(refcount < 0);
312 if (refcount == 0) {
313 zswap_rb_erase(&tree->rbroot, entry);
314 zswap_free_entry(tree, entry);
315 }
316}
317
318/* caller must hold the tree lock */
319static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
320 pgoff_t offset)
321{
322 struct zswap_entry *entry = NULL;
323
324 entry = zswap_rb_search(root, offset);
325 if (entry)
326 zswap_entry_get(entry);
327
328 return entry;
329}
330
288/********************************* 331/*********************************
289* per-cpu code 332* per-cpu code
290**********************************/ 333**********************************/
@@ -368,18 +411,6 @@ static bool zswap_is_full(void)
368 zswap_pool_pages); 411 zswap_pool_pages);
369} 412}
370 413
371/*
372 * Carries out the common pattern of freeing and entry's zsmalloc allocation,
373 * freeing the entry itself, and decrementing the number of stored pages.
374 */
375static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
376{
377 zbud_free(tree->pool, entry->handle);
378 zswap_entry_cache_free(entry);
379 atomic_dec(&zswap_stored_pages);
380 zswap_pool_pages = zbud_get_pool_size(tree->pool);
381}
382
383/********************************* 414/*********************************
384* writeback code 415* writeback code
385**********************************/ 416**********************************/
@@ -387,7 +418,7 @@ static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
387enum zswap_get_swap_ret { 418enum zswap_get_swap_ret {
388 ZSWAP_SWAPCACHE_NEW, 419 ZSWAP_SWAPCACHE_NEW,
389 ZSWAP_SWAPCACHE_EXIST, 420 ZSWAP_SWAPCACHE_EXIST,
390 ZSWAP_SWAPCACHE_NOMEM 421 ZSWAP_SWAPCACHE_FAIL,
391}; 422};
392 423
393/* 424/*
@@ -401,9 +432,10 @@ enum zswap_get_swap_ret {
401 * added to the swap cache, and returned in retpage. 432 * added to the swap cache, and returned in retpage.
402 * 433 *
403 * If success, the swap cache page is returned in retpage 434 * If success, the swap cache page is returned in retpage
404 * Returns 0 if page was already in the swap cache, page is not locked 435 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
405 * Returns 1 if the new page needs to be populated, page is locked 436 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
406 * Returns <0 on error 437 * the new page is added to swapcache and locked
438 * Returns ZSWAP_SWAPCACHE_FAIL on error
407 */ 439 */
408static int zswap_get_swap_cache_page(swp_entry_t entry, 440static int zswap_get_swap_cache_page(swp_entry_t entry,
409 struct page **retpage) 441 struct page **retpage)
@@ -475,7 +507,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
475 if (new_page) 507 if (new_page)
476 page_cache_release(new_page); 508 page_cache_release(new_page);
477 if (!found_page) 509 if (!found_page)
478 return ZSWAP_SWAPCACHE_NOMEM; 510 return ZSWAP_SWAPCACHE_FAIL;
479 *retpage = found_page; 511 *retpage = found_page;
480 return ZSWAP_SWAPCACHE_EXIST; 512 return ZSWAP_SWAPCACHE_EXIST;
481} 513}
@@ -502,7 +534,7 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
502 struct page *page; 534 struct page *page;
503 u8 *src, *dst; 535 u8 *src, *dst;
504 unsigned int dlen; 536 unsigned int dlen;
505 int ret, refcount; 537 int ret;
506 struct writeback_control wbc = { 538 struct writeback_control wbc = {
507 .sync_mode = WB_SYNC_NONE, 539 .sync_mode = WB_SYNC_NONE,
508 }; 540 };
@@ -517,23 +549,22 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
517 549
518 /* find and ref zswap entry */ 550 /* find and ref zswap entry */
519 spin_lock(&tree->lock); 551 spin_lock(&tree->lock);
520 entry = zswap_rb_search(&tree->rbroot, offset); 552 entry = zswap_entry_find_get(&tree->rbroot, offset);
521 if (!entry) { 553 if (!entry) {
522 /* entry was invalidated */ 554 /* entry was invalidated */
523 spin_unlock(&tree->lock); 555 spin_unlock(&tree->lock);
524 return 0; 556 return 0;
525 } 557 }
526 zswap_entry_get(entry);
527 spin_unlock(&tree->lock); 558 spin_unlock(&tree->lock);
528 BUG_ON(offset != entry->offset); 559 BUG_ON(offset != entry->offset);
529 560
530 /* try to allocate swap cache page */ 561 /* try to allocate swap cache page */
531 switch (zswap_get_swap_cache_page(swpentry, &page)) { 562 switch (zswap_get_swap_cache_page(swpentry, &page)) {
532 case ZSWAP_SWAPCACHE_NOMEM: /* no memory */ 563 case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
533 ret = -ENOMEM; 564 ret = -ENOMEM;
534 goto fail; 565 goto fail;
535 566
536 case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */ 567 case ZSWAP_SWAPCACHE_EXIST:
537 /* page is already in the swap cache, ignore for now */ 568 /* page is already in the swap cache, ignore for now */
538 page_cache_release(page); 569 page_cache_release(page);
539 ret = -EEXIST; 570 ret = -EEXIST;
@@ -556,43 +587,44 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
556 SetPageUptodate(page); 587 SetPageUptodate(page);
557 } 588 }
558 589
590 /* move it to the tail of the inactive list after end_writeback */
591 SetPageReclaim(page);
592
559 /* start writeback */ 593 /* start writeback */
560 __swap_writepage(page, &wbc, end_swap_bio_write); 594 __swap_writepage(page, &wbc, end_swap_bio_write);
561 page_cache_release(page); 595 page_cache_release(page);
562 zswap_written_back_pages++; 596 zswap_written_back_pages++;
563 597
564 spin_lock(&tree->lock); 598 spin_lock(&tree->lock);
565
566 /* drop local reference */ 599 /* drop local reference */
567 zswap_entry_put(entry); 600 zswap_entry_put(tree, entry);
568 /* drop the initial reference from entry creation */
569 refcount = zswap_entry_put(entry);
570 601
571 /* 602 /*
572 * There are three possible values for refcount here: 603 * There are two possible situations for entry here:
573 * (1) refcount is 1, load is in progress, unlink from rbtree, 604 * (1) refcount is 1(normal case), entry is valid and on the tree
574 * load will free 605 * (2) refcount is 0, entry is freed and not on the tree
575 * (2) refcount is 0, (normal case) entry is valid, 606 * because invalidate happened during writeback
576 * remove from rbtree and free entry 607 * search the tree and free the entry if find entry
577 * (3) refcount is -1, invalidate happened during writeback, 608 */
578 * free entry 609 if (entry == zswap_rb_search(&tree->rbroot, offset))
579 */ 610 zswap_entry_put(tree, entry);
580 if (refcount >= 0) {
581 /* no invalidate yet, remove from rbtree */
582 rb_erase(&entry->rbnode, &tree->rbroot);
583 }
584 spin_unlock(&tree->lock); 611 spin_unlock(&tree->lock);
585 if (refcount <= 0) {
586 /* free the entry */
587 zswap_free_entry(tree, entry);
588 return 0;
589 }
590 return -EAGAIN;
591 612
613 goto end;
614
615 /*
616 * if we get here due to ZSWAP_SWAPCACHE_EXIST
617 * a load may be happening concurrently
618 * it is safe and okay to not free the entry
619 * if we free the entry in the following put
620 * it is still okay to return !0
621 */
592fail: 622fail:
593 spin_lock(&tree->lock); 623 spin_lock(&tree->lock);
594 zswap_entry_put(entry); 624 zswap_entry_put(tree, entry);
595 spin_unlock(&tree->lock); 625 spin_unlock(&tree->lock);
626
627end:
596 return ret; 628 return ret;
597} 629}
598 630
@@ -676,11 +708,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
676 if (ret == -EEXIST) { 708 if (ret == -EEXIST) {
677 zswap_duplicate_entry++; 709 zswap_duplicate_entry++;
678 /* remove from rbtree */ 710 /* remove from rbtree */
679 rb_erase(&dupentry->rbnode, &tree->rbroot); 711 zswap_rb_erase(&tree->rbroot, dupentry);
680 if (!zswap_entry_put(dupentry)) { 712 zswap_entry_put(tree, dupentry);
681 /* free */
682 zswap_free_entry(tree, dupentry);
683 }
684 } 713 }
685 } while (ret == -EEXIST); 714 } while (ret == -EEXIST);
686 spin_unlock(&tree->lock); 715 spin_unlock(&tree->lock);
@@ -709,17 +738,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
709 struct zswap_entry *entry; 738 struct zswap_entry *entry;
710 u8 *src, *dst; 739 u8 *src, *dst;
711 unsigned int dlen; 740 unsigned int dlen;
712 int refcount, ret; 741 int ret;
713 742
714 /* find */ 743 /* find */
715 spin_lock(&tree->lock); 744 spin_lock(&tree->lock);
716 entry = zswap_rb_search(&tree->rbroot, offset); 745 entry = zswap_entry_find_get(&tree->rbroot, offset);
717 if (!entry) { 746 if (!entry) {
718 /* entry was written back */ 747 /* entry was written back */
719 spin_unlock(&tree->lock); 748 spin_unlock(&tree->lock);
720 return -1; 749 return -1;
721 } 750 }
722 zswap_entry_get(entry);
723 spin_unlock(&tree->lock); 751 spin_unlock(&tree->lock);
724 752
725 /* decompress */ 753 /* decompress */
@@ -734,22 +762,9 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
734 BUG_ON(ret); 762 BUG_ON(ret);
735 763
736 spin_lock(&tree->lock); 764 spin_lock(&tree->lock);
737 refcount = zswap_entry_put(entry); 765 zswap_entry_put(tree, entry);
738 if (likely(refcount)) {
739 spin_unlock(&tree->lock);
740 return 0;
741 }
742 spin_unlock(&tree->lock); 766 spin_unlock(&tree->lock);
743 767
744 /*
745 * We don't have to unlink from the rbtree because
746 * zswap_writeback_entry() or zswap_frontswap_invalidate page()
747 * has already done this for us if we are the last reference.
748 */
749 /* free */
750
751 zswap_free_entry(tree, entry);
752
753 return 0; 768 return 0;
754} 769}
755 770
@@ -758,7 +773,6 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
758{ 773{
759 struct zswap_tree *tree = zswap_trees[type]; 774 struct zswap_tree *tree = zswap_trees[type];
760 struct zswap_entry *entry; 775 struct zswap_entry *entry;
761 int refcount;
762 776
763 /* find */ 777 /* find */
764 spin_lock(&tree->lock); 778 spin_lock(&tree->lock);
@@ -770,20 +784,12 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
770 } 784 }
771 785
772 /* remove from rbtree */ 786 /* remove from rbtree */
773 rb_erase(&entry->rbnode, &tree->rbroot); 787 zswap_rb_erase(&tree->rbroot, entry);
774 788
775 /* drop the initial reference from entry creation */ 789 /* drop the initial reference from entry creation */
776 refcount = zswap_entry_put(entry); 790 zswap_entry_put(tree, entry);
777 791
778 spin_unlock(&tree->lock); 792 spin_unlock(&tree->lock);
779
780 if (refcount) {
781 /* writeback in progress, writeback will free */
782 return;
783 }
784
785 /* free */
786 zswap_free_entry(tree, entry);
787} 793}
788 794
789/* frees all zswap entries for the given swap type */ 795/* frees all zswap entries for the given swap type */
@@ -797,13 +803,14 @@ static void zswap_frontswap_invalidate_area(unsigned type)
797 803
798 /* walk the tree and free everything */ 804 /* walk the tree and free everything */
799 spin_lock(&tree->lock); 805 spin_lock(&tree->lock);
800 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) { 806 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
801 zbud_free(tree->pool, entry->handle); 807 zswap_free_entry(tree, entry);
802 zswap_entry_cache_free(entry);
803 atomic_dec(&zswap_stored_pages);
804 }
805 tree->rbroot = RB_ROOT; 808 tree->rbroot = RB_ROOT;
806 spin_unlock(&tree->lock); 809 spin_unlock(&tree->lock);
810
811 zbud_destroy_pool(tree->pool);
812 kfree(tree);
813 zswap_trees[type] = NULL;
807} 814}
808 815
809static struct zbud_ops zswap_zbud_ops = { 816static struct zbud_ops zswap_zbud_ops = {
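
The zswap hunks converge on a single reference-counting scheme: the tree holds the initial reference, lookups take an extra one via zswap_entry_find_get(), and zswap_entry_put() both unlinks and frees once the count reaches zero, with zswap_rb_erase() made idempotent through RB_CLEAR_NODE()/RB_EMPTY_NODE(). The userspace model below mirrors that lifecycle with a plain linked list standing in for the rbtree; every identifier is invented for the sketch.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
        long offset;
        int refcount;
        bool on_tree;           /* stands in for !RB_EMPTY_NODE() */
        struct entry *next;
};

static struct entry *tree_head;

static void entry_get(struct entry *e)
{
        e->refcount++;
}

static void tree_erase(struct entry *e)
{
        struct entry **p;

        if (!e->on_tree)        /* erase is idempotent, like zswap_rb_erase() */
                return;
        for (p = &tree_head; *p; p = &(*p)->next)
                if (*p == e) {
                        *p = e->next;
                        break;
                }
        e->on_tree = false;
}

/* The last put unlinks from the tree and frees the entry. */
static void entry_put(struct entry *e)
{
        if (--e->refcount == 0) {
                tree_erase(e);
                free(e);
        }
}

int main(void)
{
        struct entry *e = calloc(1, sizeof(*e));

        e->offset = 42;
        e->refcount = 1;        /* initial reference held by the tree */
        e->on_tree = true;
        e->next = tree_head;
        tree_head = e;

        entry_get(e);           /* a "load" takes its own reference   */
        entry_put(e);           /* ...and drops it when done          */
        entry_put(e);           /* invalidate drops the tree's ref    */
        printf("tree empty: %d\n", tree_head == NULL);  /* entry freed */
        return 0;
}
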