aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-mm-ksm52
-rw-r--r--Documentation/kernel-parameters.txt36
-rw-r--r--Documentation/vm/ksm.txt15
-rw-r--r--arch/arm64/mm/mmu.c3
-rw-r--r--arch/ia64/mm/contig.c2
-rw-r--r--arch/ia64/mm/discontig.c6
-rw-r--r--arch/ia64/mm/init.c18
-rw-r--r--arch/powerpc/mm/init_64.c5
-rw-r--r--arch/powerpc/mm/mem.c12
-rw-r--r--arch/s390/mm/init.c12
-rw-r--r--arch/s390/mm/vmem.c4
-rw-r--r--arch/sh/mm/init.c17
-rw-r--r--arch/sparc/mm/init_32.c2
-rw-r--r--arch/sparc/mm/init_64.c5
-rw-r--r--arch/tile/mm/elf.c1
-rw-r--r--arch/tile/mm/init.c8
-rw-r--r--arch/tile/mm/pgtable.c2
-rw-r--r--arch/x86/include/asm/numa.h4
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/kernel/acpi/boot.c4
-rw-r--r--arch/x86/kernel/setup.c13
-rw-r--r--arch/x86/mm/init_32.c12
-rw-r--r--arch/x86/mm/init_64.c397
-rw-r--r--arch/x86/mm/numa.c17
-rw-r--r--arch/x86/mm/pageattr.c47
-rw-r--r--arch/x86/mm/srat.c125
-rw-r--r--block/genhd.c10
-rw-r--r--drivers/acpi/acpi_memhotplug.c8
-rw-r--r--drivers/acpi/numa.c23
-rw-r--r--drivers/acpi/processor_driver.c2
-rw-r--r--drivers/base/memory.c6
-rw-r--r--drivers/base/power/runtime.c89
-rw-r--r--drivers/firmware/memmap.c196
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.c14
-rw-r--r--drivers/staging/zcache/zbud.c2
-rw-r--r--drivers/staging/zsmalloc/zsmalloc-main.c2
-rw-r--r--drivers/usb/core/hub.c13
-rw-r--r--fs/aio.c7
-rw-r--r--fs/buffer.c4
-rw-r--r--fs/nfsd/nfs4state.c6
-rw-r--r--fs/nfsd/nfsd.h6
-rw-r--r--fs/nfsd/nfssvc.c6
-rw-r--r--fs/proc/meminfo.c6
-rw-r--r--include/linux/acpi.h8
-rw-r--r--include/linux/bootmem.h1
-rw-r--r--include/linux/compaction.h5
-rw-r--r--include/linux/firmware-map.h6
-rw-r--r--include/linux/highmem.h6
-rw-r--r--include/linux/huge_mm.h2
-rw-r--r--include/linux/hugetlb.h6
-rw-r--r--include/linux/ksm.h18
-rw-r--r--include/linux/memblock.h2
-rw-r--r--include/linux/memcontrol.h7
-rw-r--r--include/linux/memory_hotplug.h20
-rw-r--r--include/linux/migrate.h14
-rw-r--r--include/linux/mm.h178
-rw-r--r--include/linux/mm_types.h9
-rw-r--r--include/linux/mman.h4
-rw-r--r--include/linux/mmzone.h58
-rw-r--r--include/linux/page-flags-layout.h88
-rw-r--r--include/linux/page-isolation.h19
-rw-r--r--include/linux/pm.h1
-rw-r--r--include/linux/pm_runtime.h3
-rw-r--r--include/linux/rmap.h2
-rw-r--r--include/linux/sched.h22
-rw-r--r--include/linux/swap.h49
-rw-r--r--include/linux/vm_event_item.h1
-rw-r--r--include/linux/vmstat.h2
-rw-r--r--ipc/shm.c12
-rw-r--r--kernel/sched/core.c28
-rw-r--r--kernel/sysctl.c1
-rw-r--r--mm/Kconfig10
-rw-r--r--mm/compaction.c35
-rw-r--r--mm/fadvise.c18
-rw-r--r--mm/fremap.c51
-rw-r--r--mm/huge_memory.c95
-rw-r--r--mm/hugetlb.c34
-rw-r--r--mm/internal.h4
-rw-r--r--mm/kmemleak.c5
-rw-r--r--mm/ksm.c657
-rw-r--r--mm/madvise.c105
-rw-r--r--mm/memblock.c50
-rw-r--r--mm/memcontrol.c473
-rw-r--r--mm/memory-failure.c202
-rw-r--r--mm/memory.c125
-rw-r--r--mm/memory_hotplug.c553
-rw-r--r--mm/mempolicy.c59
-rw-r--r--mm/migrate.c164
-rw-r--r--mm/mincore.c5
-rw-r--r--mm/mlock.c101
-rw-r--r--mm/mm_init.c31
-rw-r--r--mm/mmap.c83
-rw-r--r--mm/mmu_notifier.c84
-rw-r--r--mm/mmzone.c20
-rw-r--r--mm/mremap.c27
-rw-r--r--mm/nommu.c28
-rw-r--r--mm/oom_kill.c6
-rw-r--r--mm/page-writeback.c3
-rw-r--r--mm/page_alloc.c439
-rw-r--r--mm/rmap.c6
-rw-r--r--mm/shmem.c48
-rw-r--r--mm/slob.c2
-rw-r--r--mm/slub.c2
-rw-r--r--mm/sparse.c12
-rw-r--r--mm/swap.c9
-rw-r--r--mm/swap_state.c58
-rw-r--r--mm/swapfile.c174
-rw-r--r--mm/util.c26
-rw-r--r--mm/vmalloc.c33
-rw-r--r--mm/vmscan.c397
-rw-r--r--mm/vmstat.c7
-rw-r--r--net/9p/trans_virtio.c2
-rw-r--r--net/core/net-sysfs.c5
113 files changed, 4408 insertions, 1632 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
new file mode 100644
index 000000000000..73e653ee2481
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
@@ -0,0 +1,52 @@
1What: /sys/kernel/mm/ksm
2Date: September 2009
3KernelVersion: 2.6.32
4Contact: Linux memory management mailing list <linux-mm@kvack.org>
5Description: Interface for Kernel Samepage Merging (KSM)
6
7What: /sys/kernel/mm/ksm/full_scans
8What: /sys/kernel/mm/ksm/pages_shared
9What: /sys/kernel/mm/ksm/pages_sharing
10What: /sys/kernel/mm/ksm/pages_to_scan
11What: /sys/kernel/mm/ksm/pages_unshared
12What: /sys/kernel/mm/ksm/pages_volatile
13What: /sys/kernel/mm/ksm/run
14What: /sys/kernel/mm/ksm/sleep_millisecs
15Date: September 2009
16Contact: Linux memory management mailing list <linux-mm@kvack.org>
17Description: Kernel Samepage Merging daemon sysfs interface
18
19 full_scans: how many times all mergeable areas have been
20 scanned.
21
22 pages_shared: how many shared pages are being used.
23
24 pages_sharing: how many more sites are sharing them i.e. how
25 much saved.
26
27 pages_to_scan: how many present pages to scan before ksmd goes
28 to sleep.
29
30 pages_unshared: how many pages unique but repeatedly checked
31 for merging.
32
33 pages_volatile: how many pages changing too fast to be placed
34 in a tree.
35
36 run: write 0 to disable ksm, read 0 while ksm is disabled.
37 write 1 to run ksm, read 1 while ksm is running.
38 write 2 to disable ksm and unmerge all its pages.
39
40 sleep_millisecs: how many milliseconds ksm should sleep between
41 scans.
42
43 See Documentation/vm/ksm.txt for more information.
44
45What: /sys/kernel/mm/ksm/merge_across_nodes
46Date: January 2013
47KernelVersion: 3.9
48Contact: Linux memory management mailing list <linux-mm@kvack.org>
49Description: Control merging pages across different NUMA nodes.
50
51 When it is set to 0 only pages from the same node are merged,
52 otherwise pages from all nodes can be merged together (default).
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9aa8ff3e54dc..766087781ecd 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1640,6 +1640,42 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1640 that the amount of memory usable for all allocations 1640 that the amount of memory usable for all allocations
1641 is not too small. 1641 is not too small.
1642 1642
1643 movablemem_map=acpi
1644 [KNL,X86,IA-64,PPC] This parameter is similar to
1645 memmap except it specifies the memory map of
1646 ZONE_MOVABLE.
1647 This option informs the kernel to use the Hot Pluggable bit
1648 in flags from SRAT from ACPI BIOS to determine which
1649 memory devices could be hotplugged. The corresponding
1650 memory ranges will be set as ZONE_MOVABLE.
1651 NOTE: Whatever node the kernel resides in will always
1652 be un-hotpluggable.
1653
1654 movablemem_map=nn[KMG]@ss[KMG]
1655 [KNL,X86,IA-64,PPC] This parameter is similar to
1656 memmap except it specifies the memory map of
1657 ZONE_MOVABLE.
1658 If user specifies memory ranges, the info in SRAT will
1659 be ignored. And it works like the following:
1660 - If more ranges are all within one node, then from
1661 lowest ss to the end of the node will be ZONE_MOVABLE.
1662 - If a range is within a node, then from ss to the end
1663 of the node will be ZONE_MOVABLE.
1664 - If a range covers two or more nodes, then from ss to
1665 the end of the 1st node will be ZONE_MOVABLE, and all
1666 the rest nodes will only have ZONE_MOVABLE.
1667 If memmap is specified at the same time, the
1668 movablemem_map will be limited within the memmap
1669 areas. If kernelcore or movablecore is also specified,
1670 movablemem_map will have higher priority to be
1671 satisfied. So the administrator should be careful that
1672 the amount of movablemem_map areas is not too large.
1673 Otherwise kernel won't have enough memory to start.
1674 NOTE: We don't stop users specifying the node the
1675 kernel resides in as hotpluggable so that this
1676 option can be used as a workaround of firmware
1677 bugs.
1678
1643 MTD_Partition= [MTD] 1679 MTD_Partition= [MTD]
1644 Format: <name>,<region-number>,<size>,<offset> 1680 Format: <name>,<region-number>,<size>,<offset>
1645 1681
diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt
index b392e496f816..f34a8ee6f860 100644
--- a/Documentation/vm/ksm.txt
+++ b/Documentation/vm/ksm.txt
@@ -58,6 +58,21 @@ sleep_millisecs - how many milliseconds ksmd should sleep before next scan
58 e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs" 58 e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs"
59 Default: 20 (chosen for demonstration purposes) 59 Default: 20 (chosen for demonstration purposes)
60 60
61merge_across_nodes - specifies if pages from different numa nodes can be merged.
62 When set to 0, ksm merges only pages which physically
63 reside in the memory area of the same NUMA node. That brings
64 lower latency to access of shared pages. Systems with more
65 nodes, at significant NUMA distances, are likely to benefit
66 from the lower latency of setting 0. Smaller systems, which
67 need to minimize memory usage, are likely to benefit from
68 the greater sharing of setting 1 (default). You may wish to
69 compare how your system performs under each setting, before
70 deciding on which to use. merge_across_nodes setting can be
71 changed only when there are no ksm shared pages in system:
72 set run 2 to unmerge pages first, then to 1 after changing
73 merge_across_nodes, to remerge according to the new setting.
74 Default: 1 (merging across nodes as in earlier releases)
75
61run - set 0 to stop ksmd from running but keep merged pages, 76run - set 0 to stop ksmd from running but keep merged pages,
62 set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run", 77 set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run",
63 set 2 to stop ksmd and unmerge all pages currently merged, 78 set 2 to stop ksmd and unmerge all pages currently merged,
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index f4dd585898c5..224b44ab534e 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -434,4 +434,7 @@ int __meminit vmemmap_populate(struct page *start_page,
434 return 0; 434 return 0;
435} 435}
436#endif /* CONFIG_ARM64_64K_PAGES */ 436#endif /* CONFIG_ARM64_64K_PAGES */
437void vmemmap_free(struct page *memmap, unsigned long nr_pages)
438{
439}
437#endif /* CONFIG_SPARSEMEM_VMEMMAP */ 440#endif /* CONFIG_SPARSEMEM_VMEMMAP */
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index 1516d1dc11fd..80dab509dfb0 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -93,7 +93,7 @@ void show_mem(unsigned int filter)
93 printk(KERN_INFO "%d pages swap cached\n", total_cached); 93 printk(KERN_INFO "%d pages swap cached\n", total_cached);
94 printk(KERN_INFO "Total of %ld pages in page table cache\n", 94 printk(KERN_INFO "Total of %ld pages in page table cache\n",
95 quicklist_total_size()); 95 quicklist_total_size());
96 printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages()); 96 printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages());
97} 97}
98 98
99 99
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index c641333cd997..c2e955ee79a8 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -666,7 +666,7 @@ void show_mem(unsigned int filter)
666 printk(KERN_INFO "%d pages swap cached\n", total_cached); 666 printk(KERN_INFO "%d pages swap cached\n", total_cached);
667 printk(KERN_INFO "Total of %ld pages in page table cache\n", 667 printk(KERN_INFO "Total of %ld pages in page table cache\n",
668 quicklist_total_size()); 668 quicklist_total_size());
669 printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages()); 669 printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages());
670} 670}
671 671
672/** 672/**
@@ -822,4 +822,8 @@ int __meminit vmemmap_populate(struct page *start_page,
822{ 822{
823 return vmemmap_populate_basepages(start_page, size, node); 823 return vmemmap_populate_basepages(start_page, size, node);
824} 824}
825
826void vmemmap_free(struct page *memmap, unsigned long nr_pages)
827{
828}
825#endif 829#endif
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index b755ea92aea7..20bc967c7209 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -688,6 +688,24 @@ int arch_add_memory(int nid, u64 start, u64 size)
688 688
689 return ret; 689 return ret;
690} 690}
691
692#ifdef CONFIG_MEMORY_HOTREMOVE
693int arch_remove_memory(u64 start, u64 size)
694{
695 unsigned long start_pfn = start >> PAGE_SHIFT;
696 unsigned long nr_pages = size >> PAGE_SHIFT;
697 struct zone *zone;
698 int ret;
699
700 zone = page_zone(pfn_to_page(start_pfn));
701 ret = __remove_pages(zone, start_pfn, nr_pages);
702 if (ret)
703 pr_warn("%s: Problem encountered in __remove_pages() as"
704 " ret=%d\n", __func__, ret);
705
706 return ret;
707}
708#endif
691#endif 709#endif
692 710
693/* 711/*
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 95a45293e5ac..7e2246fb2f31 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -297,5 +297,10 @@ int __meminit vmemmap_populate(struct page *start_page,
297 297
298 return 0; 298 return 0;
299} 299}
300
301void vmemmap_free(struct page *memmap, unsigned long nr_pages)
302{
303}
304
300#endif /* CONFIG_SPARSEMEM_VMEMMAP */ 305#endif /* CONFIG_SPARSEMEM_VMEMMAP */
301 306
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 40df7c8f2096..f1f7409a4183 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -133,6 +133,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
133 133
134 return __add_pages(nid, zone, start_pfn, nr_pages); 134 return __add_pages(nid, zone, start_pfn, nr_pages);
135} 135}
136
137#ifdef CONFIG_MEMORY_HOTREMOVE
138int arch_remove_memory(u64 start, u64 size)
139{
140 unsigned long start_pfn = start >> PAGE_SHIFT;
141 unsigned long nr_pages = size >> PAGE_SHIFT;
142 struct zone *zone;
143
144 zone = page_zone(pfn_to_page(start_pfn));
145 return __remove_pages(zone, start_pfn, nr_pages);
146}
147#endif
136#endif /* CONFIG_MEMORY_HOTPLUG */ 148#endif /* CONFIG_MEMORY_HOTPLUG */
137 149
138/* 150/*
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index ae672f41c464..49ce6bb2c641 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -228,4 +228,16 @@ int arch_add_memory(int nid, u64 start, u64 size)
228 vmem_remove_mapping(start, size); 228 vmem_remove_mapping(start, size);
229 return rc; 229 return rc;
230} 230}
231
232#ifdef CONFIG_MEMORY_HOTREMOVE
233int arch_remove_memory(u64 start, u64 size)
234{
235 /*
236 * There is no hardware or firmware interface which could trigger a
237 * hot memory remove on s390. So there is nothing that needs to be
238 * implemented.
239 */
240 return -EBUSY;
241}
242#endif
231#endif /* CONFIG_MEMORY_HOTPLUG */ 243#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 79699f46a443..e21aaf4f5cb6 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -268,6 +268,10 @@ out:
268 return ret; 268 return ret;
269} 269}
270 270
271void vmemmap_free(struct page *memmap, unsigned long nr_pages)
272{
273}
274
271/* 275/*
272 * Add memory segment to the segment list if it doesn't overlap with 276 * Add memory segment to the segment list if it doesn't overlap with
273 * an already present segment. 277 * an already present segment.
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 82cc576fab15..105794037143 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -558,4 +558,21 @@ int memory_add_physaddr_to_nid(u64 addr)
558EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); 558EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
559#endif 559#endif
560 560
561#ifdef CONFIG_MEMORY_HOTREMOVE
562int arch_remove_memory(u64 start, u64 size)
563{
564 unsigned long start_pfn = start >> PAGE_SHIFT;
565 unsigned long nr_pages = size >> PAGE_SHIFT;
566 struct zone *zone;
567 int ret;
568
569 zone = page_zone(pfn_to_page(start_pfn));
570 ret = __remove_pages(zone, start_pfn, nr_pages);
571 if (unlikely(ret))
572 pr_warn("%s: Failed, __remove_pages() == %d\n", __func__,
573 ret);
574
575 return ret;
576}
577#endif
561#endif /* CONFIG_MEMORY_HOTPLUG */ 578#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index dde85ef1c56d..48e0c030e8f5 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -57,7 +57,7 @@ void show_mem(unsigned int filter)
57 printk("Mem-info:\n"); 57 printk("Mem-info:\n");
58 show_free_areas(filter); 58 show_free_areas(filter);
59 printk("Free swap: %6ldkB\n", 59 printk("Free swap: %6ldkB\n",
60 nr_swap_pages << (PAGE_SHIFT-10)); 60 get_nr_swap_pages() << (PAGE_SHIFT-10));
61 printk("%ld pages of RAM\n", totalram_pages); 61 printk("%ld pages of RAM\n", totalram_pages);
62 printk("%ld free pages\n", nr_free_pages()); 62 printk("%ld free pages\n", nr_free_pages());
63} 63}
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 5c2c6e61facb..1588d33d5492 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2235,6 +2235,11 @@ void __meminit vmemmap_populate_print_last(void)
2235 node_start = 0; 2235 node_start = 0;
2236 } 2236 }
2237} 2237}
2238
2239void vmemmap_free(struct page *memmap, unsigned long nr_pages)
2240{
2241}
2242
2238#endif /* CONFIG_SPARSEMEM_VMEMMAP */ 2243#endif /* CONFIG_SPARSEMEM_VMEMMAP */
2239 2244
2240static void prot_init_common(unsigned long page_none, 2245static void prot_init_common(unsigned long page_none,
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 3cfa98bf9125..743c951c61b0 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -130,7 +130,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm,
130 if (!retval) { 130 if (!retval) {
131 unsigned long addr = MEM_USER_INTRPT; 131 unsigned long addr = MEM_USER_INTRPT;
132 addr = mmap_region(NULL, addr, INTRPT_SIZE, 132 addr = mmap_region(NULL, addr, INTRPT_SIZE,
133 MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE,
134 VM_READ|VM_EXEC| 133 VM_READ|VM_EXEC|
135 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0); 134 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0);
136 if (addr > (unsigned long) -PAGE_SIZE) 135 if (addr > (unsigned long) -PAGE_SIZE)
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index ef29d6c5e10e..2749515a0547 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -935,6 +935,14 @@ int remove_memory(u64 start, u64 size)
935{ 935{
936 return -EINVAL; 936 return -EINVAL;
937} 937}
938
939#ifdef CONFIG_MEMORY_HOTREMOVE
940int arch_remove_memory(u64 start, u64 size)
941{
942 /* TODO */
943 return -EBUSY;
944}
945#endif
938#endif 946#endif
939 947
940struct kmem_cache *pgd_cache; 948struct kmem_cache *pgd_cache;
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index de0de0c0e8a1..b3b4972c2451 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -61,7 +61,7 @@ void show_mem(unsigned int filter)
61 global_page_state(NR_PAGETABLE), 61 global_page_state(NR_PAGETABLE),
62 global_page_state(NR_BOUNCE), 62 global_page_state(NR_BOUNCE),
63 global_page_state(NR_FILE_PAGES), 63 global_page_state(NR_FILE_PAGES),
64 nr_swap_pages); 64 get_nr_swap_pages());
65 65
66 for_each_zone(zone) { 66 for_each_zone(zone) {
67 unsigned long flags, order, total = 0, largest_order = -1; 67 unsigned long flags, order, total = 0, largest_order = -1;
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 52560a2038e1..1b99ee5c9f00 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -57,8 +57,8 @@ static inline int numa_cpu_node(int cpu)
57#endif 57#endif
58 58
59#ifdef CONFIG_NUMA 59#ifdef CONFIG_NUMA
60extern void __cpuinit numa_set_node(int cpu, int node); 60extern void numa_set_node(int cpu, int node);
61extern void __cpuinit numa_clear_node(int cpu); 61extern void numa_clear_node(int cpu);
62extern void __init init_cpu_to_node(void); 62extern void __init init_cpu_to_node(void);
63extern void __cpuinit numa_add_cpu(int cpu); 63extern void __cpuinit numa_add_cpu(int cpu);
64extern void __cpuinit numa_remove_cpu(int cpu); 64extern void __cpuinit numa_remove_cpu(int cpu);
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index e6423002c10b..567b5d0632b2 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -351,6 +351,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
351 * as a pte too. 351 * as a pte too.
352 */ 352 */
353extern pte_t *lookup_address(unsigned long address, unsigned int *level); 353extern pte_t *lookup_address(unsigned long address, unsigned int *level);
354extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
354extern phys_addr_t slow_virt_to_phys(void *__address); 355extern phys_addr_t slow_virt_to_phys(void *__address);
355 356
356#endif /* !__ASSEMBLY__ */ 357#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index cfc755dc1607..230c8ea878e5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -696,6 +696,10 @@ EXPORT_SYMBOL(acpi_map_lsapic);
696 696
697int acpi_unmap_lsapic(int cpu) 697int acpi_unmap_lsapic(int cpu)
698{ 698{
699#ifdef CONFIG_ACPI_NUMA
700 set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
701#endif
702
699 per_cpu(x86_cpu_to_apicid, cpu) = -1; 703 per_cpu(x86_cpu_to_apicid, cpu) = -1;
700 set_cpu_present(cpu, false); 704 set_cpu_present(cpu, false);
701 num_processors--; 705 num_processors--;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 915f5efefcf5..9c857f05cef0 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1056,6 +1056,15 @@ void __init setup_arch(char **cmdline_p)
1056 setup_bios_corruption_check(); 1056 setup_bios_corruption_check();
1057#endif 1057#endif
1058 1058
1059 /*
1060 * In the memory hotplug case, the kernel needs info from SRAT to
1061 * determine which memory is hotpluggable before allocating memory
1062 * using memblock.
1063 */
1064 acpi_boot_table_init();
1065 early_acpi_boot_init();
1066 early_parse_srat();
1067
1059#ifdef CONFIG_X86_32 1068#ifdef CONFIG_X86_32
1060 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", 1069 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n",
1061 (max_pfn_mapped<<PAGE_SHIFT) - 1); 1070 (max_pfn_mapped<<PAGE_SHIFT) - 1);
@@ -1101,10 +1110,6 @@ void __init setup_arch(char **cmdline_p)
1101 /* 1110 /*
1102 * Parse the ACPI tables for possible boot-time SMP configuration. 1111 * Parse the ACPI tables for possible boot-time SMP configuration.
1103 */ 1112 */
1104 acpi_boot_table_init();
1105
1106 early_acpi_boot_init();
1107
1108 initmem_init(); 1113 initmem_init();
1109 memblock_find_dma_reserve(); 1114 memblock_find_dma_reserve();
1110 1115
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index b299724f6e34..2d19001151d5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -862,6 +862,18 @@ int arch_add_memory(int nid, u64 start, u64 size)
862 862
863 return __add_pages(nid, zone, start_pfn, nr_pages); 863 return __add_pages(nid, zone, start_pfn, nr_pages);
864} 864}
865
866#ifdef CONFIG_MEMORY_HOTREMOVE
867int arch_remove_memory(u64 start, u64 size)
868{
869 unsigned long start_pfn = start >> PAGE_SHIFT;
870 unsigned long nr_pages = size >> PAGE_SHIFT;
871 struct zone *zone;
872
873 zone = page_zone(pfn_to_page(start_pfn));
874 return __remove_pages(zone, start_pfn, nr_pages);
875}
876#endif
865#endif 877#endif
866 878
867/* 879/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3eba7f429880..474e28f10815 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -707,6 +707,343 @@ int arch_add_memory(int nid, u64 start, u64 size)
707} 707}
708EXPORT_SYMBOL_GPL(arch_add_memory); 708EXPORT_SYMBOL_GPL(arch_add_memory);
709 709
710#define PAGE_INUSE 0xFD
711
712static void __meminit free_pagetable(struct page *page, int order)
713{
714 struct zone *zone;
715 bool bootmem = false;
716 unsigned long magic;
717 unsigned int nr_pages = 1 << order;
718
719 /* bootmem page has reserved flag */
720 if (PageReserved(page)) {
721 __ClearPageReserved(page);
722 bootmem = true;
723
724 magic = (unsigned long)page->lru.next;
725 if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
726 while (nr_pages--)
727 put_page_bootmem(page++);
728 } else
729 __free_pages_bootmem(page, order);
730 } else
731 free_pages((unsigned long)page_address(page), order);
732
733 /*
734 * SECTION_INFO pages and MIX_SECTION_INFO pages
735 * are all allocated by bootmem.
736 */
737 if (bootmem) {
738 zone = page_zone(page);
739 zone_span_writelock(zone);
740 zone->present_pages += nr_pages;
741 zone_span_writeunlock(zone);
742 totalram_pages += nr_pages;
743 }
744}
745
746static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
747{
748 pte_t *pte;
749 int i;
750
751 for (i = 0; i < PTRS_PER_PTE; i++) {
752 pte = pte_start + i;
753 if (pte_val(*pte))
754 return;
755 }
756
757 /* free a pte table */
758 free_pagetable(pmd_page(*pmd), 0);
759 spin_lock(&init_mm.page_table_lock);
760 pmd_clear(pmd);
761 spin_unlock(&init_mm.page_table_lock);
762}
763
764static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
765{
766 pmd_t *pmd;
767 int i;
768
769 for (i = 0; i < PTRS_PER_PMD; i++) {
770 pmd = pmd_start + i;
771 if (pmd_val(*pmd))
772 return;
773 }
774
775 /* free a pmd table */
776 free_pagetable(pud_page(*pud), 0);
777 spin_lock(&init_mm.page_table_lock);
778 pud_clear(pud);
779 spin_unlock(&init_mm.page_table_lock);
780}
781
782/* Return true if pgd is changed, otherwise return false. */
783static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
784{
785 pud_t *pud;
786 int i;
787
788 for (i = 0; i < PTRS_PER_PUD; i++) {
789 pud = pud_start + i;
790 if (pud_val(*pud))
791 return false;
792 }
793
794 /* free a pud table */
795 free_pagetable(pgd_page(*pgd), 0);
796 spin_lock(&init_mm.page_table_lock);
797 pgd_clear(pgd);
798 spin_unlock(&init_mm.page_table_lock);
799
800 return true;
801}
802
803static void __meminit
804remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
805 bool direct)
806{
807 unsigned long next, pages = 0;
808 pte_t *pte;
809 void *page_addr;
810 phys_addr_t phys_addr;
811
812 pte = pte_start + pte_index(addr);
813 for (; addr < end; addr = next, pte++) {
814 next = (addr + PAGE_SIZE) & PAGE_MASK;
815 if (next > end)
816 next = end;
817
818 if (!pte_present(*pte))
819 continue;
820
821 /*
822 * We mapped [0,1G) memory as identity mapping when
823 * initializing, in arch/x86/kernel/head_64.S. These
824 * pagetables cannot be removed.
825 */
826 phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
827 if (phys_addr < (phys_addr_t)0x40000000)
828 return;
829
830 if (IS_ALIGNED(addr, PAGE_SIZE) &&
831 IS_ALIGNED(next, PAGE_SIZE)) {
832 /*
833 * Do not free direct mapping pages since they were
834 * freed when offlining, or simply not in use.
835 */
836 if (!direct)
837 free_pagetable(pte_page(*pte), 0);
838
839 spin_lock(&init_mm.page_table_lock);
840 pte_clear(&init_mm, addr, pte);
841 spin_unlock(&init_mm.page_table_lock);
842
843 /* For non-direct mapping, pages means nothing. */
844 pages++;
845 } else {
846 /*
847 * If we are here, we are freeing vmemmap pages since
848 * direct mapped memory ranges to be freed are aligned.
849 *
850 * If we are not removing the whole page, it means
851 * other page structs in this page are being used and
852 * we cannot remove them. So fill the unused page_structs
853 * with 0xFD, and remove the page when it is wholly
854 * filled with 0xFD.
855 */
856 memset((void *)addr, PAGE_INUSE, next - addr);
857
858 page_addr = page_address(pte_page(*pte));
859 if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
860 free_pagetable(pte_page(*pte), 0);
861
862 spin_lock(&init_mm.page_table_lock);
863 pte_clear(&init_mm, addr, pte);
864 spin_unlock(&init_mm.page_table_lock);
865 }
866 }
867 }
868
869 /* Call free_pte_table() in remove_pmd_table(). */
870 flush_tlb_all();
871 if (direct)
872 update_page_count(PG_LEVEL_4K, -pages);
873}
874
875static void __meminit
876remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
877 bool direct)
878{
879 unsigned long next, pages = 0;
880 pte_t *pte_base;
881 pmd_t *pmd;
882 void *page_addr;
883
884 pmd = pmd_start + pmd_index(addr);
885 for (; addr < end; addr = next, pmd++) {
886 next = pmd_addr_end(addr, end);
887
888 if (!pmd_present(*pmd))
889 continue;
890
891 if (pmd_large(*pmd)) {
892 if (IS_ALIGNED(addr, PMD_SIZE) &&
893 IS_ALIGNED(next, PMD_SIZE)) {
894 if (!direct)
895 free_pagetable(pmd_page(*pmd),
896 get_order(PMD_SIZE));
897
898 spin_lock(&init_mm.page_table_lock);
899 pmd_clear(pmd);
900 spin_unlock(&init_mm.page_table_lock);
901 pages++;
902 } else {
903 /* If here, we are freeing vmemmap pages. */
904 memset((void *)addr, PAGE_INUSE, next - addr);
905
906 page_addr = page_address(pmd_page(*pmd));
907 if (!memchr_inv(page_addr, PAGE_INUSE,
908 PMD_SIZE)) {
909 free_pagetable(pmd_page(*pmd),
910 get_order(PMD_SIZE));
911
912 spin_lock(&init_mm.page_table_lock);
913 pmd_clear(pmd);
914 spin_unlock(&init_mm.page_table_lock);
915 }
916 }
917
918 continue;
919 }
920
921 pte_base = (pte_t *)pmd_page_vaddr(*pmd);
922 remove_pte_table(pte_base, addr, next, direct);
923 free_pte_table(pte_base, pmd);
924 }
925
926 /* Call free_pmd_table() in remove_pud_table(). */
927 if (direct)
928 update_page_count(PG_LEVEL_2M, -pages);
929}
930
931static void __meminit
932remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
933 bool direct)
934{
935 unsigned long next, pages = 0;
936 pmd_t *pmd_base;
937 pud_t *pud;
938 void *page_addr;
939
940 pud = pud_start + pud_index(addr);
941 for (; addr < end; addr = next, pud++) {
942 next = pud_addr_end(addr, end);
943
944 if (!pud_present(*pud))
945 continue;
946
947 if (pud_large(*pud)) {
948 if (IS_ALIGNED(addr, PUD_SIZE) &&
949 IS_ALIGNED(next, PUD_SIZE)) {
950 if (!direct)
951 free_pagetable(pud_page(*pud),
952 get_order(PUD_SIZE));
953
954 spin_lock(&init_mm.page_table_lock);
955 pud_clear(pud);
956 spin_unlock(&init_mm.page_table_lock);
957 pages++;
958 } else {
959 /* If here, we are freeing vmemmap pages. */
960 memset((void *)addr, PAGE_INUSE, next - addr);
961
962 page_addr = page_address(pud_page(*pud));
963 if (!memchr_inv(page_addr, PAGE_INUSE,
964 PUD_SIZE)) {
965 free_pagetable(pud_page(*pud),
966 get_order(PUD_SIZE));
967
968 spin_lock(&init_mm.page_table_lock);
969 pud_clear(pud);
970 spin_unlock(&init_mm.page_table_lock);
971 }
972 }
973
974 continue;
975 }
976
977 pmd_base = (pmd_t *)pud_page_vaddr(*pud);
978 remove_pmd_table(pmd_base, addr, next, direct);
979 free_pmd_table(pmd_base, pud);
980 }
981
982 if (direct)
983 update_page_count(PG_LEVEL_1G, -pages);
984}
985
986/* start and end are both virtual address. */
987static void __meminit
988remove_pagetable(unsigned long start, unsigned long end, bool direct)
989{
990 unsigned long next;
991 pgd_t *pgd;
992 pud_t *pud;
993 bool pgd_changed = false;
994
995 for (; start < end; start = next) {
996 next = pgd_addr_end(start, end);
997
998 pgd = pgd_offset_k(start);
999 if (!pgd_present(*pgd))
1000 continue;
1001
1002 pud = (pud_t *)pgd_page_vaddr(*pgd);
1003 remove_pud_table(pud, start, next, direct);
1004 if (free_pud_table(pud, pgd))
1005 pgd_changed = true;
1006 }
1007
1008 if (pgd_changed)
1009 sync_global_pgds(start, end - 1);
1010
1011 flush_tlb_all();
1012}
1013
1014void __ref vmemmap_free(struct page *memmap, unsigned long nr_pages)
1015{
1016 unsigned long start = (unsigned long)memmap;
1017 unsigned long end = (unsigned long)(memmap + nr_pages);
1018
1019 remove_pagetable(start, end, false);
1020}
1021
1022static void __meminit
1023kernel_physical_mapping_remove(unsigned long start, unsigned long end)
1024{
1025 start = (unsigned long)__va(start);
1026 end = (unsigned long)__va(end);
1027
1028 remove_pagetable(start, end, true);
1029}
1030
1031#ifdef CONFIG_MEMORY_HOTREMOVE
1032int __ref arch_remove_memory(u64 start, u64 size)
1033{
1034 unsigned long start_pfn = start >> PAGE_SHIFT;
1035 unsigned long nr_pages = size >> PAGE_SHIFT;
1036 struct zone *zone;
1037 int ret;
1038
1039 zone = page_zone(pfn_to_page(start_pfn));
1040 kernel_physical_mapping_remove(start, start + size);
1041 ret = __remove_pages(zone, start_pfn, nr_pages);
1042 WARN_ON_ONCE(ret);
1043
1044 return ret;
1045}
1046#endif
710#endif /* CONFIG_MEMORY_HOTPLUG */ 1047#endif /* CONFIG_MEMORY_HOTPLUG */
711 1048
712static struct kcore_list kcore_vsyscall; 1049static struct kcore_list kcore_vsyscall;
@@ -1019,6 +1356,66 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
1019 return 0; 1356 return 0;
1020} 1357}
1021 1358
1359#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
1360void register_page_bootmem_memmap(unsigned long section_nr,
1361 struct page *start_page, unsigned long size)
1362{
1363 unsigned long addr = (unsigned long)start_page;
1364 unsigned long end = (unsigned long)(start_page + size);
1365 unsigned long next;
1366 pgd_t *pgd;
1367 pud_t *pud;
1368 pmd_t *pmd;
1369 unsigned int nr_pages;
1370 struct page *page;
1371
1372 for (; addr < end; addr = next) {
1373 pte_t *pte = NULL;
1374
1375 pgd = pgd_offset_k(addr);
1376 if (pgd_none(*pgd)) {
1377 next = (addr + PAGE_SIZE) & PAGE_MASK;
1378 continue;
1379 }
1380 get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
1381
1382 pud = pud_offset(pgd, addr);
1383 if (pud_none(*pud)) {
1384 next = (addr + PAGE_SIZE) & PAGE_MASK;
1385 continue;
1386 }
1387 get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
1388
1389 if (!cpu_has_pse) {
1390 next = (addr + PAGE_SIZE) & PAGE_MASK;
1391 pmd = pmd_offset(pud, addr);
1392 if (pmd_none(*pmd))
1393 continue;
1394 get_page_bootmem(section_nr, pmd_page(*pmd),
1395 MIX_SECTION_INFO);
1396
1397 pte = pte_offset_kernel(pmd, addr);
1398 if (pte_none(*pte))
1399 continue;
1400 get_page_bootmem(section_nr, pte_page(*pte),
1401 SECTION_INFO);
1402 } else {
1403 next = pmd_addr_end(addr, end);
1404
1405 pmd = pmd_offset(pud, addr);
1406 if (pmd_none(*pmd))
1407 continue;
1408
1409 nr_pages = 1 << (get_order(PMD_SIZE));
1410 page = pmd_page(*pmd);
1411 while (nr_pages--)
1412 get_page_bootmem(section_nr, page++,
1413 SECTION_INFO);
1414 }
1415 }
1416}
1417#endif
1418
1022void __meminit vmemmap_populate_print_last(void) 1419void __meminit vmemmap_populate_print_last(void)
1023{ 1420{
1024 if (p_start) { 1421 if (p_start) {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 8504f3698753..dfd30259eb89 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -56,7 +56,7 @@ early_param("numa", numa_setup);
56/* 56/*
57 * apicid, cpu, node mappings 57 * apicid, cpu, node mappings
58 */ 58 */
59s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 59s16 __apicid_to_node[MAX_LOCAL_APIC] = {
60 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 60 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
61}; 61};
62 62
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(node_to_cpumask_map);
78DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); 78DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
79EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); 79EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
80 80
81void __cpuinit numa_set_node(int cpu, int node) 81void numa_set_node(int cpu, int node)
82{ 82{
83 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); 83 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
84 84
@@ -101,7 +101,7 @@ void __cpuinit numa_set_node(int cpu, int node)
101 set_cpu_numa_node(cpu, node); 101 set_cpu_numa_node(cpu, node);
102} 102}
103 103
104void __cpuinit numa_clear_node(int cpu) 104void numa_clear_node(int cpu)
105{ 105{
106 numa_set_node(cpu, NUMA_NO_NODE); 106 numa_set_node(cpu, NUMA_NO_NODE);
107} 107}
@@ -213,10 +213,9 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
213 * Allocate node data. Try node-local memory and then any node. 213 * Allocate node data. Try node-local memory and then any node.
214 * Never allocate in DMA zone. 214 * Never allocate in DMA zone.
215 */ 215 */
216 nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); 216 nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
217 if (!nd_pa) { 217 if (!nd_pa) {
218 pr_err("Cannot find %zu bytes in node %d\n", 218 pr_err("Cannot find %zu bytes in any node\n", nd_size);
219 nd_size, nid);
220 return; 219 return;
221 } 220 }
222 nd = __va(nd_pa); 221 nd = __va(nd_pa);
@@ -561,10 +560,12 @@ static int __init numa_init(int (*init_func)(void))
561 for (i = 0; i < MAX_LOCAL_APIC; i++) 560 for (i = 0; i < MAX_LOCAL_APIC; i++)
562 set_apicid_to_node(i, NUMA_NO_NODE); 561 set_apicid_to_node(i, NUMA_NO_NODE);
563 562
564 nodes_clear(numa_nodes_parsed); 563 /*
564 * Do not clear numa_nodes_parsed or zero numa_meminfo here, because
565 * SRAT was parsed earlier in early_parse_srat().
566 */
565 nodes_clear(node_possible_map); 567 nodes_clear(node_possible_map);
566 nodes_clear(node_online_map); 568 nodes_clear(node_online_map);
567 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
568 WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); 569 WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
569 numa_reset_distance(); 570 numa_reset_distance();
570 571
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a1b1c88f9caf..ca1f1c2bb7be 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -529,21 +529,13 @@ out_unlock:
529 return do_split; 529 return do_split;
530} 530}
531 531
532static int split_large_page(pte_t *kpte, unsigned long address) 532int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
533{ 533{
534 unsigned long pfn, pfninc = 1; 534 unsigned long pfn, pfninc = 1;
535 unsigned int i, level; 535 unsigned int i, level;
536 pte_t *pbase, *tmp; 536 pte_t *tmp;
537 pgprot_t ref_prot; 537 pgprot_t ref_prot;
538 struct page *base; 538 struct page *base = virt_to_page(pbase);
539
540 if (!debug_pagealloc)
541 spin_unlock(&cpa_lock);
542 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
543 if (!debug_pagealloc)
544 spin_lock(&cpa_lock);
545 if (!base)
546 return -ENOMEM;
547 539
548 spin_lock(&pgd_lock); 540 spin_lock(&pgd_lock);
549 /* 541 /*
@@ -551,10 +543,11 @@ static int split_large_page(pte_t *kpte, unsigned long address)
551 * up for us already: 543 * up for us already:
552 */ 544 */
553 tmp = lookup_address(address, &level); 545 tmp = lookup_address(address, &level);
554 if (tmp != kpte) 546 if (tmp != kpte) {
555 goto out_unlock; 547 spin_unlock(&pgd_lock);
548 return 1;
549 }
556 550
557 pbase = (pte_t *)page_address(base);
558 paravirt_alloc_pte(&init_mm, page_to_pfn(base)); 551 paravirt_alloc_pte(&init_mm, page_to_pfn(base));
559 ref_prot = pte_pgprot(pte_clrhuge(*kpte)); 552 ref_prot = pte_pgprot(pte_clrhuge(*kpte));
560 /* 553 /*
@@ -601,17 +594,27 @@ static int split_large_page(pte_t *kpte, unsigned long address)
601 * going on. 594 * going on.
602 */ 595 */
603 __flush_tlb_all(); 596 __flush_tlb_all();
597 spin_unlock(&pgd_lock);
604 598
605 base = NULL; 599 return 0;
600}
606 601
607out_unlock: 602static int split_large_page(pte_t *kpte, unsigned long address)
608 /* 603{
609 * If we dropped out via the lookup_address check under 604 pte_t *pbase;
610 * pgd_lock then stick the page back into the pool: 605 struct page *base;
611 */ 606
612 if (base) 607 if (!debug_pagealloc)
608 spin_unlock(&cpa_lock);
609 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
610 if (!debug_pagealloc)
611 spin_lock(&cpa_lock);
612 if (!base)
613 return -ENOMEM;
614
615 pbase = (pte_t *)page_address(base);
616 if (__split_large_page(kpte, address, pbase))
613 __free_page(base); 617 __free_page(base);
614 spin_unlock(&pgd_lock);
615 618
616 return 0; 619 return 0;
617} 620}
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index cdd0da9dd530..79836d01f789 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -141,11 +141,126 @@ static inline int save_add_info(void) {return 1;}
141static inline int save_add_info(void) {return 0;} 141static inline int save_add_info(void) {return 0;}
142#endif 142#endif
143 143
144#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
145static void __init
146handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
147{
148 int overlap, i;
149 unsigned long start_pfn, end_pfn;
150
151 start_pfn = PFN_DOWN(start);
152 end_pfn = PFN_UP(end);
153
154 /*
155 * For movablemem_map=acpi:
156 *
157 * SRAT: |_____| |_____| |_________| |_________| ......
158 * node id: 0 1 1 2
159 * hotpluggable: n y y n
160 * movablemem_map: |_____| |_________|
161 *
162 * Using movablemem_map, we can prevent memblock from allocating memory
163 * on ZONE_MOVABLE at boot time.
164 *
165 * Before parsing SRAT, memblock has already reserve some memory ranges
166 * for other purposes, such as for kernel image. We cannot prevent
167 * kernel from using these memory, so we need to exclude these memory
168 * even if it is hotpluggable.
169 * Furthermore, to ensure the kernel has enough memory to boot, we make
170 * all the memory on the node which the kernel resides in
171 * un-hotpluggable.
172 */
173 if (hotpluggable && movablemem_map.acpi) {
174 /* Exclude ranges reserved by memblock. */
175 struct memblock_type *rgn = &memblock.reserved;
176
177 for (i = 0; i < rgn->cnt; i++) {
178 if (end <= rgn->regions[i].base ||
179 start >= rgn->regions[i].base +
180 rgn->regions[i].size)
181 continue;
182
183 /*
184 * If the memory range overlaps the memory reserved by
185 * memblock, then the kernel resides in this node.
186 */
187 node_set(node, movablemem_map.numa_nodes_kernel);
188
189 goto out;
190 }
191
192 /*
193 * If the kernel resides in this node, then the whole node
194 * should not be hotpluggable.
195 */
196 if (node_isset(node, movablemem_map.numa_nodes_kernel))
197 goto out;
198
199 insert_movablemem_map(start_pfn, end_pfn);
200
201 /*
202 * numa_nodes_hotplug nodemask represents which nodes are put
203 * into movablemem_map.map[].
204 */
205 node_set(node, movablemem_map.numa_nodes_hotplug);
206 goto out;
207 }
208
209 /*
210 * For movablemem_map=nn[KMG]@ss[KMG]:
211 *
212 * SRAT: |_____| |_____| |_________| |_________| ......
213 * node id: 0 1 1 2
214 * user specified: |__| |___|
215 * movablemem_map: |___| |_________| |______| ......
216 *
217 * Using movablemem_map, we can prevent memblock from allocating memory
218 * on ZONE_MOVABLE at boot time.
219 *
220 * NOTE: In this case, SRAT info will be ingored.
221 */
222 overlap = movablemem_map_overlap(start_pfn, end_pfn);
223 if (overlap >= 0) {
224 /*
225 * If part of this range is in movablemem_map, we need to
226 * add the range after it to extend the range to the end
227 * of the node, because from the min address specified to
228 * the end of the node will be ZONE_MOVABLE.
229 */
230 start_pfn = max(start_pfn,
231 movablemem_map.map[overlap].start_pfn);
232 insert_movablemem_map(start_pfn, end_pfn);
233
234 /*
235 * Set the nodemask, so that if the address range on one node
236 * is not continuse, we can add the subsequent ranges on the
237 * same node into movablemem_map.
238 */
239 node_set(node, movablemem_map.numa_nodes_hotplug);
240 } else {
241 if (node_isset(node, movablemem_map.numa_nodes_hotplug))
242 /*
243 * Insert the range if we already have movable ranges
244 * on the same node.
245 */
246 insert_movablemem_map(start_pfn, end_pfn);
247 }
248out:
249 return;
250}
251#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
252static inline void
253handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable)
254{
255}
256#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
257
144/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 258/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
145int __init 259int __init
146acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) 260acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
147{ 261{
148 u64 start, end; 262 u64 start, end;
263 u32 hotpluggable;
149 int node, pxm; 264 int node, pxm;
150 265
151 if (srat_disabled()) 266 if (srat_disabled())
@@ -154,7 +269,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
154 goto out_err_bad_srat; 269 goto out_err_bad_srat;
155 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) 270 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
156 goto out_err; 271 goto out_err;
157 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) 272 hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE;
273 if (hotpluggable && !save_add_info())
158 goto out_err; 274 goto out_err;
159 275
160 start = ma->base_address; 276 start = ma->base_address;
@@ -174,9 +290,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
174 290
175 node_set(node, numa_nodes_parsed); 291 node_set(node, numa_nodes_parsed);
176 292
177 printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", 293 printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n",
178 node, pxm, 294 node, pxm,
179 (unsigned long long) start, (unsigned long long) end - 1); 295 (unsigned long long) start, (unsigned long long) end - 1,
296 hotpluggable ? "Hot Pluggable": "");
297
298 handle_movablemem(node, start, end, hotpluggable);
180 299
181 return 0; 300 return 0;
182out_err_bad_srat: 301out_err_bad_srat:
diff --git a/block/genhd.c b/block/genhd.c
index 3993ebf4135f..5f73c2435fde 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -18,6 +18,7 @@
18#include <linux/mutex.h> 18#include <linux/mutex.h>
19#include <linux/idr.h> 19#include <linux/idr.h>
20#include <linux/log2.h> 20#include <linux/log2.h>
21#include <linux/pm_runtime.h>
21 22
22#include "blk.h" 23#include "blk.h"
23 24
@@ -534,6 +535,14 @@ static void register_disk(struct gendisk *disk)
534 return; 535 return;
535 } 536 }
536 } 537 }
538
539 /*
540 * avoid probable deadlock caused by allocating memory with
541 * GFP_KERNEL in runtime_resume callback of its all ancestor
542 * devices
543 */
544 pm_runtime_set_memalloc_noio(ddev, true);
545
537 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); 546 disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
538 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); 547 disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
539 548
@@ -663,6 +672,7 @@ void del_gendisk(struct gendisk *disk)
663 disk->driverfs_dev = NULL; 672 disk->driverfs_dev = NULL;
664 if (!sysfs_deprecated) 673 if (!sysfs_deprecated)
665 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); 674 sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
675 pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
666 device_del(disk_to_dev(disk)); 676 device_del(disk_to_dev(disk));
667} 677}
668EXPORT_SYMBOL(del_gendisk); 678EXPORT_SYMBOL(del_gendisk);
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 034d3e72aa92..da1f82b445e0 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -280,9 +280,11 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
280 280
281static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device) 281static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
282{ 282{
283 int result = 0; 283 int result = 0, nid;
284 struct acpi_memory_info *info, *n; 284 struct acpi_memory_info *info, *n;
285 285
286 nid = acpi_get_node(mem_device->device->handle);
287
286 list_for_each_entry_safe(info, n, &mem_device->res_list, list) { 288 list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
287 if (info->failed) 289 if (info->failed)
288 /* The kernel does not use this memory block */ 290 /* The kernel does not use this memory block */
@@ -295,7 +297,9 @@ static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
295 */ 297 */
296 return -EBUSY; 298 return -EBUSY;
297 299
298 result = remove_memory(info->start_addr, info->length); 300 if (nid < 0)
301 nid = memory_add_physaddr_to_nid(info->start_addr);
302 result = remove_memory(nid, info->start_addr, info->length);
299 if (result) 303 if (result)
300 return result; 304 return result;
301 305
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 33e609f63585..59844ee149be 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -282,10 +282,10 @@ acpi_table_parse_srat(enum acpi_srat_type id,
282 handler, max_entries); 282 handler, max_entries);
283} 283}
284 284
285int __init acpi_numa_init(void) 285static int srat_mem_cnt;
286{
287 int cnt = 0;
288 286
287void __init early_parse_srat(void)
288{
289 /* 289 /*
290 * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= 290 * Should not limit number with cpu num that is from NR_CPUS or nr_cpus=
291 * SRAT cpu entries could have different order with that in MADT. 291 * SRAT cpu entries could have different order with that in MADT.
@@ -295,21 +295,24 @@ int __init acpi_numa_init(void)
295 /* SRAT: Static Resource Affinity Table */ 295 /* SRAT: Static Resource Affinity Table */
296 if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { 296 if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
297 acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, 297 acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
298 acpi_parse_x2apic_affinity, 0); 298 acpi_parse_x2apic_affinity, 0);
299 acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, 299 acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
300 acpi_parse_processor_affinity, 0); 300 acpi_parse_processor_affinity, 0);
301 cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, 301 srat_mem_cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
302 acpi_parse_memory_affinity, 302 acpi_parse_memory_affinity,
303 NR_NODE_MEMBLKS); 303 NR_NODE_MEMBLKS);
304 } 304 }
305}
305 306
307int __init acpi_numa_init(void)
308{
306 /* SLIT: System Locality Information Table */ 309 /* SLIT: System Locality Information Table */
307 acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); 310 acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
308 311
309 acpi_numa_arch_fixup(); 312 acpi_numa_arch_fixup();
310 313
311 if (cnt < 0) 314 if (srat_mem_cnt < 0)
312 return cnt; 315 return srat_mem_cnt;
313 else if (!parsed_numa_memblks) 316 else if (!parsed_numa_memblks)
314 return -ENOENT; 317 return -ENOENT;
315 return 0; 318 return 0;
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index cbf1f122666b..df34bd04ae62 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -45,6 +45,7 @@
45#include <linux/cpuidle.h> 45#include <linux/cpuidle.h>
46#include <linux/slab.h> 46#include <linux/slab.h>
47#include <linux/acpi.h> 47#include <linux/acpi.h>
48#include <linux/memory_hotplug.h>
48 49
49#include <asm/io.h> 50#include <asm/io.h>
50#include <asm/cpu.h> 51#include <asm/cpu.h>
@@ -641,6 +642,7 @@ static int acpi_processor_remove(struct acpi_device *device)
641 642
642 per_cpu(processors, pr->id) = NULL; 643 per_cpu(processors, pr->id) = NULL;
643 per_cpu(processor_device_array, pr->id) = NULL; 644 per_cpu(processor_device_array, pr->id) = NULL;
645 try_offline_node(cpu_to_node(pr->id));
644 646
645free: 647free:
646 free_cpumask_var(pr->throttling.shared_cpu_map); 648 free_cpumask_var(pr->throttling.shared_cpu_map);
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 83d0b17ba1c2..a51007b79032 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -693,6 +693,12 @@ int offline_memory_block(struct memory_block *mem)
693 return ret; 693 return ret;
694} 694}
695 695
696/* return true if the memory block is offlined, otherwise, return false */
697bool is_memblock_offlined(struct memory_block *mem)
698{
699 return mem->state == MEM_OFFLINE;
700}
701
696/* 702/*
697 * Initialize the sysfs support for memory devices... 703 * Initialize the sysfs support for memory devices...
698 */ 704 */
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 3148b10dc2e5..1244930e3d7a 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -124,6 +124,76 @@ unsigned long pm_runtime_autosuspend_expiration(struct device *dev)
124} 124}
125EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration); 125EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration);
126 126
127static int dev_memalloc_noio(struct device *dev, void *data)
128{
129 return dev->power.memalloc_noio;
130}
131
132/*
133 * pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag.
134 * @dev: Device to handle.
135 * @enable: True for setting the flag and False for clearing the flag.
136 *
137 * Set the flag for all devices in the path from the device to the
138 * root device in the device tree if @enable is true, otherwise clear
139 * the flag for devices in the path whose siblings don't set the flag.
140 *
141 * The function should only be called by block device, or network
142 * device driver for solving the deadlock problem during runtime
143 * resume/suspend:
144 *
145 * If memory allocation with GFP_KERNEL is called inside runtime
146 * resume/suspend callback of any one of its ancestors(or the
147 * block device itself), the deadlock may be triggered inside the
148 * memory allocation since it might not complete until the block
149 * device becomes active and the involed page I/O finishes. The
150 * situation is pointed out first by Alan Stern. Network device
151 * are involved in iSCSI kind of situation.
152 *
153 * The lock of dev_hotplug_mutex is held in the function for handling
154 * hotplug race because pm_runtime_set_memalloc_noio() may be called
155 * in async probe().
156 *
157 * The function should be called between device_add() and device_del()
158 * on the affected device(block/network device).
159 */
160void pm_runtime_set_memalloc_noio(struct device *dev, bool enable)
161{
162 static DEFINE_MUTEX(dev_hotplug_mutex);
163
164 mutex_lock(&dev_hotplug_mutex);
165 for (;;) {
166 bool enabled;
167
168 /* hold power lock since bitfield is not SMP-safe. */
169 spin_lock_irq(&dev->power.lock);
170 enabled = dev->power.memalloc_noio;
171 dev->power.memalloc_noio = enable;
172 spin_unlock_irq(&dev->power.lock);
173
174 /*
175 * not need to enable ancestors any more if the device
176 * has been enabled.
177 */
178 if (enabled && enable)
179 break;
180
181 dev = dev->parent;
182
183 /*
184 * clear flag of the parent device only if all the
185 * children don't set the flag because ancestor's
186 * flag was set by any one of the descendants.
187 */
188 if (!dev || (!enable &&
189 device_for_each_child(dev, NULL,
190 dev_memalloc_noio)))
191 break;
192 }
193 mutex_unlock(&dev_hotplug_mutex);
194}
195EXPORT_SYMBOL_GPL(pm_runtime_set_memalloc_noio);
196
127/** 197/**
128 * rpm_check_suspend_allowed - Test whether a device may be suspended. 198 * rpm_check_suspend_allowed - Test whether a device may be suspended.
129 * @dev: Device to test. 199 * @dev: Device to test.
@@ -278,7 +348,24 @@ static int rpm_callback(int (*cb)(struct device *), struct device *dev)
278 if (!cb) 348 if (!cb)
279 return -ENOSYS; 349 return -ENOSYS;
280 350
281 retval = __rpm_callback(cb, dev); 351 if (dev->power.memalloc_noio) {
352 unsigned int noio_flag;
353
354 /*
355 * Deadlock might be caused if memory allocation with
356 * GFP_KERNEL happens inside runtime_suspend and
357 * runtime_resume callbacks of one block device's
358 * ancestor or the block device itself. Network
359 * device might be thought as part of iSCSI block
360 * device, so network device and its ancestor should
361 * be marked as memalloc_noio too.
362 */
363 noio_flag = memalloc_noio_save();
364 retval = __rpm_callback(cb, dev);
365 memalloc_noio_restore(noio_flag);
366 } else {
367 retval = __rpm_callback(cb, dev);
368 }
282 369
283 dev->power.runtime_error = retval; 370 dev->power.runtime_error = retval;
284 return retval != -EACCES ? retval : -EIO; 371 return retval != -EACCES ? retval : -EIO;
diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c
index 90723e65b081..0b5b5f619c75 100644
--- a/drivers/firmware/memmap.c
+++ b/drivers/firmware/memmap.c
@@ -21,6 +21,7 @@
21#include <linux/types.h> 21#include <linux/types.h>
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/mm.h>
24 25
25/* 26/*
26 * Data types ------------------------------------------------------------------ 27 * Data types ------------------------------------------------------------------
@@ -52,6 +53,9 @@ static ssize_t start_show(struct firmware_map_entry *entry, char *buf);
52static ssize_t end_show(struct firmware_map_entry *entry, char *buf); 53static ssize_t end_show(struct firmware_map_entry *entry, char *buf);
53static ssize_t type_show(struct firmware_map_entry *entry, char *buf); 54static ssize_t type_show(struct firmware_map_entry *entry, char *buf);
54 55
56static struct firmware_map_entry * __meminit
57firmware_map_find_entry(u64 start, u64 end, const char *type);
58
55/* 59/*
56 * Static data ----------------------------------------------------------------- 60 * Static data -----------------------------------------------------------------
57 */ 61 */
@@ -79,7 +83,52 @@ static const struct sysfs_ops memmap_attr_ops = {
79 .show = memmap_attr_show, 83 .show = memmap_attr_show,
80}; 84};
81 85
82static struct kobj_type memmap_ktype = { 86/* Firmware memory map entries. */
87static LIST_HEAD(map_entries);
88static DEFINE_SPINLOCK(map_entries_lock);
89
90/*
91 * For memory hotplug, there is no way to free memory map entries allocated
92 * by boot mem after the system is up. So when we hot-remove memory whose
93 * map entry is allocated by bootmem, we need to remember the storage and
94 * reuse it when the memory is hot-added again.
95 */
96static LIST_HEAD(map_entries_bootmem);
97static DEFINE_SPINLOCK(map_entries_bootmem_lock);
98
99
100static inline struct firmware_map_entry *
101to_memmap_entry(struct kobject *kobj)
102{
103 return container_of(kobj, struct firmware_map_entry, kobj);
104}
105
106static void __meminit release_firmware_map_entry(struct kobject *kobj)
107{
108 struct firmware_map_entry *entry = to_memmap_entry(kobj);
109
110 if (PageReserved(virt_to_page(entry))) {
111 /*
112 * Remember the storage allocated by bootmem, and reuse it when
113 * the memory is hot-added again. The entry will be added to
114 * map_entries_bootmem here, and deleted from &map_entries in
115 * firmware_map_remove_entry().
116 */
117 if (firmware_map_find_entry(entry->start, entry->end,
118 entry->type)) {
119 spin_lock(&map_entries_bootmem_lock);
120 list_add(&entry->list, &map_entries_bootmem);
121 spin_unlock(&map_entries_bootmem_lock);
122 }
123
124 return;
125 }
126
127 kfree(entry);
128}
129
130static struct kobj_type __refdata memmap_ktype = {
131 .release = release_firmware_map_entry,
83 .sysfs_ops = &memmap_attr_ops, 132 .sysfs_ops = &memmap_attr_ops,
84 .default_attrs = def_attrs, 133 .default_attrs = def_attrs,
85}; 134};
@@ -88,13 +137,6 @@ static struct kobj_type memmap_ktype = {
88 * Registration functions ------------------------------------------------------ 137 * Registration functions ------------------------------------------------------
89 */ 138 */
90 139
91/*
92 * Firmware memory map entries. No locking is needed because the
93 * firmware_map_add() and firmware_map_add_early() functions are called
94 * in firmware initialisation code in one single thread of execution.
95 */
96static LIST_HEAD(map_entries);
97
98/** 140/**
99 * firmware_map_add_entry() - Does the real work to add a firmware memmap entry. 141 * firmware_map_add_entry() - Does the real work to add a firmware memmap entry.
100 * @start: Start of the memory range. 142 * @start: Start of the memory range.
@@ -118,11 +160,25 @@ static int firmware_map_add_entry(u64 start, u64 end,
118 INIT_LIST_HEAD(&entry->list); 160 INIT_LIST_HEAD(&entry->list);
119 kobject_init(&entry->kobj, &memmap_ktype); 161 kobject_init(&entry->kobj, &memmap_ktype);
120 162
163 spin_lock(&map_entries_lock);
121 list_add_tail(&entry->list, &map_entries); 164 list_add_tail(&entry->list, &map_entries);
165 spin_unlock(&map_entries_lock);
122 166
123 return 0; 167 return 0;
124} 168}
125 169
170/**
171 * firmware_map_remove_entry() - Does the real work to remove a firmware
172 * memmap entry.
173 * @entry: removed entry.
174 *
175 * The caller must hold map_entries_lock, and release it properly.
176 **/
177static inline void firmware_map_remove_entry(struct firmware_map_entry *entry)
178{
179 list_del(&entry->list);
180}
181
126/* 182/*
127 * Add memmap entry on sysfs 183 * Add memmap entry on sysfs
128 */ 184 */
@@ -144,6 +200,78 @@ static int add_sysfs_fw_map_entry(struct firmware_map_entry *entry)
144 return 0; 200 return 0;
145} 201}
146 202
203/*
204 * Remove memmap entry on sysfs
205 */
206static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry)
207{
208 kobject_put(&entry->kobj);
209}
210
211/*
212 * firmware_map_find_entry_in_list() - Search memmap entry in a given list.
213 * @start: Start of the memory range.
214 * @end: End of the memory range (exclusive).
215 * @type: Type of the memory range.
216 * @list: In which to find the entry.
217 *
218 * This function is to find the memmap entey of a given memory range in a
219 * given list. The caller must hold map_entries_lock, and must not release
220 * the lock until the processing of the returned entry has completed.
221 *
222 * Return: Pointer to the entry to be found on success, or NULL on failure.
223 */
224static struct firmware_map_entry * __meminit
225firmware_map_find_entry_in_list(u64 start, u64 end, const char *type,
226 struct list_head *list)
227{
228 struct firmware_map_entry *entry;
229
230 list_for_each_entry(entry, list, list)
231 if ((entry->start == start) && (entry->end == end) &&
232 (!strcmp(entry->type, type))) {
233 return entry;
234 }
235
236 return NULL;
237}
238
239/*
240 * firmware_map_find_entry() - Search memmap entry in map_entries.
241 * @start: Start of the memory range.
242 * @end: End of the memory range (exclusive).
243 * @type: Type of the memory range.
244 *
245 * This function is to find the memmap entey of a given memory range.
246 * The caller must hold map_entries_lock, and must not release the lock
247 * until the processing of the returned entry has completed.
248 *
249 * Return: Pointer to the entry to be found on success, or NULL on failure.
250 */
251static struct firmware_map_entry * __meminit
252firmware_map_find_entry(u64 start, u64 end, const char *type)
253{
254 return firmware_map_find_entry_in_list(start, end, type, &map_entries);
255}
256
257/*
258 * firmware_map_find_entry_bootmem() - Search memmap entry in map_entries_bootmem.
259 * @start: Start of the memory range.
260 * @end: End of the memory range (exclusive).
261 * @type: Type of the memory range.
262 *
263 * This function is similar to firmware_map_find_entry except that it find the
264 * given entry in map_entries_bootmem.
265 *
266 * Return: Pointer to the entry to be found on success, or NULL on failure.
267 */
268static struct firmware_map_entry * __meminit
269firmware_map_find_entry_bootmem(u64 start, u64 end, const char *type)
270{
271 return firmware_map_find_entry_in_list(start, end, type,
272 &map_entries_bootmem);
273}
274
147/** 275/**
148 * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do 276 * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do
149 * memory hotplug. 277 * memory hotplug.
@@ -161,9 +289,19 @@ int __meminit firmware_map_add_hotplug(u64 start, u64 end, const char *type)
161{ 289{
162 struct firmware_map_entry *entry; 290 struct firmware_map_entry *entry;
163 291
164 entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC); 292 entry = firmware_map_find_entry_bootmem(start, end, type);
165 if (!entry) 293 if (!entry) {
166 return -ENOMEM; 294 entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC);
295 if (!entry)
296 return -ENOMEM;
297 } else {
298 /* Reuse storage allocated by bootmem. */
299 spin_lock(&map_entries_bootmem_lock);
300 list_del(&entry->list);
301 spin_unlock(&map_entries_bootmem_lock);
302
303 memset(entry, 0, sizeof(*entry));
304 }
167 305
168 firmware_map_add_entry(start, end, type, entry); 306 firmware_map_add_entry(start, end, type, entry);
169 /* create the memmap entry */ 307 /* create the memmap entry */
@@ -196,6 +334,36 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type)
196 return firmware_map_add_entry(start, end, type, entry); 334 return firmware_map_add_entry(start, end, type, entry);
197} 335}
198 336
337/**
338 * firmware_map_remove() - remove a firmware mapping entry
339 * @start: Start of the memory range.
340 * @end: End of the memory range.
341 * @type: Type of the memory range.
342 *
343 * removes a firmware mapping entry.
344 *
345 * Returns 0 on success, or -EINVAL if no entry.
346 **/
347int __meminit firmware_map_remove(u64 start, u64 end, const char *type)
348{
349 struct firmware_map_entry *entry;
350
351 spin_lock(&map_entries_lock);
352 entry = firmware_map_find_entry(start, end - 1, type);
353 if (!entry) {
354 spin_unlock(&map_entries_lock);
355 return -EINVAL;
356 }
357
358 firmware_map_remove_entry(entry);
359 spin_unlock(&map_entries_lock);
360
361 /* remove the memmap entry */
362 remove_sysfs_fw_map_entry(entry);
363
364 return 0;
365}
366
199/* 367/*
200 * Sysfs functions ------------------------------------------------------------- 368 * Sysfs functions -------------------------------------------------------------
201 */ 369 */
@@ -217,8 +385,10 @@ static ssize_t type_show(struct firmware_map_entry *entry, char *buf)
217 return snprintf(buf, PAGE_SIZE, "%s\n", entry->type); 385 return snprintf(buf, PAGE_SIZE, "%s\n", entry->type);
218} 386}
219 387
220#define to_memmap_attr(_attr) container_of(_attr, struct memmap_attribute, attr) 388static inline struct memmap_attribute *to_memmap_attr(struct attribute *attr)
221#define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj) 389{
390 return container_of(attr, struct memmap_attribute, attr);
391}
222 392
223static ssize_t memmap_attr_show(struct kobject *kobj, 393static ssize_t memmap_attr_show(struct kobject *kobj,
224 struct attribute *attr, char *buf) 394 struct attribute *attr, char *buf)
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index d247a35da3c6..7b17a1fdeaf9 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -25,8 +25,8 @@ struct shadow_info {
25/* 25/*
26 * It would be nice if we scaled with the size of transaction. 26 * It would be nice if we scaled with the size of transaction.
27 */ 27 */
28#define HASH_SIZE 256 28#define DM_HASH_SIZE 256
29#define HASH_MASK (HASH_SIZE - 1) 29#define DM_HASH_MASK (DM_HASH_SIZE - 1)
30 30
31struct dm_transaction_manager { 31struct dm_transaction_manager {
32 int is_clone; 32 int is_clone;
@@ -36,7 +36,7 @@ struct dm_transaction_manager {
36 struct dm_space_map *sm; 36 struct dm_space_map *sm;
37 37
38 spinlock_t lock; 38 spinlock_t lock;
39 struct hlist_head buckets[HASH_SIZE]; 39 struct hlist_head buckets[DM_HASH_SIZE];
40}; 40};
41 41
42/*----------------------------------------------------------------*/ 42/*----------------------------------------------------------------*/
@@ -44,7 +44,7 @@ struct dm_transaction_manager {
44static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b) 44static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b)
45{ 45{
46 int r = 0; 46 int r = 0;
47 unsigned bucket = dm_hash_block(b, HASH_MASK); 47 unsigned bucket = dm_hash_block(b, DM_HASH_MASK);
48 struct shadow_info *si; 48 struct shadow_info *si;
49 struct hlist_node *n; 49 struct hlist_node *n;
50 50
@@ -71,7 +71,7 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b)
71 si = kmalloc(sizeof(*si), GFP_NOIO); 71 si = kmalloc(sizeof(*si), GFP_NOIO);
72 if (si) { 72 if (si) {
73 si->where = b; 73 si->where = b;
74 bucket = dm_hash_block(b, HASH_MASK); 74 bucket = dm_hash_block(b, DM_HASH_MASK);
75 spin_lock(&tm->lock); 75 spin_lock(&tm->lock);
76 hlist_add_head(&si->hlist, tm->buckets + bucket); 76 hlist_add_head(&si->hlist, tm->buckets + bucket);
77 spin_unlock(&tm->lock); 77 spin_unlock(&tm->lock);
@@ -86,7 +86,7 @@ static void wipe_shadow_table(struct dm_transaction_manager *tm)
86 int i; 86 int i;
87 87
88 spin_lock(&tm->lock); 88 spin_lock(&tm->lock);
89 for (i = 0; i < HASH_SIZE; i++) { 89 for (i = 0; i < DM_HASH_SIZE; i++) {
90 bucket = tm->buckets + i; 90 bucket = tm->buckets + i;
91 hlist_for_each_entry_safe(si, n, tmp, bucket, hlist) 91 hlist_for_each_entry_safe(si, n, tmp, bucket, hlist)
92 kfree(si); 92 kfree(si);
@@ -115,7 +115,7 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm,
115 tm->sm = sm; 115 tm->sm = sm;
116 116
117 spin_lock_init(&tm->lock); 117 spin_lock_init(&tm->lock);
118 for (i = 0; i < HASH_SIZE; i++) 118 for (i = 0; i < DM_HASH_SIZE; i++)
119 INIT_HLIST_HEAD(tm->buckets + i); 119 INIT_HLIST_HEAD(tm->buckets + i);
120 120
121 return tm; 121 return tm;
diff --git a/drivers/staging/zcache/zbud.c b/drivers/staging/zcache/zbud.c
index 328c397ea5dc..fdff5c6a0239 100644
--- a/drivers/staging/zcache/zbud.c
+++ b/drivers/staging/zcache/zbud.c
@@ -404,7 +404,7 @@ static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage,
404 else 404 else
405 zbud_pers_pageframes--; 405 zbud_pers_pageframes--;
406 zbudpage_spin_unlock(zbudpage); 406 zbudpage_spin_unlock(zbudpage);
407 reset_page_mapcount(page); 407 page_mapcount_reset(page);
408 init_page_count(page); 408 init_page_count(page);
409 page->index = 0; 409 page->index = 0;
410 return page; 410 return page;
diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/drivers/staging/zsmalloc/zsmalloc-main.c
index 06f73a93a44d..e78d262c5249 100644
--- a/drivers/staging/zsmalloc/zsmalloc-main.c
+++ b/drivers/staging/zsmalloc/zsmalloc-main.c
@@ -472,7 +472,7 @@ static void reset_page(struct page *page)
472 set_page_private(page, 0); 472 set_page_private(page, 0);
473 page->mapping = NULL; 473 page->mapping = NULL;
474 page->freelist = NULL; 474 page->freelist = NULL;
475 reset_page_mapcount(page); 475 page_mapcount_reset(page);
476} 476}
477 477
478static void free_zspage(struct page *first_page) 478static void free_zspage(struct page *first_page)
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 1775ad471edd..5480352f984d 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -5177,6 +5177,7 @@ int usb_reset_device(struct usb_device *udev)
5177{ 5177{
5178 int ret; 5178 int ret;
5179 int i; 5179 int i;
5180 unsigned int noio_flag;
5180 struct usb_host_config *config = udev->actconfig; 5181 struct usb_host_config *config = udev->actconfig;
5181 5182
5182 if (udev->state == USB_STATE_NOTATTACHED || 5183 if (udev->state == USB_STATE_NOTATTACHED ||
@@ -5186,6 +5187,17 @@ int usb_reset_device(struct usb_device *udev)
5186 return -EINVAL; 5187 return -EINVAL;
5187 } 5188 }
5188 5189
5190 /*
5191 * Don't allocate memory with GFP_KERNEL in current
5192 * context to avoid possible deadlock if usb mass
5193 * storage interface or usbnet interface(iSCSI case)
5195 * is included in current configuration. The easiest
5195 * approach is to do it for every device reset,
5197 * because the device 'memalloc_noio' flag may not
5198 * have been set before resetting the usb device.
5198 */
5199 noio_flag = memalloc_noio_save();
5200
5189 /* Prevent autosuspend during the reset */ 5201 /* Prevent autosuspend during the reset */
5190 usb_autoresume_device(udev); 5202 usb_autoresume_device(udev);
5191 5203
@@ -5230,6 +5242,7 @@ int usb_reset_device(struct usb_device *udev)
5230 } 5242 }
5231 5243
5232 usb_autosuspend_device(udev); 5244 usb_autosuspend_device(udev);
5245 memalloc_noio_restore(noio_flag);
5233 return ret; 5246 return ret;
5234} 5247}
5235EXPORT_SYMBOL_GPL(usb_reset_device); 5248EXPORT_SYMBOL_GPL(usb_reset_device);
diff --git a/fs/aio.c b/fs/aio.c
index 71f613cf4a85..064bfbe37566 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -101,7 +101,7 @@ static int aio_setup_ring(struct kioctx *ctx)
101 struct aio_ring *ring; 101 struct aio_ring *ring;
102 struct aio_ring_info *info = &ctx->ring_info; 102 struct aio_ring_info *info = &ctx->ring_info;
103 unsigned nr_events = ctx->max_reqs; 103 unsigned nr_events = ctx->max_reqs;
104 unsigned long size; 104 unsigned long size, populate;
105 int nr_pages; 105 int nr_pages;
106 106
107 /* Compensate for the ring buffer's head/tail overlap entry */ 107 /* Compensate for the ring buffer's head/tail overlap entry */
@@ -129,7 +129,8 @@ static int aio_setup_ring(struct kioctx *ctx)
129 down_write(&ctx->mm->mmap_sem); 129 down_write(&ctx->mm->mmap_sem);
130 info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, 130 info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size,
131 PROT_READ|PROT_WRITE, 131 PROT_READ|PROT_WRITE,
132 MAP_ANONYMOUS|MAP_PRIVATE, 0); 132 MAP_ANONYMOUS|MAP_PRIVATE, 0,
133 &populate);
133 if (IS_ERR((void *)info->mmap_base)) { 134 if (IS_ERR((void *)info->mmap_base)) {
134 up_write(&ctx->mm->mmap_sem); 135 up_write(&ctx->mm->mmap_sem);
135 info->mmap_size = 0; 136 info->mmap_size = 0;
@@ -147,6 +148,8 @@ static int aio_setup_ring(struct kioctx *ctx)
147 aio_free_ring(ctx); 148 aio_free_ring(ctx);
148 return -EAGAIN; 149 return -EAGAIN;
149 } 150 }
151 if (populate)
152 mm_populate(info->mmap_base, populate);
150 153
151 ctx->user_id = info->mmap_base; 154 ctx->user_id = info->mmap_base;
152 155
diff --git a/fs/buffer.c b/fs/buffer.c
index 2ea9cd44aeae..62169c192c21 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3227,7 +3227,7 @@ static struct kmem_cache *bh_cachep __read_mostly;
3227 * Once the number of bh's in the machine exceeds this level, we start 3227 * Once the number of bh's in the machine exceeds this level, we start
3228 * stripping them in writeback. 3228 * stripping them in writeback.
3229 */ 3229 */
3230static int max_buffer_heads; 3230static unsigned long max_buffer_heads;
3231 3231
3232int buffer_heads_over_limit; 3232int buffer_heads_over_limit;
3233 3233
@@ -3343,7 +3343,7 @@ EXPORT_SYMBOL(bh_submit_read);
3343 3343
3344void __init buffer_init(void) 3344void __init buffer_init(void)
3345{ 3345{
3346 int nrpages; 3346 unsigned long nrpages;
3347 3347
3348 bh_cachep = kmem_cache_create("buffer_head", 3348 bh_cachep = kmem_cache_create("buffer_head",
3349 sizeof(struct buffer_head), 0, 3349 sizeof(struct buffer_head), 0,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index ac8ed96c4199..499e957510e7 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -151,7 +151,7 @@ get_nfs4_file(struct nfs4_file *fi)
151} 151}
152 152
153static int num_delegations; 153static int num_delegations;
154unsigned int max_delegations; 154unsigned long max_delegations;
155 155
156/* 156/*
157 * Open owner state (share locks) 157 * Open owner state (share locks)
@@ -700,8 +700,8 @@ static int nfsd4_get_drc_mem(int slotsize, u32 num)
700 num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION); 700 num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION);
701 701
702 spin_lock(&nfsd_drc_lock); 702 spin_lock(&nfsd_drc_lock);
703 avail = min_t(int, NFSD_MAX_MEM_PER_SESSION, 703 avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION,
704 nfsd_drc_max_mem - nfsd_drc_mem_used); 704 nfsd_drc_max_mem - nfsd_drc_mem_used);
705 num = min_t(int, num, avail / slotsize); 705 num = min_t(int, num, avail / slotsize);
706 nfsd_drc_mem_used += num * slotsize; 706 nfsd_drc_mem_used += num * slotsize;
707 spin_unlock(&nfsd_drc_lock); 707 spin_unlock(&nfsd_drc_lock);
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index de23db255c69..07a473fd49bc 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -56,8 +56,8 @@ extern struct svc_version nfsd_version2, nfsd_version3,
56extern u32 nfsd_supported_minorversion; 56extern u32 nfsd_supported_minorversion;
57extern struct mutex nfsd_mutex; 57extern struct mutex nfsd_mutex;
58extern spinlock_t nfsd_drc_lock; 58extern spinlock_t nfsd_drc_lock;
59extern unsigned int nfsd_drc_max_mem; 59extern unsigned long nfsd_drc_max_mem;
60extern unsigned int nfsd_drc_mem_used; 60extern unsigned long nfsd_drc_mem_used;
61 61
62extern const struct seq_operations nfs_exports_op; 62extern const struct seq_operations nfs_exports_op;
63 63
@@ -106,7 +106,7 @@ static inline int nfsd_v4client(struct svc_rqst *rq)
106 * NFSv4 State 106 * NFSv4 State
107 */ 107 */
108#ifdef CONFIG_NFSD_V4 108#ifdef CONFIG_NFSD_V4
109extern unsigned int max_delegations; 109extern unsigned long max_delegations;
110void nfs4_state_init(void); 110void nfs4_state_init(void);
111int nfsd4_init_slabs(void); 111int nfsd4_init_slabs(void);
112void nfsd4_free_slabs(void); 112void nfsd4_free_slabs(void);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index cee62ab9d4a3..be7af509930c 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -59,8 +59,8 @@ DEFINE_MUTEX(nfsd_mutex);
59 * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. 59 * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.
60 */ 60 */
61spinlock_t nfsd_drc_lock; 61spinlock_t nfsd_drc_lock;
62unsigned int nfsd_drc_max_mem; 62unsigned long nfsd_drc_max_mem;
63unsigned int nfsd_drc_mem_used; 63unsigned long nfsd_drc_mem_used;
64 64
65#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) 65#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
66static struct svc_stat nfsd_acl_svcstats; 66static struct svc_stat nfsd_acl_svcstats;
@@ -342,7 +342,7 @@ static void set_max_drc(void)
342 >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; 342 >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
343 nfsd_drc_mem_used = 0; 343 nfsd_drc_mem_used = 0;
344 spin_lock_init(&nfsd_drc_lock); 344 spin_lock_init(&nfsd_drc_lock);
345 dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem); 345 dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem);
346} 346}
347 347
348static int nfsd_get_default_max_blksize(void) 348static int nfsd_get_default_max_blksize(void)
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 80e4645f7990..1efaaa19c4f3 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
40 * sysctl_overcommit_ratio / 100) + total_swap_pages; 40 * sysctl_overcommit_ratio / 100) + total_swap_pages;
41 41
42 cached = global_page_state(NR_FILE_PAGES) - 42 cached = global_page_state(NR_FILE_PAGES) -
43 total_swapcache_pages - i.bufferram; 43 total_swapcache_pages() - i.bufferram;
44 if (cached < 0) 44 if (cached < 0)
45 cached = 0; 45 cached = 0;
46 46
@@ -109,7 +109,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
109 K(i.freeram), 109 K(i.freeram),
110 K(i.bufferram), 110 K(i.bufferram),
111 K(cached), 111 K(cached),
112 K(total_swapcache_pages), 112 K(total_swapcache_pages()),
113 K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), 113 K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]),
114 K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), 114 K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
115 K(pages[LRU_ACTIVE_ANON]), 115 K(pages[LRU_ACTIVE_ANON]),
@@ -158,7 +158,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
158 vmi.used >> 10, 158 vmi.used >> 10,
159 vmi.largest_chunk >> 10 159 vmi.largest_chunk >> 10
160#ifdef CONFIG_MEMORY_FAILURE 160#ifdef CONFIG_MEMORY_FAILURE
161 ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) 161 ,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
162#endif 162#endif
163#ifdef CONFIG_TRANSPARENT_HUGEPAGE 163#ifdef CONFIG_TRANSPARENT_HUGEPAGE
164 ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * 164 ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index bcbdd7484e58..f46cfd73a553 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -485,6 +485,14 @@ static inline bool acpi_driver_match_device(struct device *dev,
485 485
486#endif /* !CONFIG_ACPI */ 486#endif /* !CONFIG_ACPI */
487 487
488#ifdef CONFIG_ACPI_NUMA
489void __init early_parse_srat(void);
490#else
491static inline void early_parse_srat(void)
492{
493}
494#endif
495
488#ifdef CONFIG_ACPI 496#ifdef CONFIG_ACPI
489void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, 497void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state,
490 u32 pm1a_ctrl, u32 pm1b_ctrl)); 498 u32 pm1a_ctrl, u32 pm1b_ctrl));
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 3cd16ba82f15..cdc3bab01832 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
53 unsigned long size); 53 unsigned long size);
54extern void free_bootmem(unsigned long physaddr, unsigned long size); 54extern void free_bootmem(unsigned long physaddr, unsigned long size);
55extern void free_bootmem_late(unsigned long physaddr, unsigned long size); 55extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
56extern void __free_pages_bootmem(struct page *page, unsigned int order);
56 57
57/* 58/*
58 * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, 59 * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index cc7bddeaf553..091d72e70d8a 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -23,7 +23,7 @@ extern int fragmentation_index(struct zone *zone, unsigned int order);
23extern unsigned long try_to_compact_pages(struct zonelist *zonelist, 23extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
24 int order, gfp_t gfp_mask, nodemask_t *mask, 24 int order, gfp_t gfp_mask, nodemask_t *mask,
25 bool sync, bool *contended); 25 bool sync, bool *contended);
26extern int compact_pgdat(pg_data_t *pgdat, int order); 26extern void compact_pgdat(pg_data_t *pgdat, int order);
27extern void reset_isolation_suitable(pg_data_t *pgdat); 27extern void reset_isolation_suitable(pg_data_t *pgdat);
28extern unsigned long compaction_suitable(struct zone *zone, int order); 28extern unsigned long compaction_suitable(struct zone *zone, int order);
29 29
@@ -80,9 +80,8 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
80 return COMPACT_CONTINUE; 80 return COMPACT_CONTINUE;
81} 81}
82 82
83static inline int compact_pgdat(pg_data_t *pgdat, int order) 83static inline void compact_pgdat(pg_data_t *pgdat, int order)
84{ 84{
85 return COMPACT_CONTINUE;
86} 85}
87 86
88static inline void reset_isolation_suitable(pg_data_t *pgdat) 87static inline void reset_isolation_suitable(pg_data_t *pgdat)
diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h
index 43fe52fcef0f..71d4fa721db9 100644
--- a/include/linux/firmware-map.h
+++ b/include/linux/firmware-map.h
@@ -25,6 +25,7 @@
25 25
26int firmware_map_add_early(u64 start, u64 end, const char *type); 26int firmware_map_add_early(u64 start, u64 end, const char *type);
27int firmware_map_add_hotplug(u64 start, u64 end, const char *type); 27int firmware_map_add_hotplug(u64 start, u64 end, const char *type);
28int firmware_map_remove(u64 start, u64 end, const char *type);
28 29
29#else /* CONFIG_FIRMWARE_MEMMAP */ 30#else /* CONFIG_FIRMWARE_MEMMAP */
30 31
@@ -38,6 +39,11 @@ static inline int firmware_map_add_hotplug(u64 start, u64 end, const char *type)
38 return 0; 39 return 0;
39} 40}
40 41
42static inline int firmware_map_remove(u64 start, u64 end, const char *type)
43{
44 return 0;
45}
46
41#endif /* CONFIG_FIRMWARE_MEMMAP */ 47#endif /* CONFIG_FIRMWARE_MEMMAP */
42 48
43#endif /* _LINUX_FIRMWARE_MAP_H */ 49#endif /* _LINUX_FIRMWARE_MAP_H */
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index ef788b5b4a35..7fb31da45d03 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -219,12 +219,6 @@ static inline void zero_user(struct page *page,
219 zero_user_segments(page, start, start + size, 0, 0); 219 zero_user_segments(page, start, start + size, 0, 0);
220} 220}
221 221
222static inline void __deprecated memclear_highpage_flush(struct page *page,
223 unsigned int offset, unsigned int size)
224{
225 zero_user(page, offset, size);
226}
227
228#ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE 222#ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE
229 223
230static inline void copy_user_highpage(struct page *to, struct page *from, 224static inline void copy_user_highpage(struct page *to, struct page *from,
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1d76f8ca90f0..ee1c244a62a1 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -113,7 +113,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma,
113 do { \ 113 do { \
114 pmd_t *____pmd = (__pmd); \ 114 pmd_t *____pmd = (__pmd); \
115 anon_vma_lock_write(__anon_vma); \ 115 anon_vma_lock_write(__anon_vma); \
116 anon_vma_unlock(__anon_vma); \ 116 anon_vma_unlock_write(__anon_vma); \
117 BUG_ON(pmd_trans_splitting(*____pmd) || \ 117 BUG_ON(pmd_trans_splitting(*____pmd) || \
118 pmd_trans_huge(*____pmd)); \ 118 pmd_trans_huge(*____pmd)); \
119 } while (0) 119 } while (0)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 0c80d3f57a5b..eedc334fb6f5 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -43,9 +43,9 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int,
43#endif 43#endif
44 44
45int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); 45int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
46int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, 46long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
47 struct page **, struct vm_area_struct **, 47 struct page **, struct vm_area_struct **,
48 unsigned long *, int *, int, unsigned int flags); 48 unsigned long *, unsigned long *, long, unsigned int);
49void unmap_hugepage_range(struct vm_area_struct *, 49void unmap_hugepage_range(struct vm_area_struct *,
50 unsigned long, unsigned long, struct page *); 50 unsigned long, unsigned long, struct page *);
51void __unmap_hugepage_range_final(struct mmu_gather *tlb, 51void __unmap_hugepage_range_final(struct mmu_gather *tlb,
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 3319a6967626..45c9b6a17bcb 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -16,9 +16,6 @@
16struct stable_node; 16struct stable_node;
17struct mem_cgroup; 17struct mem_cgroup;
18 18
19struct page *ksm_does_need_to_copy(struct page *page,
20 struct vm_area_struct *vma, unsigned long address);
21
22#ifdef CONFIG_KSM 19#ifdef CONFIG_KSM
23int ksm_madvise(struct vm_area_struct *vma, unsigned long start, 20int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
24 unsigned long end, int advice, unsigned long *vm_flags); 21 unsigned long end, int advice, unsigned long *vm_flags);
@@ -73,15 +70,8 @@ static inline void set_page_stable_node(struct page *page,
73 * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, 70 * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
74 * but what if the vma was unmerged while the page was swapped out? 71 * but what if the vma was unmerged while the page was swapped out?
75 */ 72 */
76static inline int ksm_might_need_to_copy(struct page *page, 73struct page *ksm_might_need_to_copy(struct page *page,
77 struct vm_area_struct *vma, unsigned long address) 74 struct vm_area_struct *vma, unsigned long address);
78{
79 struct anon_vma *anon_vma = page_anon_vma(page);
80
81 return anon_vma &&
82 (anon_vma->root != vma->anon_vma->root ||
83 page->index != linear_page_index(vma, address));
84}
85 75
86int page_referenced_ksm(struct page *page, 76int page_referenced_ksm(struct page *page,
87 struct mem_cgroup *memcg, unsigned long *vm_flags); 77 struct mem_cgroup *memcg, unsigned long *vm_flags);
@@ -113,10 +103,10 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
113 return 0; 103 return 0;
114} 104}
115 105
116static inline int ksm_might_need_to_copy(struct page *page, 106static inline struct page *ksm_might_need_to_copy(struct page *page,
117 struct vm_area_struct *vma, unsigned long address) 107 struct vm_area_struct *vma, unsigned long address)
118{ 108{
119 return 0; 109 return page;
120} 110}
121 111
122static inline int page_referenced_ksm(struct page *page, 112static inline int page_referenced_ksm(struct page *page,
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f388203db7e8..3e5ecb2d790e 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -42,6 +42,7 @@ struct memblock {
42 42
43extern struct memblock memblock; 43extern struct memblock memblock;
44extern int memblock_debug; 44extern int memblock_debug;
45extern struct movablemem_map movablemem_map;
45 46
46#define memblock_dbg(fmt, ...) \ 47#define memblock_dbg(fmt, ...) \
47 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) 48 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
@@ -60,6 +61,7 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size);
60void memblock_trim_memory(phys_addr_t align); 61void memblock_trim_memory(phys_addr_t align);
61 62
62#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 63#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
64
63void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, 65void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
64 unsigned long *out_end_pfn, int *out_nid); 66 unsigned long *out_end_pfn, int *out_nid);
65 67
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 28bd5fa2ff2e..d6183f06d8c1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -116,7 +116,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
116 * For memory reclaim. 116 * For memory reclaim.
117 */ 117 */
118int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec); 118int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
119int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec);
120int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); 119int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
121unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); 120unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
122void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); 121void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
@@ -321,12 +320,6 @@ mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
321 return 1; 320 return 1;
322} 321}
323 322
324static inline int
325mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
326{
327 return 1;
328}
329
330static inline unsigned long 323static inline unsigned long
331mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) 324mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
332{ 325{
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 4a45c4e50025..b6a3be7d47bf 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -96,6 +96,7 @@ extern void __online_page_free(struct page *page);
96 96
97#ifdef CONFIG_MEMORY_HOTREMOVE 97#ifdef CONFIG_MEMORY_HOTREMOVE
98extern bool is_pageblock_removable_nolock(struct page *page); 98extern bool is_pageblock_removable_nolock(struct page *page);
99extern int arch_remove_memory(u64 start, u64 size);
99#endif /* CONFIG_MEMORY_HOTREMOVE */ 100#endif /* CONFIG_MEMORY_HOTREMOVE */
100 101
101/* reasonably generic interface to expand the physical pages in a zone */ 102/* reasonably generic interface to expand the physical pages in a zone */
@@ -173,17 +174,16 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
173#endif /* CONFIG_NUMA */ 174#endif /* CONFIG_NUMA */
174#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ 175#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
175 176
176#ifdef CONFIG_SPARSEMEM_VMEMMAP 177#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
178extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
179#else
177static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) 180static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
178{ 181{
179} 182}
180static inline void put_page_bootmem(struct page *page)
181{
182}
183#else
184extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
185extern void put_page_bootmem(struct page *page);
186#endif 183#endif
184extern void put_page_bootmem(struct page *page);
185extern void get_page_bootmem(unsigned long ingo, struct page *page,
186 unsigned long type);
187 187
188/* 188/*
189 * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug 189 * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug
@@ -233,6 +233,7 @@ static inline void unlock_memory_hotplug(void) {}
233#ifdef CONFIG_MEMORY_HOTREMOVE 233#ifdef CONFIG_MEMORY_HOTREMOVE
234 234
235extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); 235extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
236extern void try_offline_node(int nid);
236 237
237#else 238#else
238static inline int is_mem_section_removable(unsigned long pfn, 239static inline int is_mem_section_removable(unsigned long pfn,
@@ -240,6 +241,8 @@ static inline int is_mem_section_removable(unsigned long pfn,
240{ 241{
241 return 0; 242 return 0;
242} 243}
244
245static inline void try_offline_node(int nid) {}
243#endif /* CONFIG_MEMORY_HOTREMOVE */ 246#endif /* CONFIG_MEMORY_HOTREMOVE */
244 247
245extern int mem_online_node(int nid); 248extern int mem_online_node(int nid);
@@ -247,7 +250,8 @@ extern int add_memory(int nid, u64 start, u64 size);
247extern int arch_add_memory(int nid, u64 start, u64 size); 250extern int arch_add_memory(int nid, u64 start, u64 size);
248extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); 251extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
249extern int offline_memory_block(struct memory_block *mem); 252extern int offline_memory_block(struct memory_block *mem);
250extern int remove_memory(u64 start, u64 size); 253extern bool is_memblock_offlined(struct memory_block *mem);
254extern int remove_memory(int nid, u64 start, u64 size);
251extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, 255extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
252 int nr_pages); 256 int nr_pages);
253extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms); 257extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 1e9f627967a3..a405d3dc0f61 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -40,11 +40,9 @@ extern void putback_movable_pages(struct list_head *l);
40extern int migrate_page(struct address_space *, 40extern int migrate_page(struct address_space *,
41 struct page *, struct page *, enum migrate_mode); 41 struct page *, struct page *, enum migrate_mode);
42extern int migrate_pages(struct list_head *l, new_page_t x, 42extern int migrate_pages(struct list_head *l, new_page_t x,
43 unsigned long private, bool offlining, 43 unsigned long private, enum migrate_mode mode, int reason);
44 enum migrate_mode mode, int reason);
45extern int migrate_huge_page(struct page *, new_page_t x, 44extern int migrate_huge_page(struct page *, new_page_t x,
46 unsigned long private, bool offlining, 45 unsigned long private, enum migrate_mode mode);
47 enum migrate_mode mode);
48 46
49extern int fail_migrate_page(struct address_space *, 47extern int fail_migrate_page(struct address_space *,
50 struct page *, struct page *); 48 struct page *, struct page *);
@@ -62,11 +60,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
62static inline void putback_lru_pages(struct list_head *l) {} 60static inline void putback_lru_pages(struct list_head *l) {}
63static inline void putback_movable_pages(struct list_head *l) {} 61static inline void putback_movable_pages(struct list_head *l) {}
64static inline int migrate_pages(struct list_head *l, new_page_t x, 62static inline int migrate_pages(struct list_head *l, new_page_t x,
65 unsigned long private, bool offlining, 63 unsigned long private, enum migrate_mode mode, int reason)
66 enum migrate_mode mode, int reason) { return -ENOSYS; } 64 { return -ENOSYS; }
67static inline int migrate_huge_page(struct page *page, new_page_t x, 65static inline int migrate_huge_page(struct page *page, new_page_t x,
68 unsigned long private, bool offlining, 66 unsigned long private, enum migrate_mode mode)
69 enum migrate_mode mode) { return -ENOSYS; } 67 { return -ENOSYS; }
70 68
71static inline int migrate_prep(void) { return -ENOSYS; } 69static inline int migrate_prep(void) { return -ENOSYS; }
72static inline int migrate_prep_local(void) { return -ENOSYS; } 70static inline int migrate_prep_local(void) { return -ENOSYS; }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9d9dcc35d6a1..e7c3f9a0111a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -87,6 +87,7 @@ extern unsigned int kobjsize(const void *objp);
87#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ 87#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
88#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ 88#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
89 89
90#define VM_POPULATE 0x00001000
90#define VM_LOCKED 0x00002000 91#define VM_LOCKED 0x00002000
91#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ 92#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
92 93
@@ -366,7 +367,7 @@ static inline struct page *compound_head(struct page *page)
366 * both from it and to it can be tracked, using atomic_inc_and_test 367 * both from it and to it can be tracked, using atomic_inc_and_test
367 * and atomic_add_negative(-1). 368 * and atomic_add_negative(-1).
368 */ 369 */
369static inline void reset_page_mapcount(struct page *page) 370static inline void page_mapcount_reset(struct page *page)
370{ 371{
371 atomic_set(&(page)->_mapcount, -1); 372 atomic_set(&(page)->_mapcount, -1);
372} 373}
@@ -580,50 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
580 * sets it, so none of the operations on it need to be atomic. 581 * sets it, so none of the operations on it need to be atomic.
581 */ 582 */
582 583
583 584/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */
584/*
585 * page->flags layout:
586 *
587 * There are three possibilities for how page->flags get
588 * laid out. The first is for the normal case, without
589 * sparsemem. The second is for sparsemem when there is
590 * plenty of space for node and section. The last is when
591 * we have run out of space and have to fall back to an
592 * alternate (slower) way of determining the node.
593 *
594 * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS |
595 * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS |
596 * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS |
597 */
598#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
599#define SECTIONS_WIDTH SECTIONS_SHIFT
600#else
601#define SECTIONS_WIDTH 0
602#endif
603
604#define ZONES_WIDTH ZONES_SHIFT
605
606#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
607#define NODES_WIDTH NODES_SHIFT
608#else
609#ifdef CONFIG_SPARSEMEM_VMEMMAP
610#error "Vmemmap: No space for nodes field in page flags"
611#endif
612#define NODES_WIDTH 0
613#endif
614
615/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
616#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) 585#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
617#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) 586#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH)
618#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) 587#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
619 588#define LAST_NID_PGOFF (ZONES_PGOFF - LAST_NID_WIDTH)
620/*
621 * We are going to use the flags for the page to node mapping if its in
622 * there. This includes the case where there is no node, so it is implicit.
623 */
624#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
625#define NODE_NOT_IN_PAGE_FLAGS
626#endif
627 589
628/* 590/*
629 * Define the bit shifts to access each section. For non-existent 591 * Define the bit shifts to access each section. For non-existent
@@ -633,6 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
633#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) 595#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
634#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) 596#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0))
635#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) 597#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))
598#define LAST_NID_PGSHIFT (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0))
636 599
637/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ 600/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
638#ifdef NODE_NOT_IN_PAGE_FLAGS 601#ifdef NODE_NOT_IN_PAGE_FLAGS
@@ -654,6 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
654#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) 617#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)
655#define NODES_MASK ((1UL << NODES_WIDTH) - 1) 618#define NODES_MASK ((1UL << NODES_WIDTH) - 1)
656#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) 619#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1)
620#define LAST_NID_MASK ((1UL << LAST_NID_WIDTH) - 1)
657#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) 621#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1)
658 622
659static inline enum zone_type page_zonenum(const struct page *page) 623static inline enum zone_type page_zonenum(const struct page *page)
@@ -661,6 +625,10 @@ static inline enum zone_type page_zonenum(const struct page *page)
661 return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; 625 return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
662} 626}
663 627
628#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
629#define SECTION_IN_PAGE_FLAGS
630#endif
631
664/* 632/*
665 * The identification function is only used by the buddy allocator for 633 * The identification function is only used by the buddy allocator for
666 * determining if two pages could be buddies. We are not really 634 * determining if two pages could be buddies. We are not really
@@ -693,31 +661,48 @@ static inline int page_to_nid(const struct page *page)
693#endif 661#endif
694 662
695#ifdef CONFIG_NUMA_BALANCING 663#ifdef CONFIG_NUMA_BALANCING
696static inline int page_xchg_last_nid(struct page *page, int nid) 664#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
665static inline int page_nid_xchg_last(struct page *page, int nid)
697{ 666{
698 return xchg(&page->_last_nid, nid); 667 return xchg(&page->_last_nid, nid);
699} 668}
700 669
701static inline int page_last_nid(struct page *page) 670static inline int page_nid_last(struct page *page)
702{ 671{
703 return page->_last_nid; 672 return page->_last_nid;
704} 673}
705static inline void reset_page_last_nid(struct page *page) 674static inline void page_nid_reset_last(struct page *page)
706{ 675{
707 page->_last_nid = -1; 676 page->_last_nid = -1;
708} 677}
709#else 678#else
710static inline int page_xchg_last_nid(struct page *page, int nid) 679static inline int page_nid_last(struct page *page)
680{
681 return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK;
682}
683
684extern int page_nid_xchg_last(struct page *page, int nid);
685
686static inline void page_nid_reset_last(struct page *page)
687{
688 int nid = (1 << LAST_NID_SHIFT) - 1;
689
690 page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
691 page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
692}
693#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */
694#else
695static inline int page_nid_xchg_last(struct page *page, int nid)
711{ 696{
712 return page_to_nid(page); 697 return page_to_nid(page);
713} 698}
714 699
715static inline int page_last_nid(struct page *page) 700static inline int page_nid_last(struct page *page)
716{ 701{
717 return page_to_nid(page); 702 return page_to_nid(page);
718} 703}
719 704
720static inline void reset_page_last_nid(struct page *page) 705static inline void page_nid_reset_last(struct page *page)
721{ 706{
722} 707}
723#endif 708#endif
@@ -727,7 +712,7 @@ static inline struct zone *page_zone(const struct page *page)
727 return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; 712 return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
728} 713}
729 714
730#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) 715#ifdef SECTION_IN_PAGE_FLAGS
731static inline void set_page_section(struct page *page, unsigned long section) 716static inline void set_page_section(struct page *page, unsigned long section)
732{ 717{
733 page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); 718 page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
@@ -757,7 +742,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
757{ 742{
758 set_page_zone(page, zone); 743 set_page_zone(page, zone);
759 set_page_node(page, node); 744 set_page_node(page, node);
760#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) 745#ifdef SECTION_IN_PAGE_FLAGS
761 set_page_section(page, pfn_to_section_nr(pfn)); 746 set_page_section(page, pfn_to_section_nr(pfn));
762#endif 747#endif
763} 748}
@@ -817,18 +802,7 @@ void page_address_init(void);
817#define PAGE_MAPPING_KSM 2 802#define PAGE_MAPPING_KSM 2
818#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) 803#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
819 804
820extern struct address_space swapper_space; 805extern struct address_space *page_mapping(struct page *page);
821static inline struct address_space *page_mapping(struct page *page)
822{
823 struct address_space *mapping = page->mapping;
824
825 VM_BUG_ON(PageSlab(page));
826 if (unlikely(PageSwapCache(page)))
827 mapping = &swapper_space;
828 else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
829 mapping = NULL;
830 return mapping;
831}
832 806
833/* Neutral page->mapping pointer to address_space or anon_vma or other */ 807/* Neutral page->mapping pointer to address_space or anon_vma or other */
834static inline void *page_rmapping(struct page *page) 808static inline void *page_rmapping(struct page *page)
@@ -1035,18 +1009,18 @@ static inline int fixup_user_fault(struct task_struct *tsk,
1035} 1009}
1036#endif 1010#endif
1037 1011
1038extern int make_pages_present(unsigned long addr, unsigned long end);
1039extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); 1012extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
1040extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, 1013extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
1041 void *buf, int len, int write); 1014 void *buf, int len, int write);
1042 1015
1043int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1016long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1044 unsigned long start, int len, unsigned int foll_flags, 1017 unsigned long start, unsigned long nr_pages,
1045 struct page **pages, struct vm_area_struct **vmas, 1018 unsigned int foll_flags, struct page **pages,
1046 int *nonblocking); 1019 struct vm_area_struct **vmas, int *nonblocking);
1047int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1020long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1048 unsigned long start, int nr_pages, int write, int force, 1021 unsigned long start, unsigned long nr_pages,
1049 struct page **pages, struct vm_area_struct **vmas); 1022 int write, int force, struct page **pages,
1023 struct vm_area_struct **vmas);
1050int get_user_pages_fast(unsigned long start, int nr_pages, int write, 1024int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1051 struct page **pages); 1025 struct page **pages);
1052struct kvec; 1026struct kvec;
@@ -1359,6 +1333,24 @@ extern void free_bootmem_with_active_regions(int nid,
1359 unsigned long max_low_pfn); 1333 unsigned long max_low_pfn);
1360extern void sparse_memory_present_with_active_regions(int nid); 1334extern void sparse_memory_present_with_active_regions(int nid);
1361 1335
1336#define MOVABLEMEM_MAP_MAX MAX_NUMNODES
1337struct movablemem_entry {
1338 unsigned long start_pfn; /* start pfn of memory segment */
1339 unsigned long end_pfn; /* end pfn of memory segment (exclusive) */
1340};
1341
1342struct movablemem_map {
1343 bool acpi; /* true if using SRAT info */
1344 int nr_map;
1345 struct movablemem_entry map[MOVABLEMEM_MAP_MAX];
1346 nodemask_t numa_nodes_hotplug; /* on which nodes we specify memory */
1347 nodemask_t numa_nodes_kernel; /* on which nodes kernel resides in */
1348};
1349
1350extern void __init insert_movablemem_map(unsigned long start_pfn,
1351 unsigned long end_pfn);
1352extern int __init movablemem_map_overlap(unsigned long start_pfn,
1353 unsigned long end_pfn);
1362#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 1354#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
1363 1355
1364#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ 1356#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
@@ -1395,6 +1387,9 @@ extern void setup_per_cpu_pageset(void);
1395extern void zone_pcp_update(struct zone *zone); 1387extern void zone_pcp_update(struct zone *zone);
1396extern void zone_pcp_reset(struct zone *zone); 1388extern void zone_pcp_reset(struct zone *zone);
1397 1389
1390/* page_alloc.c */
1391extern int min_free_kbytes;
1392
1398/* nommu.c */ 1393/* nommu.c */
1399extern atomic_long_t mmap_pages_allocated; 1394extern atomic_long_t mmap_pages_allocated;
1400extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); 1395extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
@@ -1472,13 +1467,24 @@ extern int install_special_mapping(struct mm_struct *mm,
1472extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); 1467extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
1473 1468
1474extern unsigned long mmap_region(struct file *file, unsigned long addr, 1469extern unsigned long mmap_region(struct file *file, unsigned long addr,
1475 unsigned long len, unsigned long flags, 1470 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff);
1476 vm_flags_t vm_flags, unsigned long pgoff); 1471extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1477extern unsigned long do_mmap_pgoff(struct file *, unsigned long, 1472 unsigned long len, unsigned long prot, unsigned long flags,
1478 unsigned long, unsigned long, 1473 unsigned long pgoff, unsigned long *populate);
1479 unsigned long, unsigned long);
1480extern int do_munmap(struct mm_struct *, unsigned long, size_t); 1474extern int do_munmap(struct mm_struct *, unsigned long, size_t);
1481 1475
#ifdef CONFIG_MMU
extern int __mm_populate(unsigned long addr, unsigned long len,
			 int ignore_errors);

/*
 * mm_populate - call __mm_populate() with ignore_errors set to 1 and
 * discard its return value.
 */
static inline void mm_populate(unsigned long addr, unsigned long len)
{
	(void) __mm_populate(addr, len, 1);
}
#else /* !CONFIG_MMU */
/* Without CONFIG_MMU this is an empty stub. */
static inline void mm_populate(unsigned long addr, unsigned long len)
{
}
#endif /* CONFIG_MMU */
1487
1482/* These take the mm semaphore themselves */ 1488/* These take the mm semaphore themselves */
1483extern unsigned long vm_brk(unsigned long, unsigned long); 1489extern unsigned long vm_brk(unsigned long, unsigned long);
1484extern int vm_munmap(unsigned long, size_t); 1490extern int vm_munmap(unsigned long, size_t);
@@ -1623,8 +1629,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1623int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 1629int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1624 unsigned long pfn); 1630 unsigned long pfn);
1625 1631
1626struct page *follow_page(struct vm_area_struct *, unsigned long address, 1632struct page *follow_page_mask(struct vm_area_struct *vma,
1627 unsigned int foll_flags); 1633 unsigned long address, unsigned int foll_flags,
1634 unsigned int *page_mask);
1635
/*
 * follow_page - wrapper around follow_page_mask() for callers that do not
 * want the page mask.  The value written through the mask pointer goes into
 * a throwaway local and is never read.
 */
static inline struct page *follow_page(struct vm_area_struct *vma,
		unsigned long address, unsigned int foll_flags)
{
	unsigned int discarded_mask;

	return follow_page_mask(vma, address, foll_flags, &discarded_mask);
}
1642
1628#define FOLL_WRITE 0x01 /* check pte is writable */ 1643#define FOLL_WRITE 0x01 /* check pte is writable */
1629#define FOLL_TOUCH 0x02 /* mark page accessed */ 1644#define FOLL_TOUCH 0x02 /* mark page accessed */
1630#define FOLL_GET 0x04 /* do get_page on page */ 1645#define FOLL_GET 0x04 /* do get_page on page */
@@ -1636,6 +1651,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
1636#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ 1651#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
1637#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ 1652#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
1638#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ 1653#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
1654#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
1639 1655
1640typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, 1656typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
1641 void *data); 1657 void *data);
@@ -1707,7 +1723,11 @@ int vmemmap_populate_basepages(struct page *start_page,
1707 unsigned long pages, int node); 1723 unsigned long pages, int node);
1708int vmemmap_populate(struct page *start_page, unsigned long pages, int node); 1724int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
1709void vmemmap_populate_print_last(void); 1725void vmemmap_populate_print_last(void);
1710 1726#ifdef CONFIG_MEMORY_HOTPLUG
1727void vmemmap_free(struct page *memmap, unsigned long nr_pages);
1728#endif
1729void register_page_bootmem_memmap(unsigned long section_nr, struct page *map,
1730 unsigned long size);
1711 1731
1712enum mf_flags { 1732enum mf_flags {
1713 MF_COUNT_INCREASED = 1 << 0, 1733 MF_COUNT_INCREASED = 1 << 0,
@@ -1720,7 +1740,7 @@ extern int unpoison_memory(unsigned long pfn);
1720extern int sysctl_memory_failure_early_kill; 1740extern int sysctl_memory_failure_early_kill;
1721extern int sysctl_memory_failure_recovery; 1741extern int sysctl_memory_failure_recovery;
1722extern void shake_page(struct page *p, int access); 1742extern void shake_page(struct page *p, int access);
1723extern atomic_long_t mce_bad_pages; 1743extern atomic_long_t num_poisoned_pages;
1724extern int soft_offline_page(struct page *page, int flags); 1744extern int soft_offline_page(struct page *page, int flags);
1725 1745
1726extern void dump_page(struct page *page); 1746extern void dump_page(struct page *page);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f8f5162a3571..ace9a5f01c64 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,7 @@
12#include <linux/cpumask.h> 12#include <linux/cpumask.h>
13#include <linux/page-debug-flags.h> 13#include <linux/page-debug-flags.h>
14#include <linux/uprobes.h> 14#include <linux/uprobes.h>
15#include <linux/page-flags-layout.h>
15#include <asm/page.h> 16#include <asm/page.h>
16#include <asm/mmu.h> 17#include <asm/mmu.h>
17 18
@@ -173,7 +174,7 @@ struct page {
173 void *shadow; 174 void *shadow;
174#endif 175#endif
175 176
176#ifdef CONFIG_NUMA_BALANCING 177#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
177 int _last_nid; 178 int _last_nid;
178#endif 179#endif
179} 180}
@@ -414,9 +415,9 @@ struct mm_struct {
414#endif 415#endif
415#ifdef CONFIG_NUMA_BALANCING 416#ifdef CONFIG_NUMA_BALANCING
416 /* 417 /*
417 * numa_next_scan is the next time when the PTEs will me marked 418 * numa_next_scan is the next time that the PTEs will be marked
418 * pte_numa to gather statistics and migrate pages to new nodes 419 * pte_numa. NUMA hinting faults will gather statistics and migrate
419 * if necessary 420 * pages to new nodes if necessary.
420 */ 421 */
421 unsigned long numa_next_scan; 422 unsigned long numa_next_scan;
422 423
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 9aa863da287f..61c7a87e5d2b 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -79,6 +79,8 @@ calc_vm_flag_bits(unsigned long flags)
79{ 79{
80 return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | 80 return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
81 _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | 81 _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
82 _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); 82 ((flags & MAP_LOCKED) ? (VM_LOCKED | VM_POPULATE) : 0) |
83 (((flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE) ?
84 VM_POPULATE : 0);
83} 85}
84#endif /* _LINUX_MMAN_H */ 86#endif /* _LINUX_MMAN_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 73b64a38b984..ede274957e05 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -15,7 +15,7 @@
15#include <linux/seqlock.h> 15#include <linux/seqlock.h>
16#include <linux/nodemask.h> 16#include <linux/nodemask.h>
17#include <linux/pageblock-flags.h> 17#include <linux/pageblock-flags.h>
18#include <generated/bounds.h> 18#include <linux/page-flags-layout.h>
19#include <linux/atomic.h> 19#include <linux/atomic.h>
20#include <asm/page.h> 20#include <asm/page.h>
21 21
@@ -57,7 +57,9 @@ enum {
57 */ 57 */
58 MIGRATE_CMA, 58 MIGRATE_CMA,
59#endif 59#endif
60#ifdef CONFIG_MEMORY_ISOLATION
60 MIGRATE_ISOLATE, /* can't allocate from here */ 61 MIGRATE_ISOLATE, /* can't allocate from here */
62#endif
61 MIGRATE_TYPES 63 MIGRATE_TYPES
62}; 64};
63 65
@@ -308,24 +310,6 @@ enum zone_type {
308 310
309#ifndef __GENERATING_BOUNDS_H 311#ifndef __GENERATING_BOUNDS_H
310 312
311/*
312 * When a memory allocation must conform to specific limitations (such
313 * as being suitable for DMA) the caller will pass in hints to the
314 * allocator in the gfp_mask, in the zone modifier bits. These bits
315 * are used to select a priority ordered list of memory zones which
316 * match the requested limits. See gfp_zone() in include/linux/gfp.h
317 */
318
319#if MAX_NR_ZONES < 2
320#define ZONES_SHIFT 0
321#elif MAX_NR_ZONES <= 2
322#define ZONES_SHIFT 1
323#elif MAX_NR_ZONES <= 4
324#define ZONES_SHIFT 2
325#else
326#error ZONES_SHIFT -- too many zones configured adjust calculation
327#endif
328
329struct zone { 313struct zone {
330 /* Fields commonly accessed by the page allocator */ 314 /* Fields commonly accessed by the page allocator */
331 315
@@ -543,6 +527,26 @@ static inline int zone_is_oom_locked(const struct zone *zone)
543 return test_bit(ZONE_OOM_LOCKED, &zone->flags); 527 return test_bit(ZONE_OOM_LOCKED, &zone->flags);
544} 528}
545 529
530static inline unsigned zone_end_pfn(const struct zone *zone)
531{
532 return zone->zone_start_pfn + zone->spanned_pages;
533}
534
535static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
536{
537 return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
538}
539
540static inline bool zone_is_initialized(struct zone *zone)
541{
542 return !!zone->wait_table;
543}
544
545static inline bool zone_is_empty(struct zone *zone)
546{
547 return zone->spanned_pages == 0;
548}
549
546/* 550/*
547 * The "priority" of VM scanning is how much of the queues we will scan in one 551 * The "priority" of VM scanning is how much of the queues we will scan in one
548 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the 552 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
@@ -752,11 +756,17 @@ typedef struct pglist_data {
752#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) 756#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr))
753 757
754#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) 758#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
759#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
755 760
756#define node_end_pfn(nid) ({\ 761static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
757 pg_data_t *__pgdat = NODE_DATA(nid);\ 762{
758 __pgdat->node_start_pfn + __pgdat->node_spanned_pages;\ 763 return pgdat->node_start_pfn + pgdat->node_spanned_pages;
759}) 764}
765
766static inline bool pgdat_is_empty(pg_data_t *pgdat)
767{
768 return !pgdat->node_start_pfn && !pgdat->node_spanned_pages;
769}
760 770
761#include <linux/memory_hotplug.h> 771#include <linux/memory_hotplug.h>
762 772
@@ -1053,8 +1063,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
1053 * PA_SECTION_SHIFT physical address to/from section number 1063 * PA_SECTION_SHIFT physical address to/from section number
1054 * PFN_SECTION_SHIFT pfn to/from section number 1064 * PFN_SECTION_SHIFT pfn to/from section number
1055 */ 1065 */
1056#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
1057
1058#define PA_SECTION_SHIFT (SECTION_SIZE_BITS) 1066#define PA_SECTION_SHIFT (SECTION_SIZE_BITS)
1059#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) 1067#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT)
1060 1068
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
new file mode 100644
index 000000000000..93506a114034
--- /dev/null
+++ b/include/linux/page-flags-layout.h
@@ -0,0 +1,88 @@
1#ifndef PAGE_FLAGS_LAYOUT_H
2#define PAGE_FLAGS_LAYOUT_H
3
4#include <linux/numa.h>
5#include <generated/bounds.h>
6
7/*
8 * When a memory allocation must conform to specific limitations (such
9 * as being suitable for DMA) the caller will pass in hints to the
10 * allocator in the gfp_mask, in the zone modifier bits. These bits
11 * are used to select a priority ordered list of memory zones which
12 * match the requested limits. See gfp_zone() in include/linux/gfp.h
13 */
14#if MAX_NR_ZONES < 2
15#define ZONES_SHIFT 0
16#elif MAX_NR_ZONES <= 2
17#define ZONES_SHIFT 1
18#elif MAX_NR_ZONES <= 4
19#define ZONES_SHIFT 2
20#else
21#error ZONES_SHIFT -- too many zones configured adjust calculation
22#endif
23
24#ifdef CONFIG_SPARSEMEM
25#include <asm/sparsemem.h>
26
27/* SECTIONS_SHIFT: number of bits required to store a section number */
28#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
29
30#endif /* CONFIG_SPARSEMEM */
31
32/*
33 * page->flags layout:
34 *
35 * There are five possibilities for how page->flags get laid out. The first
36 * pair is for the normal case without sparsemem. The second pair is for
37 * sparsemem when there is plenty of space for node and section information.
38 * The last is when there is insufficient space in page->flags and a separate
39 * lookup is necessary.
40 *
41 * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS |
42 * " plus space for last_nid: | NODE | ZONE | LAST_NID ... | FLAGS |
43 * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS |
44 * " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS |
45 * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS |
46 */
47#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
48#define SECTIONS_WIDTH SECTIONS_SHIFT
49#else
50#define SECTIONS_WIDTH 0
51#endif
52
53#define ZONES_WIDTH ZONES_SHIFT
54
55#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
56#define NODES_WIDTH NODES_SHIFT
57#else
58#ifdef CONFIG_SPARSEMEM_VMEMMAP
59#error "Vmemmap: No space for nodes field in page flags"
60#endif
61#define NODES_WIDTH 0
62#endif
63
64#ifdef CONFIG_NUMA_BALANCING
65#define LAST_NID_SHIFT NODES_SHIFT
66#else
67#define LAST_NID_SHIFT 0
68#endif
69
70#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
71#define LAST_NID_WIDTH LAST_NID_SHIFT
72#else
73#define LAST_NID_WIDTH 0
74#endif
75
76/*
77 * We are going to use the flags for the page to node mapping if it is in
78 * there. This includes the case where there is no node, so it is implicit.
79 */
80#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
81#define NODE_NOT_IN_PAGE_FLAGS
82#endif
83
84#if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0
85#define LAST_NID_NOT_IN_PAGE_FLAGS
86#endif
87
88#endif /* PAGE_FLAGS_LAYOUT_H */
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index a92061e08d48..3fff8e774067 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -1,6 +1,25 @@
1#ifndef __LINUX_PAGEISOLATION_H 1#ifndef __LINUX_PAGEISOLATION_H
2#define __LINUX_PAGEISOLATION_H 2#define __LINUX_PAGEISOLATION_H
3 3
#ifdef CONFIG_MEMORY_ISOLATION
/* True when @migratetype is MIGRATE_ISOLATE. */
static inline bool is_migrate_isolate(int migratetype)
{
	return migratetype == MIGRATE_ISOLATE;
}

/* True when the pageblock migratetype of @page is MIGRATE_ISOLATE. */
static inline bool is_migrate_isolate_page(struct page *page)
{
	return get_pageblock_migratetype(page) == MIGRATE_ISOLATE;
}
#else /* !CONFIG_MEMORY_ISOLATION */
/* Without CONFIG_MEMORY_ISOLATION both helpers always report false. */
static inline bool is_migrate_isolate(int migratetype)
{
	return false;
}

static inline bool is_migrate_isolate_page(struct page *page)
{
	return false;
}
#endif /* CONFIG_MEMORY_ISOLATION */
4 23
5bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 24bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6 bool skip_hwpoisoned_pages); 25 bool skip_hwpoisoned_pages);
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 97bcf23e045a..e5d7230332a4 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -537,6 +537,7 @@ struct dev_pm_info {
537 unsigned int irq_safe:1; 537 unsigned int irq_safe:1;
538 unsigned int use_autosuspend:1; 538 unsigned int use_autosuspend:1;
539 unsigned int timer_autosuspends:1; 539 unsigned int timer_autosuspends:1;
540 unsigned int memalloc_noio:1;
540 enum rpm_request request; 541 enum rpm_request request;
541 enum rpm_status runtime_status; 542 enum rpm_status runtime_status;
542 int runtime_error; 543 int runtime_error;
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index c785c215abfc..7d7e09efff9b 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -47,6 +47,7 @@ extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);
47extern unsigned long pm_runtime_autosuspend_expiration(struct device *dev); 47extern unsigned long pm_runtime_autosuspend_expiration(struct device *dev);
48extern void pm_runtime_update_max_time_suspended(struct device *dev, 48extern void pm_runtime_update_max_time_suspended(struct device *dev,
49 s64 delta_ns); 49 s64 delta_ns);
50extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable);
50 51
51static inline bool pm_children_suspended(struct device *dev) 52static inline bool pm_children_suspended(struct device *dev)
52{ 53{
@@ -156,6 +157,8 @@ static inline void pm_runtime_set_autosuspend_delay(struct device *dev,
156 int delay) {} 157 int delay) {}
157static inline unsigned long pm_runtime_autosuspend_expiration( 158static inline unsigned long pm_runtime_autosuspend_expiration(
158 struct device *dev) { return 0; } 159 struct device *dev) { return 0; }
160static inline void pm_runtime_set_memalloc_noio(struct device *dev,
161 bool enable){}
159 162
160#endif /* !CONFIG_PM_RUNTIME */ 163#endif /* !CONFIG_PM_RUNTIME */
161 164
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c20635c527a9..6dacb93a6d94 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -123,7 +123,7 @@ static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
123 down_write(&anon_vma->root->rwsem); 123 down_write(&anon_vma->root->rwsem);
124} 124}
125 125
126static inline void anon_vma_unlock(struct anon_vma *anon_vma) 126static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
127{ 127{
128 up_write(&anon_vma->root->rwsem); 128 up_write(&anon_vma->root->rwsem);
129} 129}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e4112aad2964..c2182b53dace 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -51,6 +51,7 @@ struct sched_param {
51#include <linux/cred.h> 51#include <linux/cred.h>
52#include <linux/llist.h> 52#include <linux/llist.h>
53#include <linux/uidgid.h> 53#include <linux/uidgid.h>
54#include <linux/gfp.h>
54 55
55#include <asm/processor.h> 56#include <asm/processor.h>
56 57
@@ -1791,6 +1792,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
1791#define PF_FROZEN 0x00010000 /* frozen for system suspend */ 1792#define PF_FROZEN 0x00010000 /* frozen for system suspend */
1792#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ 1793#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
1793#define PF_KSWAPD 0x00040000 /* I am kswapd */ 1794#define PF_KSWAPD 0x00040000 /* I am kswapd */
1795#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */
1794#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ 1796#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
1795#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 1797#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
1796#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ 1798#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
@@ -1828,6 +1830,26 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
1828#define tsk_used_math(p) ((p)->flags & PF_USED_MATH) 1830#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
1829#define used_math() tsk_used_math(current) 1831#define used_math() tsk_used_math(current)
1830 1832
1833/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */
1834static inline gfp_t memalloc_noio_flags(gfp_t flags)
1835{
1836 if (unlikely(current->flags & PF_MEMALLOC_NOIO))
1837 flags &= ~__GFP_IO;
1838 return flags;
1839}
1840
1841static inline unsigned int memalloc_noio_save(void)
1842{
1843 unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
1844 current->flags |= PF_MEMALLOC_NOIO;
1845 return flags;
1846}
1847
1848static inline void memalloc_noio_restore(unsigned int flags)
1849{
1850 current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
1851}
1852
1831/* 1853/*
1832 * task->jobctl flags 1854 * task->jobctl flags
1833 */ 1855 */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 68df9c17fbbb..2818a123f3ea 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -8,7 +8,7 @@
8#include <linux/memcontrol.h> 8#include <linux/memcontrol.h>
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/node.h> 10#include <linux/node.h>
11 11#include <linux/fs.h>
12#include <linux/atomic.h> 12#include <linux/atomic.h>
13#include <asm/page.h> 13#include <asm/page.h>
14 14
@@ -156,7 +156,7 @@ enum {
156 SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ 156 SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
157}; 157};
158 158
159#define SWAP_CLUSTER_MAX 32 159#define SWAP_CLUSTER_MAX 32UL
160#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX 160#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
161 161
162/* 162/*
@@ -202,6 +202,18 @@ struct swap_info_struct {
202 unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ 202 unsigned long *frontswap_map; /* frontswap in-use, one bit per page */
203 atomic_t frontswap_pages; /* frontswap pages in-use counter */ 203 atomic_t frontswap_pages; /* frontswap pages in-use counter */
204#endif 204#endif
205 spinlock_t lock; /*
206 * protect map scan related fields like
207 * swap_map, lowest_bit, highest_bit,
208 * inuse_pages, cluster_next,
209 * cluster_nr, lowest_alloc and
210 * highest_alloc. other fields are only
211 * changed at swapon/swapoff, so are
212 * protected by swap_lock. changing
213 * flags need hold this lock and
214 * swap_lock. If both locks need hold,
215 * hold swap_lock first.
216 */
205}; 217};
206 218
207struct swap_list_t { 219struct swap_list_t {
@@ -209,15 +221,12 @@ struct swap_list_t {
209 int next; /* swapfile to be used next */ 221 int next; /* swapfile to be used next */
210}; 222};
211 223
212/* Swap 50% full? Release swapcache more aggressively.. */
213#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
214
215/* linux/mm/page_alloc.c */ 224/* linux/mm/page_alloc.c */
216extern unsigned long totalram_pages; 225extern unsigned long totalram_pages;
217extern unsigned long totalreserve_pages; 226extern unsigned long totalreserve_pages;
218extern unsigned long dirty_balance_reserve; 227extern unsigned long dirty_balance_reserve;
219extern unsigned int nr_free_buffer_pages(void); 228extern unsigned long nr_free_buffer_pages(void);
220extern unsigned int nr_free_pagecache_pages(void); 229extern unsigned long nr_free_pagecache_pages(void);
221 230
222/* Definition of global_page_state not available yet */ 231/* Definition of global_page_state not available yet */
223#define nr_free_pages() global_page_state(NR_FREE_PAGES) 232#define nr_free_pages() global_page_state(NR_FREE_PAGES)
@@ -266,7 +275,7 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
266extern unsigned long shrink_all_memory(unsigned long nr_pages); 275extern unsigned long shrink_all_memory(unsigned long nr_pages);
267extern int vm_swappiness; 276extern int vm_swappiness;
268extern int remove_mapping(struct address_space *mapping, struct page *page); 277extern int remove_mapping(struct address_space *mapping, struct page *page);
269extern long vm_total_pages; 278extern unsigned long vm_total_pages;
270 279
271#ifdef CONFIG_NUMA 280#ifdef CONFIG_NUMA
272extern int zone_reclaim_mode; 281extern int zone_reclaim_mode;
@@ -330,8 +339,9 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *,
330 sector_t *); 339 sector_t *);
331 340
332/* linux/mm/swap_state.c */ 341/* linux/mm/swap_state.c */
333extern struct address_space swapper_space; 342extern struct address_space swapper_spaces[];
334#define total_swapcache_pages swapper_space.nrpages 343#define swap_address_space(entry) (&swapper_spaces[swp_type(entry)])
344extern unsigned long total_swapcache_pages(void);
335extern void show_swap_cache_info(void); 345extern void show_swap_cache_info(void);
336extern int add_to_swap(struct page *); 346extern int add_to_swap(struct page *);
337extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); 347extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
@@ -346,8 +356,20 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t,
346 struct vm_area_struct *vma, unsigned long addr); 356 struct vm_area_struct *vma, unsigned long addr);
347 357
348/* linux/mm/swapfile.c */ 358/* linux/mm/swapfile.c */
349extern long nr_swap_pages; 359extern atomic_long_t nr_swap_pages;
350extern long total_swap_pages; 360extern long total_swap_pages;
361
362/* Swap 50% full? Release swapcache more aggressively.. */
363static inline bool vm_swap_full(void)
364{
365 return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages;
366}
367
368static inline long get_nr_swap_pages(void)
369{
370 return atomic_long_read(&nr_swap_pages);
371}
372
351extern void si_swapinfo(struct sysinfo *); 373extern void si_swapinfo(struct sysinfo *);
352extern swp_entry_t get_swap_page(void); 374extern swp_entry_t get_swap_page(void);
353extern swp_entry_t get_swap_page_of_type(int); 375extern swp_entry_t get_swap_page_of_type(int);
@@ -380,9 +402,10 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
380 402
381#else /* CONFIG_SWAP */ 403#else /* CONFIG_SWAP */
382 404
383#define nr_swap_pages 0L 405#define get_nr_swap_pages() 0L
384#define total_swap_pages 0L 406#define total_swap_pages 0L
385#define total_swapcache_pages 0UL 407#define total_swapcache_pages() 0UL
408#define vm_swap_full() 0
386 409
387#define si_swapinfo(val) \ 410#define si_swapinfo(val) \
388 do { (val)->freeswap = (val)->totalswap = 0; } while (0) 411 do { (val)->freeswap = (val)->totalswap = 0; } while (0)
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index fce0a2799d43..bd6cf61142be 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -36,7 +36,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
36#endif 36#endif
37 PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, 37 PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL,
38 KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, 38 KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
39 KSWAPD_SKIP_CONGESTION_WAIT,
40 PAGEOUTRUN, ALLOCSTALL, PGROTATED, 39 PAGEOUTRUN, ALLOCSTALL, PGROTATED,
41#ifdef CONFIG_NUMA_BALANCING 40#ifdef CONFIG_NUMA_BALANCING
42 NUMA_PTE_UPDATES, 41 NUMA_PTE_UPDATES,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index a13291f7da88..5fd71a7d0dfd 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -85,7 +85,7 @@ static inline void vm_events_fold_cpu(int cpu)
85#define count_vm_numa_events(x, y) count_vm_events(x, y) 85#define count_vm_numa_events(x, y) count_vm_events(x, y)
86#else 86#else
87#define count_vm_numa_event(x) do {} while (0) 87#define count_vm_numa_event(x) do {} while (0)
88#define count_vm_numa_events(x, y) do {} while (0) 88#define count_vm_numa_events(x, y) do { (void)(y); } while (0)
89#endif /* CONFIG_NUMA_BALANCING */ 89#endif /* CONFIG_NUMA_BALANCING */
90 90
91#define __count_zone_vm_events(item, zone, delta) \ 91#define __count_zone_vm_events(item, zone, delta) \
diff --git a/ipc/shm.c b/ipc/shm.c
index 4fa6d8fee730..be3ec9ae454e 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -967,11 +967,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
967 unsigned long flags; 967 unsigned long flags;
968 unsigned long prot; 968 unsigned long prot;
969 int acc_mode; 969 int acc_mode;
970 unsigned long user_addr;
971 struct ipc_namespace *ns; 970 struct ipc_namespace *ns;
972 struct shm_file_data *sfd; 971 struct shm_file_data *sfd;
973 struct path path; 972 struct path path;
974 fmode_t f_mode; 973 fmode_t f_mode;
974 unsigned long populate = 0;
975 975
976 err = -EINVAL; 976 err = -EINVAL;
977 if (shmid < 0) 977 if (shmid < 0)
@@ -1070,13 +1070,15 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
1070 goto invalid; 1070 goto invalid;
1071 } 1071 }
1072 1072
1073 user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0); 1073 addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
1074 *raddr = user_addr; 1074 *raddr = addr;
1075 err = 0; 1075 err = 0;
1076 if (IS_ERR_VALUE(user_addr)) 1076 if (IS_ERR_VALUE(addr))
1077 err = (long)user_addr; 1077 err = (long)addr;
1078invalid: 1078invalid:
1079 up_write(&current->mm->mmap_sem); 1079 up_write(&current->mm->mmap_sem);
1080 if (populate)
1081 mm_populate(addr, populate);
1080 1082
1081out_fput: 1083out_fput:
1082 fput(file); 1084 fput(file);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3a673a3b0c6b..053dfd7692d1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process);
1132 */ 1132 */
1133static int select_fallback_rq(int cpu, struct task_struct *p) 1133static int select_fallback_rq(int cpu, struct task_struct *p)
1134{ 1134{
1135 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 1135 int nid = cpu_to_node(cpu);
1136 const struct cpumask *nodemask = NULL;
1136 enum { cpuset, possible, fail } state = cpuset; 1137 enum { cpuset, possible, fail } state = cpuset;
1137 int dest_cpu; 1138 int dest_cpu;
1138 1139
1139 /* Look for allowed, online CPU in same node. */ 1140 /*
1140 for_each_cpu(dest_cpu, nodemask) { 1141 * If the node that the cpu is on has been offlined, cpu_to_node()
1141 if (!cpu_online(dest_cpu)) 1142 * will return -1. There is no cpu on the node, and we should
1142 continue; 1143 * select the cpu on the other node.
1143 if (!cpu_active(dest_cpu)) 1144 */
1144 continue; 1145 if (nid != -1) {
1145 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1146 nodemask = cpumask_of_node(nid);
1146 return dest_cpu; 1147
1148 /* Look for allowed, online CPU in same node. */
1149 for_each_cpu(dest_cpu, nodemask) {
1150 if (!cpu_online(dest_cpu))
1151 continue;
1152 if (!cpu_active(dest_cpu))
1153 continue;
1154 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1155 return dest_cpu;
1156 }
1147 } 1157 }
1148 1158
1149 for (;;) { 1159 for (;;) {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 467d8b923fcd..95e9e55602a8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -105,7 +105,6 @@ extern char core_pattern[];
105extern unsigned int core_pipe_limit; 105extern unsigned int core_pipe_limit;
106#endif 106#endif
107extern int pid_max; 107extern int pid_max;
108extern int min_free_kbytes;
109extern int pid_max_min, pid_max_max; 108extern int pid_max_min, pid_max_max;
110extern int sysctl_drop_caches; 109extern int sysctl_drop_caches;
111extern int percpu_pagelist_fraction; 110extern int percpu_pagelist_fraction;
diff --git a/mm/Kconfig b/mm/Kconfig
index 0b23db9a8791..2c7aea7106f9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -162,10 +162,16 @@ config MOVABLE_NODE
162 Say Y here if you want to hotplug a whole node. 162 Say Y here if you want to hotplug a whole node.
163 Say N here if you want kernel to use memory on all nodes evenly. 163 Say N here if you want kernel to use memory on all nodes evenly.
164 164
165#
166# Only be set on architectures that have completely implemented memory hotplug
167# feature. If you are not sure, don't touch it.
168#
169config HAVE_BOOTMEM_INFO_NODE
170 def_bool n
171
165# eventually, we can have this option just 'select SPARSEMEM' 172# eventually, we can have this option just 'select SPARSEMEM'
166config MEMORY_HOTPLUG 173config MEMORY_HOTPLUG
167 bool "Allow for memory hot-add" 174 bool "Allow for memory hot-add"
168 select MEMORY_ISOLATION
169 depends on SPARSEMEM || X86_64_ACPI_NUMA 175 depends on SPARSEMEM || X86_64_ACPI_NUMA
170 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG 176 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
171 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) 177 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -176,6 +182,8 @@ config MEMORY_HOTPLUG_SPARSE
176 182
177config MEMORY_HOTREMOVE 183config MEMORY_HOTREMOVE
178 bool "Allow for memory hot remove" 184 bool "Allow for memory hot remove"
185 select MEMORY_ISOLATION
186 select HAVE_BOOTMEM_INFO_NODE if X86_64
179 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE 187 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
180 depends on MIGRATION 188 depends on MIGRATION
181 189
diff --git a/mm/compaction.c b/mm/compaction.c
index c62bd063d766..05ccb4cc0bdb 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -15,6 +15,7 @@
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include <linux/balloon_compaction.h> 17#include <linux/balloon_compaction.h>
18#include <linux/page-isolation.h>
18#include "internal.h" 19#include "internal.h"
19 20
20#ifdef CONFIG_COMPACTION 21#ifdef CONFIG_COMPACTION
@@ -85,7 +86,7 @@ static inline bool isolation_suitable(struct compact_control *cc,
85static void __reset_isolation_suitable(struct zone *zone) 86static void __reset_isolation_suitable(struct zone *zone)
86{ 87{
87 unsigned long start_pfn = zone->zone_start_pfn; 88 unsigned long start_pfn = zone->zone_start_pfn;
88 unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; 89 unsigned long end_pfn = zone_end_pfn(zone);
89 unsigned long pfn; 90 unsigned long pfn;
90 91
91 zone->compact_cached_migrate_pfn = start_pfn; 92 zone->compact_cached_migrate_pfn = start_pfn;
@@ -215,7 +216,10 @@ static bool suitable_migration_target(struct page *page)
215 int migratetype = get_pageblock_migratetype(page); 216 int migratetype = get_pageblock_migratetype(page);
216 217
217 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ 218 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
218 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) 219 if (migratetype == MIGRATE_RESERVE)
220 return false;
221
222 if (is_migrate_isolate(migratetype))
219 return false; 223 return false;
220 224
221 /* If the page is a large free page, then allow migration */ 225 /* If the page is a large free page, then allow migration */
@@ -611,8 +615,7 @@ check_compact_cluster:
611 continue; 615 continue;
612 616
613next_pageblock: 617next_pageblock:
614 low_pfn += pageblock_nr_pages; 618 low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
615 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
616 last_pageblock_nr = pageblock_nr; 619 last_pageblock_nr = pageblock_nr;
617 } 620 }
618 621
@@ -644,7 +647,7 @@ static void isolate_freepages(struct zone *zone,
644 struct compact_control *cc) 647 struct compact_control *cc)
645{ 648{
646 struct page *page; 649 struct page *page;
647 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; 650 unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn;
648 int nr_freepages = cc->nr_freepages; 651 int nr_freepages = cc->nr_freepages;
649 struct list_head *freelist = &cc->freepages; 652 struct list_head *freelist = &cc->freepages;
650 653
@@ -663,7 +666,7 @@ static void isolate_freepages(struct zone *zone,
663 */ 666 */
664 high_pfn = min(low_pfn, pfn); 667 high_pfn = min(low_pfn, pfn);
665 668
666 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 669 z_end_pfn = zone_end_pfn(zone);
667 670
668 /* 671 /*
669 * Isolate free pages until enough are available to migrate the 672 * Isolate free pages until enough are available to migrate the
@@ -706,7 +709,7 @@ static void isolate_freepages(struct zone *zone,
706 * only scans within a pageblock 709 * only scans within a pageblock
707 */ 710 */
708 end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); 711 end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
709 end_pfn = min(end_pfn, zone_end_pfn); 712 end_pfn = min(end_pfn, z_end_pfn);
710 isolated = isolate_freepages_block(cc, pfn, end_pfn, 713 isolated = isolate_freepages_block(cc, pfn, end_pfn,
711 freelist, false); 714 freelist, false);
712 nr_freepages += isolated; 715 nr_freepages += isolated;
@@ -795,7 +798,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
795 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); 798 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
796 799
797 /* Only scan within a pageblock boundary */ 800 /* Only scan within a pageblock boundary */
798 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); 801 end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
799 802
800 /* Do not cross the free scanner or scan within a memory hole */ 803 /* Do not cross the free scanner or scan within a memory hole */
801 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { 804 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
@@ -920,7 +923,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
920{ 923{
921 int ret; 924 int ret;
922 unsigned long start_pfn = zone->zone_start_pfn; 925 unsigned long start_pfn = zone->zone_start_pfn;
923 unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; 926 unsigned long end_pfn = zone_end_pfn(zone);
924 927
925 ret = compaction_suitable(zone, cc->order); 928 ret = compaction_suitable(zone, cc->order);
926 switch (ret) { 929 switch (ret) {
@@ -977,7 +980,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
977 980
978 nr_migrate = cc->nr_migratepages; 981 nr_migrate = cc->nr_migratepages;
979 err = migrate_pages(&cc->migratepages, compaction_alloc, 982 err = migrate_pages(&cc->migratepages, compaction_alloc,
980 (unsigned long)cc, false, 983 (unsigned long)cc,
981 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, 984 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
982 MR_COMPACTION); 985 MR_COMPACTION);
983 update_nr_listpages(cc); 986 update_nr_listpages(cc);
@@ -1086,7 +1089,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1086 1089
1087 1090
1088/* Compact all zones within a node */ 1091/* Compact all zones within a node */
1089static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) 1092static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1090{ 1093{
1091 int zoneid; 1094 int zoneid;
1092 struct zone *zone; 1095 struct zone *zone;
@@ -1119,28 +1122,26 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1119 VM_BUG_ON(!list_empty(&cc->freepages)); 1122 VM_BUG_ON(!list_empty(&cc->freepages));
1120 VM_BUG_ON(!list_empty(&cc->migratepages)); 1123 VM_BUG_ON(!list_empty(&cc->migratepages));
1121 } 1124 }
1122
1123 return 0;
1124} 1125}
1125 1126
1126int compact_pgdat(pg_data_t *pgdat, int order) 1127void compact_pgdat(pg_data_t *pgdat, int order)
1127{ 1128{
1128 struct compact_control cc = { 1129 struct compact_control cc = {
1129 .order = order, 1130 .order = order,
1130 .sync = false, 1131 .sync = false,
1131 }; 1132 };
1132 1133
1133 return __compact_pgdat(pgdat, &cc); 1134 __compact_pgdat(pgdat, &cc);
1134} 1135}
1135 1136
1136static int compact_node(int nid) 1137static void compact_node(int nid)
1137{ 1138{
1138 struct compact_control cc = { 1139 struct compact_control cc = {
1139 .order = -1, 1140 .order = -1,
1140 .sync = true, 1141 .sync = true,
1141 }; 1142 };
1142 1143
1143 return __compact_pgdat(NODE_DATA(nid), &cc); 1144 __compact_pgdat(NODE_DATA(nid), &cc);
1144} 1145}
1145 1146
1146/* Compact all nodes in the system */ 1147/* Compact all nodes in the system */
diff --git a/mm/fadvise.c b/mm/fadvise.c
index a47f0f50c89f..909ec558625c 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -17,6 +17,7 @@
17#include <linux/fadvise.h> 17#include <linux/fadvise.h>
18#include <linux/writeback.h> 18#include <linux/writeback.h>
19#include <linux/syscalls.h> 19#include <linux/syscalls.h>
20#include <linux/swap.h>
20 21
21#include <asm/unistd.h> 22#include <asm/unistd.h>
22 23
@@ -120,9 +121,22 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
120 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; 121 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
121 end_index = (endbyte >> PAGE_CACHE_SHIFT); 122 end_index = (endbyte >> PAGE_CACHE_SHIFT);
122 123
123 if (end_index >= start_index) 124 if (end_index >= start_index) {
124 invalidate_mapping_pages(mapping, start_index, 125 unsigned long count = invalidate_mapping_pages(mapping,
126 start_index, end_index);
127
128 /*
129 * If fewer pages were invalidated than expected then
130 * it is possible that some of the pages were on
131 * a per-cpu pagevec for a remote CPU. Drain all
132 * pagevecs and try again.
133 */
134 if (count < (end_index - start_index + 1)) {
135 lru_add_drain_all();
136 invalidate_mapping_pages(mapping, start_index,
125 end_index); 137 end_index);
138 }
139 }
126 break; 140 break;
127 default: 141 default:
128 ret = -EINVAL; 142 ret = -EINVAL;
diff --git a/mm/fremap.c b/mm/fremap.c
index a0aaf0e56800..0cd4c11488ed 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -129,6 +129,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
129 struct vm_area_struct *vma; 129 struct vm_area_struct *vma;
130 int err = -EINVAL; 130 int err = -EINVAL;
131 int has_write_lock = 0; 131 int has_write_lock = 0;
132 vm_flags_t vm_flags;
132 133
133 if (prot) 134 if (prot)
134 return err; 135 return err;
@@ -160,15 +161,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
160 /* 161 /*
161 * Make sure the vma is shared, that it supports prefaulting, 162 * Make sure the vma is shared, that it supports prefaulting,
162 * and that the remapped range is valid and fully within 163 * and that the remapped range is valid and fully within
163 * the single existing vma. vm_private_data is used as a 164 * the single existing vma.
164 * swapout cursor in a VM_NONLINEAR vma.
165 */ 165 */
166 if (!vma || !(vma->vm_flags & VM_SHARED)) 166 if (!vma || !(vma->vm_flags & VM_SHARED))
167 goto out; 167 goto out;
168 168
169 if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
170 goto out;
171
172 if (!vma->vm_ops || !vma->vm_ops->remap_pages) 169 if (!vma->vm_ops || !vma->vm_ops->remap_pages)
173 goto out; 170 goto out;
174 171
@@ -177,6 +174,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
177 174
178 /* Must set VM_NONLINEAR before any pages are populated. */ 175 /* Must set VM_NONLINEAR before any pages are populated. */
179 if (!(vma->vm_flags & VM_NONLINEAR)) { 176 if (!(vma->vm_flags & VM_NONLINEAR)) {
177 /*
178 * vm_private_data is used as a swapout cursor
179 * in a VM_NONLINEAR vma.
180 */
181 if (vma->vm_private_data)
182 goto out;
183
180 /* Don't need a nonlinear mapping, exit success */ 184 /* Don't need a nonlinear mapping, exit success */
181 if (pgoff == linear_page_index(vma, start)) { 185 if (pgoff == linear_page_index(vma, start)) {
182 err = 0; 186 err = 0;
@@ -184,6 +188,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
184 } 188 }
185 189
186 if (!has_write_lock) { 190 if (!has_write_lock) {
191get_write_lock:
187 up_read(&mm->mmap_sem); 192 up_read(&mm->mmap_sem);
188 down_write(&mm->mmap_sem); 193 down_write(&mm->mmap_sem);
189 has_write_lock = 1; 194 has_write_lock = 1;
@@ -199,9 +204,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
199 unsigned long addr; 204 unsigned long addr;
200 struct file *file = get_file(vma->vm_file); 205 struct file *file = get_file(vma->vm_file);
201 206
202 flags &= MAP_NONBLOCK; 207 vm_flags = vma->vm_flags;
203 addr = mmap_region(file, start, size, 208 if (!(flags & MAP_NONBLOCK))
204 flags, vma->vm_flags, pgoff); 209 vm_flags |= VM_POPULATE;
210 addr = mmap_region(file, start, size, vm_flags, pgoff);
205 fput(file); 211 fput(file);
206 if (IS_ERR_VALUE(addr)) { 212 if (IS_ERR_VALUE(addr)) {
207 err = addr; 213 err = addr;
@@ -220,32 +226,26 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
220 mutex_unlock(&mapping->i_mmap_mutex); 226 mutex_unlock(&mapping->i_mmap_mutex);
221 } 227 }
222 228
229 if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) {
230 if (!has_write_lock)
231 goto get_write_lock;
232 vma->vm_flags |= VM_POPULATE;
233 }
234
223 if (vma->vm_flags & VM_LOCKED) { 235 if (vma->vm_flags & VM_LOCKED) {
224 /* 236 /*
225 * drop PG_Mlocked flag for over-mapped range 237 * drop PG_Mlocked flag for over-mapped range
226 */ 238 */
227 vm_flags_t saved_flags = vma->vm_flags; 239 if (!has_write_lock)
240 goto get_write_lock;
241 vm_flags = vma->vm_flags;
228 munlock_vma_pages_range(vma, start, start + size); 242 munlock_vma_pages_range(vma, start, start + size);
229 vma->vm_flags = saved_flags; 243 vma->vm_flags = vm_flags;
230 } 244 }
231 245
232 mmu_notifier_invalidate_range_start(mm, start, start + size); 246 mmu_notifier_invalidate_range_start(mm, start, start + size);
233 err = vma->vm_ops->remap_pages(vma, start, size, pgoff); 247 err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
234 mmu_notifier_invalidate_range_end(mm, start, start + size); 248 mmu_notifier_invalidate_range_end(mm, start, start + size);
235 if (!err && !(flags & MAP_NONBLOCK)) {
236 if (vma->vm_flags & VM_LOCKED) {
237 /*
238 * might be mapping previously unmapped range of file
239 */
240 mlock_vma_pages_range(vma, start, start + size);
241 } else {
242 if (unlikely(has_write_lock)) {
243 downgrade_write(&mm->mmap_sem);
244 has_write_lock = 0;
245 }
246 make_pages_present(start, start+size);
247 }
248 }
249 249
250 /* 250 /*
251 * We can't clear VM_NONLINEAR because we'd have to do 251 * We can't clear VM_NONLINEAR because we'd have to do
@@ -254,10 +254,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
254 */ 254 */
255 255
256out: 256out:
257 vm_flags = vma->vm_flags;
257 if (likely(!has_write_lock)) 258 if (likely(!has_write_lock))
258 up_read(&mm->mmap_sem); 259 up_read(&mm->mmap_sem);
259 else 260 else
260 up_write(&mm->mmap_sem); 261 up_write(&mm->mmap_sem);
262 if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
263 mm_populate(start, size);
261 264
262 return err; 265 return err;
263} 266}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b5783d81eda9..bfa142e67b1c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -20,6 +20,7 @@
20#include <linux/mman.h> 20#include <linux/mman.h>
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/migrate.h> 22#include <linux/migrate.h>
23#include <linux/hashtable.h>
23 24
24#include <asm/tlb.h> 25#include <asm/tlb.h>
25#include <asm/pgalloc.h> 26#include <asm/pgalloc.h>
@@ -62,12 +63,11 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
62static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; 63static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
63 64
64static int khugepaged(void *none); 65static int khugepaged(void *none);
65static int mm_slots_hash_init(void);
66static int khugepaged_slab_init(void); 66static int khugepaged_slab_init(void);
67static void khugepaged_slab_free(void);
68 67
69#define MM_SLOTS_HASH_HEADS 1024 68#define MM_SLOTS_HASH_BITS 10
70static struct hlist_head *mm_slots_hash __read_mostly; 69static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
70
71static struct kmem_cache *mm_slot_cache __read_mostly; 71static struct kmem_cache *mm_slot_cache __read_mostly;
72 72
73/** 73/**
@@ -105,7 +105,6 @@ static int set_recommended_min_free_kbytes(void)
105 struct zone *zone; 105 struct zone *zone;
106 int nr_zones = 0; 106 int nr_zones = 0;
107 unsigned long recommended_min; 107 unsigned long recommended_min;
108 extern int min_free_kbytes;
109 108
110 if (!khugepaged_enabled()) 109 if (!khugepaged_enabled())
111 return 0; 110 return 0;
@@ -634,12 +633,6 @@ static int __init hugepage_init(void)
634 if (err) 633 if (err)
635 goto out; 634 goto out;
636 635
637 err = mm_slots_hash_init();
638 if (err) {
639 khugepaged_slab_free();
640 goto out;
641 }
642
643 register_shrinker(&huge_zero_page_shrinker); 636 register_shrinker(&huge_zero_page_shrinker);
644 637
645 /* 638 /*
@@ -1302,7 +1295,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1302 int target_nid; 1295 int target_nid;
1303 int current_nid = -1; 1296 int current_nid = -1;
1304 bool migrated; 1297 bool migrated;
1305 bool page_locked = false;
1306 1298
1307 spin_lock(&mm->page_table_lock); 1299 spin_lock(&mm->page_table_lock);
1308 if (unlikely(!pmd_same(pmd, *pmdp))) 1300 if (unlikely(!pmd_same(pmd, *pmdp)))
@@ -1324,7 +1316,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1324 /* Acquire the page lock to serialise THP migrations */ 1316 /* Acquire the page lock to serialise THP migrations */
1325 spin_unlock(&mm->page_table_lock); 1317 spin_unlock(&mm->page_table_lock);
1326 lock_page(page); 1318 lock_page(page);
1327 page_locked = true;
1328 1319
1329 /* Confirm the PTE did not while locked */ 1320 /* Confirm the PTE did not while locked */
1330 spin_lock(&mm->page_table_lock); 1321 spin_lock(&mm->page_table_lock);
@@ -1337,34 +1328,26 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1337 1328
1338 /* Migrate the THP to the requested node */ 1329 /* Migrate the THP to the requested node */
1339 migrated = migrate_misplaced_transhuge_page(mm, vma, 1330 migrated = migrate_misplaced_transhuge_page(mm, vma,
1340 pmdp, pmd, addr, 1331 pmdp, pmd, addr, page, target_nid);
1341 page, target_nid); 1332 if (!migrated)
1342 if (migrated) 1333 goto check_same;
1343 current_nid = target_nid;
1344 else {
1345 spin_lock(&mm->page_table_lock);
1346 if (unlikely(!pmd_same(pmd, *pmdp))) {
1347 unlock_page(page);
1348 goto out_unlock;
1349 }
1350 goto clear_pmdnuma;
1351 }
1352 1334
1353 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); 1335 task_numa_fault(target_nid, HPAGE_PMD_NR, true);
1354 return 0; 1336 return 0;
1355 1337
1338check_same:
1339 spin_lock(&mm->page_table_lock);
1340 if (unlikely(!pmd_same(pmd, *pmdp)))
1341 goto out_unlock;
1356clear_pmdnuma: 1342clear_pmdnuma:
1357 pmd = pmd_mknonnuma(pmd); 1343 pmd = pmd_mknonnuma(pmd);
1358 set_pmd_at(mm, haddr, pmdp, pmd); 1344 set_pmd_at(mm, haddr, pmdp, pmd);
1359 VM_BUG_ON(pmd_numa(*pmdp)); 1345 VM_BUG_ON(pmd_numa(*pmdp));
1360 update_mmu_cache_pmd(vma, addr, pmdp); 1346 update_mmu_cache_pmd(vma, addr, pmdp);
1361 if (page_locked)
1362 unlock_page(page);
1363
1364out_unlock: 1347out_unlock:
1365 spin_unlock(&mm->page_table_lock); 1348 spin_unlock(&mm->page_table_lock);
1366 if (current_nid != -1) 1349 if (current_nid != -1)
1367 task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); 1350 task_numa_fault(current_nid, HPAGE_PMD_NR, false);
1368 return 0; 1351 return 0;
1369} 1352}
1370 1353
@@ -1656,7 +1639,7 @@ static void __split_huge_page_refcount(struct page *page)
1656 page_tail->mapping = page->mapping; 1639 page_tail->mapping = page->mapping;
1657 1640
1658 page_tail->index = page->index + i; 1641 page_tail->index = page->index + i;
1659 page_xchg_last_nid(page_tail, page_last_nid(page)); 1642 page_nid_xchg_last(page_tail, page_nid_last(page));
1660 1643
1661 BUG_ON(!PageAnon(page_tail)); 1644 BUG_ON(!PageAnon(page_tail));
1662 BUG_ON(!PageUptodate(page_tail)); 1645 BUG_ON(!PageUptodate(page_tail));
@@ -1846,7 +1829,7 @@ int split_huge_page(struct page *page)
1846 1829
1847 BUG_ON(PageCompound(page)); 1830 BUG_ON(PageCompound(page));
1848out_unlock: 1831out_unlock:
1849 anon_vma_unlock(anon_vma); 1832 anon_vma_unlock_write(anon_vma);
1850 put_anon_vma(anon_vma); 1833 put_anon_vma(anon_vma);
1851out: 1834out:
1852 return ret; 1835 return ret;
@@ -1908,12 +1891,6 @@ static int __init khugepaged_slab_init(void)
1908 return 0; 1891 return 0;
1909} 1892}
1910 1893
1911static void __init khugepaged_slab_free(void)
1912{
1913 kmem_cache_destroy(mm_slot_cache);
1914 mm_slot_cache = NULL;
1915}
1916
1917static inline struct mm_slot *alloc_mm_slot(void) 1894static inline struct mm_slot *alloc_mm_slot(void)
1918{ 1895{
1919 if (!mm_slot_cache) /* initialization failed */ 1896 if (!mm_slot_cache) /* initialization failed */
@@ -1926,47 +1903,23 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
1926 kmem_cache_free(mm_slot_cache, mm_slot); 1903 kmem_cache_free(mm_slot_cache, mm_slot);
1927} 1904}
1928 1905
1929static int __init mm_slots_hash_init(void)
1930{
1931 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
1932 GFP_KERNEL);
1933 if (!mm_slots_hash)
1934 return -ENOMEM;
1935 return 0;
1936}
1937
1938#if 0
1939static void __init mm_slots_hash_free(void)
1940{
1941 kfree(mm_slots_hash);
1942 mm_slots_hash = NULL;
1943}
1944#endif
1945
1946static struct mm_slot *get_mm_slot(struct mm_struct *mm) 1906static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1947{ 1907{
1948 struct mm_slot *mm_slot; 1908 struct mm_slot *mm_slot;
1949 struct hlist_head *bucket;
1950 struct hlist_node *node; 1909 struct hlist_node *node;
1951 1910
1952 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 1911 hash_for_each_possible(mm_slots_hash, mm_slot, node, hash, (unsigned long)mm)
1953 % MM_SLOTS_HASH_HEADS];
1954 hlist_for_each_entry(mm_slot, node, bucket, hash) {
1955 if (mm == mm_slot->mm) 1912 if (mm == mm_slot->mm)
1956 return mm_slot; 1913 return mm_slot;
1957 } 1914
1958 return NULL; 1915 return NULL;
1959} 1916}
1960 1917
1961static void insert_to_mm_slots_hash(struct mm_struct *mm, 1918static void insert_to_mm_slots_hash(struct mm_struct *mm,
1962 struct mm_slot *mm_slot) 1919 struct mm_slot *mm_slot)
1963{ 1920{
1964 struct hlist_head *bucket;
1965
1966 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1967 % MM_SLOTS_HASH_HEADS];
1968 mm_slot->mm = mm; 1921 mm_slot->mm = mm;
1969 hlist_add_head(&mm_slot->hash, bucket); 1922 hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
1970} 1923}
1971 1924
1972static inline int khugepaged_test_exit(struct mm_struct *mm) 1925static inline int khugepaged_test_exit(struct mm_struct *mm)
@@ -2035,7 +1988,7 @@ void __khugepaged_exit(struct mm_struct *mm)
2035 spin_lock(&khugepaged_mm_lock); 1988 spin_lock(&khugepaged_mm_lock);
2036 mm_slot = get_mm_slot(mm); 1989 mm_slot = get_mm_slot(mm);
2037 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { 1990 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
2038 hlist_del(&mm_slot->hash); 1991 hash_del(&mm_slot->hash);
2039 list_del(&mm_slot->mm_node); 1992 list_del(&mm_slot->mm_node);
2040 free = 1; 1993 free = 1;
2041 } 1994 }
@@ -2368,7 +2321,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2368 BUG_ON(!pmd_none(*pmd)); 2321 BUG_ON(!pmd_none(*pmd));
2369 set_pmd_at(mm, address, pmd, _pmd); 2322 set_pmd_at(mm, address, pmd, _pmd);
2370 spin_unlock(&mm->page_table_lock); 2323 spin_unlock(&mm->page_table_lock);
2371 anon_vma_unlock(vma->anon_vma); 2324 anon_vma_unlock_write(vma->anon_vma);
2372 goto out; 2325 goto out;
2373 } 2326 }
2374 2327
@@ -2376,7 +2329,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2376 * All pages are isolated and locked so anon_vma rmap 2329 * All pages are isolated and locked so anon_vma rmap
2377 * can't run anymore. 2330 * can't run anymore.
2378 */ 2331 */
2379 anon_vma_unlock(vma->anon_vma); 2332 anon_vma_unlock_write(vma->anon_vma);
2380 2333
2381 __collapse_huge_page_copy(pte, new_page, vma, address, ptl); 2334 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
2382 pte_unmap(pte); 2335 pte_unmap(pte);
@@ -2423,7 +2376,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2423 struct page *page; 2376 struct page *page;
2424 unsigned long _address; 2377 unsigned long _address;
2425 spinlock_t *ptl; 2378 spinlock_t *ptl;
2426 int node = -1; 2379 int node = NUMA_NO_NODE;
2427 2380
2428 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2381 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2429 2382
@@ -2453,7 +2406,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2453 * be more sophisticated and look at more pages, 2406 * be more sophisticated and look at more pages,
2454 * but isn't for now. 2407 * but isn't for now.
2455 */ 2408 */
2456 if (node == -1) 2409 if (node == NUMA_NO_NODE)
2457 node = page_to_nid(page); 2410 node = page_to_nid(page);
2458 VM_BUG_ON(PageCompound(page)); 2411 VM_BUG_ON(PageCompound(page));
2459 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2412 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
@@ -2484,7 +2437,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
2484 2437
2485 if (khugepaged_test_exit(mm)) { 2438 if (khugepaged_test_exit(mm)) {
2486 /* free mm_slot */ 2439 /* free mm_slot */
2487 hlist_del(&mm_slot->hash); 2440 hash_del(&mm_slot->hash);
2488 list_del(&mm_slot->mm_node); 2441 list_del(&mm_slot->mm_node);
2489 2442
2490 /* 2443 /*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 546db81820e4..cdb64e4d238a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1293,8 +1293,7 @@ static void __init report_hugepages(void)
1293 1293
1294 for_each_hstate(h) { 1294 for_each_hstate(h) {
1295 char buf[32]; 1295 char buf[32];
1296 printk(KERN_INFO "HugeTLB registered %s page size, " 1296 pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
1297 "pre-allocated %ld pages\n",
1298 memfmt(buf, huge_page_size(h)), 1297 memfmt(buf, huge_page_size(h)),
1299 h->free_huge_pages); 1298 h->free_huge_pages);
1300 } 1299 }
@@ -1702,8 +1701,7 @@ static void __init hugetlb_sysfs_init(void)
1702 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, 1701 err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
1703 hstate_kobjs, &hstate_attr_group); 1702 hstate_kobjs, &hstate_attr_group);
1704 if (err) 1703 if (err)
1705 printk(KERN_ERR "Hugetlb: Unable to add hstate %s", 1704 pr_err("Hugetlb: Unable to add hstate %s", h->name);
1706 h->name);
1707 } 1705 }
1708} 1706}
1709 1707
@@ -1826,9 +1824,8 @@ void hugetlb_register_node(struct node *node)
1826 nhs->hstate_kobjs, 1824 nhs->hstate_kobjs,
1827 &per_node_hstate_attr_group); 1825 &per_node_hstate_attr_group);
1828 if (err) { 1826 if (err) {
1829 printk(KERN_ERR "Hugetlb: Unable to add hstate %s" 1827 pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
1830 " for node %d\n", 1828 h->name, node->dev.id);
1831 h->name, node->dev.id);
1832 hugetlb_unregister_node(node); 1829 hugetlb_unregister_node(node);
1833 break; 1830 break;
1834 } 1831 }
@@ -1924,7 +1921,7 @@ void __init hugetlb_add_hstate(unsigned order)
1924 unsigned long i; 1921 unsigned long i;
1925 1922
1926 if (size_to_hstate(PAGE_SIZE << order)) { 1923 if (size_to_hstate(PAGE_SIZE << order)) {
1927 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); 1924 pr_warning("hugepagesz= specified twice, ignoring\n");
1928 return; 1925 return;
1929 } 1926 }
1930 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); 1927 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -1960,8 +1957,8 @@ static int __init hugetlb_nrpages_setup(char *s)
1960 mhp = &parsed_hstate->max_huge_pages; 1957 mhp = &parsed_hstate->max_huge_pages;
1961 1958
1962 if (mhp == last_mhp) { 1959 if (mhp == last_mhp) {
1963 printk(KERN_WARNING "hugepages= specified twice without " 1960 pr_warning("hugepages= specified twice without "
1964 "interleaving hugepagesz=, ignoring\n"); 1961 "interleaving hugepagesz=, ignoring\n");
1965 return 1; 1962 return 1;
1966 } 1963 }
1967 1964
@@ -2692,9 +2689,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2692 * COW. Warn that such a situation has occurred as it may not be obvious 2689 * COW. Warn that such a situation has occurred as it may not be obvious
2693 */ 2690 */
2694 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 2691 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
2695 printk(KERN_WARNING 2692 pr_warning("PID %d killed due to inadequate hugepage pool\n",
2696 "PID %d killed due to inadequate hugepage pool\n", 2693 current->pid);
2697 current->pid);
2698 return ret; 2694 return ret;
2699 } 2695 }
2700 2696
@@ -2924,14 +2920,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
2924 return NULL; 2920 return NULL;
2925} 2921}
2926 2922
2927int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2923long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2928 struct page **pages, struct vm_area_struct **vmas, 2924 struct page **pages, struct vm_area_struct **vmas,
2929 unsigned long *position, int *length, int i, 2925 unsigned long *position, unsigned long *nr_pages,
2930 unsigned int flags) 2926 long i, unsigned int flags)
2931{ 2927{
2932 unsigned long pfn_offset; 2928 unsigned long pfn_offset;
2933 unsigned long vaddr = *position; 2929 unsigned long vaddr = *position;
2934 int remainder = *length; 2930 unsigned long remainder = *nr_pages;
2935 struct hstate *h = hstate_vma(vma); 2931 struct hstate *h = hstate_vma(vma);
2936 2932
2937 spin_lock(&mm->page_table_lock); 2933 spin_lock(&mm->page_table_lock);
@@ -3001,7 +2997,7 @@ same_page:
3001 } 2997 }
3002 } 2998 }
3003 spin_unlock(&mm->page_table_lock); 2999 spin_unlock(&mm->page_table_lock);
3004 *length = remainder; 3000 *nr_pages = remainder;
3005 *position = vaddr; 3001 *position = vaddr;
3006 3002
3007 return i ? i : -EFAULT; 3003 return i ? i : -EFAULT;
diff --git a/mm/internal.h b/mm/internal.h
index 9ba21100ebf3..1c0c4cc0fcf7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -162,8 +162,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
162 struct vm_area_struct *prev, struct rb_node *rb_parent); 162 struct vm_area_struct *prev, struct rb_node *rb_parent);
163 163
164#ifdef CONFIG_MMU 164#ifdef CONFIG_MMU
165extern long mlock_vma_pages_range(struct vm_area_struct *vma, 165extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
166 unsigned long start, unsigned long end); 166 unsigned long start, unsigned long end, int *nonblocking);
167extern void munlock_vma_pages_range(struct vm_area_struct *vma, 167extern void munlock_vma_pages_range(struct vm_area_struct *vma,
168 unsigned long start, unsigned long end); 168 unsigned long start, unsigned long end);
169static inline void munlock_vma_pages_all(struct vm_area_struct *vma) 169static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 752a705c77c2..83dd5fbf5e60 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1300,9 +1300,8 @@ static void kmemleak_scan(void)
1300 */ 1300 */
1301 lock_memory_hotplug(); 1301 lock_memory_hotplug();
1302 for_each_online_node(i) { 1302 for_each_online_node(i) {
1303 pg_data_t *pgdat = NODE_DATA(i); 1303 unsigned long start_pfn = node_start_pfn(i);
1304 unsigned long start_pfn = pgdat->node_start_pfn; 1304 unsigned long end_pfn = node_end_pfn(i);
1305 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
1306 unsigned long pfn; 1305 unsigned long pfn;
1307 1306
1308 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1307 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
diff --git a/mm/ksm.c b/mm/ksm.c
index 51573858938d..ab2ba9ad3c59 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,13 +33,22 @@
33#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h> 36#include <linux/hashtable.h>
37#include <linux/freezer.h> 37#include <linux/freezer.h>
38#include <linux/oom.h> 38#include <linux/oom.h>
39#include <linux/numa.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include "internal.h" 42#include "internal.h"
42 43
44#ifdef CONFIG_NUMA
45#define NUMA(x) (x)
46#define DO_NUMA(x) do { (x); } while (0)
47#else
48#define NUMA(x) (0)
49#define DO_NUMA(x) do { } while (0)
50#endif
51
43/* 52/*
44 * A few notes about the KSM scanning process, 53 * A few notes about the KSM scanning process,
45 * to make it easier to understand the data structures below: 54 * to make it easier to understand the data structures below:
@@ -78,6 +87,9 @@
78 * take 10 attempts to find a page in the unstable tree, once it is found, 87 * take 10 attempts to find a page in the unstable tree, once it is found,
79 * it is secured in the stable tree. (When we scan a new page, we first 88 * it is secured in the stable tree. (When we scan a new page, we first
80 * compare it against the stable tree, and then against the unstable tree.) 89 * compare it against the stable tree, and then against the unstable tree.)
90 *
91 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
92 * stable trees and multiple unstable trees: one of each for each NUMA node.
81 */ 93 */
82 94
83/** 95/**
@@ -113,19 +125,32 @@ struct ksm_scan {
113/** 125/**
114 * struct stable_node - node of the stable rbtree 126 * struct stable_node - node of the stable rbtree
115 * @node: rb node of this ksm page in the stable tree 127 * @node: rb node of this ksm page in the stable tree
128 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
129 * @list: linked into migrate_nodes, pending placement in the proper node tree
116 * @hlist: hlist head of rmap_items using this ksm page 130 * @hlist: hlist head of rmap_items using this ksm page
117 * @kpfn: page frame number of this ksm page 131 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
132 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
118 */ 133 */
119struct stable_node { 134struct stable_node {
120 struct rb_node node; 135 union {
136 struct rb_node node; /* when node of stable tree */
137 struct { /* when listed for migration */
138 struct list_head *head;
139 struct list_head list;
140 };
141 };
121 struct hlist_head hlist; 142 struct hlist_head hlist;
122 unsigned long kpfn; 143 unsigned long kpfn;
144#ifdef CONFIG_NUMA
145 int nid;
146#endif
123}; 147};
124 148
125/** 149/**
126 * struct rmap_item - reverse mapping item for virtual addresses 150 * struct rmap_item - reverse mapping item for virtual addresses
127 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list 151 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
128 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree 152 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
153 * @nid: NUMA node id of unstable tree in which linked (may not match page)
129 * @mm: the memory structure this rmap_item is pointing into 154 * @mm: the memory structure this rmap_item is pointing into
130 * @address: the virtual address this rmap_item tracks (+ flags in low bits) 155 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
131 * @oldchecksum: previous checksum of the page at that virtual address 156 * @oldchecksum: previous checksum of the page at that virtual address
@@ -135,7 +160,12 @@ struct stable_node {
135 */ 160 */
136struct rmap_item { 161struct rmap_item {
137 struct rmap_item *rmap_list; 162 struct rmap_item *rmap_list;
138 struct anon_vma *anon_vma; /* when stable */ 163 union {
164 struct anon_vma *anon_vma; /* when stable */
165#ifdef CONFIG_NUMA
166 int nid; /* when node of unstable tree */
167#endif
168 };
139 struct mm_struct *mm; 169 struct mm_struct *mm;
140 unsigned long address; /* + low bits used for flags below */ 170 unsigned long address; /* + low bits used for flags below */
141 unsigned int oldchecksum; /* when unstable */ 171 unsigned int oldchecksum; /* when unstable */
@@ -153,12 +183,16 @@ struct rmap_item {
153#define STABLE_FLAG 0x200 /* is listed from the stable tree */ 183#define STABLE_FLAG 0x200 /* is listed from the stable tree */
154 184
155/* The stable and unstable tree heads */ 185/* The stable and unstable tree heads */
156static struct rb_root root_stable_tree = RB_ROOT; 186static struct rb_root one_stable_tree[1] = { RB_ROOT };
157static struct rb_root root_unstable_tree = RB_ROOT; 187static struct rb_root one_unstable_tree[1] = { RB_ROOT };
188static struct rb_root *root_stable_tree = one_stable_tree;
189static struct rb_root *root_unstable_tree = one_unstable_tree;
158 190
159#define MM_SLOTS_HASH_SHIFT 10 191/* Recently migrated nodes of stable tree, pending proper placement */
160#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) 192static LIST_HEAD(migrate_nodes);
161static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; 193
194#define MM_SLOTS_HASH_BITS 10
195static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
162 196
163static struct mm_slot ksm_mm_head = { 197static struct mm_slot ksm_mm_head = {
164 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), 198 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -189,10 +223,21 @@ static unsigned int ksm_thread_pages_to_scan = 100;
189/* Milliseconds ksmd should sleep between batches */ 223/* Milliseconds ksmd should sleep between batches */
190static unsigned int ksm_thread_sleep_millisecs = 20; 224static unsigned int ksm_thread_sleep_millisecs = 20;
191 225
226#ifdef CONFIG_NUMA
227/* Zeroed when merging across nodes is not allowed */
228static unsigned int ksm_merge_across_nodes = 1;
229static int ksm_nr_node_ids = 1;
230#else
231#define ksm_merge_across_nodes 1U
232#define ksm_nr_node_ids 1
233#endif
234
192#define KSM_RUN_STOP 0 235#define KSM_RUN_STOP 0
193#define KSM_RUN_MERGE 1 236#define KSM_RUN_MERGE 1
194#define KSM_RUN_UNMERGE 2 237#define KSM_RUN_UNMERGE 2
195static unsigned int ksm_run = KSM_RUN_STOP; 238#define KSM_RUN_OFFLINE 4
239static unsigned long ksm_run = KSM_RUN_STOP;
240static void wait_while_offlining(void);
196 241
197static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); 242static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
198static DEFINE_MUTEX(ksm_thread_mutex); 243static DEFINE_MUTEX(ksm_thread_mutex);
@@ -275,31 +320,21 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
275 320
276static struct mm_slot *get_mm_slot(struct mm_struct *mm) 321static struct mm_slot *get_mm_slot(struct mm_struct *mm)
277{ 322{
278 struct mm_slot *mm_slot;
279 struct hlist_head *bucket;
280 struct hlist_node *node; 323 struct hlist_node *node;
324 struct mm_slot *slot;
325
326 hash_for_each_possible(mm_slots_hash, slot, node, link, (unsigned long)mm)
327 if (slot->mm == mm)
328 return slot;
281 329
282 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
283 hlist_for_each_entry(mm_slot, node, bucket, link) {
284 if (mm == mm_slot->mm)
285 return mm_slot;
286 }
287 return NULL; 330 return NULL;
288} 331}
289 332
290static void insert_to_mm_slots_hash(struct mm_struct *mm, 333static void insert_to_mm_slots_hash(struct mm_struct *mm,
291 struct mm_slot *mm_slot) 334 struct mm_slot *mm_slot)
292{ 335{
293 struct hlist_head *bucket;
294
295 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
296 mm_slot->mm = mm; 336 mm_slot->mm = mm;
297 hlist_add_head(&mm_slot->link, bucket); 337 hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
298}
299
300static inline int in_stable_tree(struct rmap_item *rmap_item)
301{
302 return rmap_item->address & STABLE_FLAG;
303} 338}
304 339
305/* 340/*
@@ -333,7 +368,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
333 368
334 do { 369 do {
335 cond_resched(); 370 cond_resched();
336 page = follow_page(vma, addr, FOLL_GET); 371 page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
337 if (IS_ERR_OR_NULL(page)) 372 if (IS_ERR_OR_NULL(page))
338 break; 373 break;
339 if (PageKsm(page)) 374 if (PageKsm(page))
@@ -447,6 +482,17 @@ out: page = NULL;
447 return page; 482 return page;
448} 483}
449 484
485/*
486 * This helper is used for getting right index into array of tree roots.
487 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
488 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
489 * every node has its own stable and unstable tree.
490 */
491static inline int get_kpfn_nid(unsigned long kpfn)
492{
493 return ksm_merge_across_nodes ? 0 : pfn_to_nid(kpfn);
494}
495
450static void remove_node_from_stable_tree(struct stable_node *stable_node) 496static void remove_node_from_stable_tree(struct stable_node *stable_node)
451{ 497{
452 struct rmap_item *rmap_item; 498 struct rmap_item *rmap_item;
@@ -462,7 +508,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
462 cond_resched(); 508 cond_resched();
463 } 509 }
464 510
465 rb_erase(&stable_node->node, &root_stable_tree); 511 if (stable_node->head == &migrate_nodes)
512 list_del(&stable_node->list);
513 else
514 rb_erase(&stable_node->node,
515 root_stable_tree + NUMA(stable_node->nid));
466 free_stable_node(stable_node); 516 free_stable_node(stable_node);
467} 517}
468 518
@@ -472,6 +522,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
472 * In which case we can trust the content of the page, and it 522 * In which case we can trust the content of the page, and it
473 * returns the gotten page; but if the page has now been zapped, 523 * returns the gotten page; but if the page has now been zapped,
474 * remove the stale node from the stable tree and return NULL. 524 * remove the stale node from the stable tree and return NULL.
525 * But beware, the stable node's page might be being migrated.
475 * 526 *
476 * You would expect the stable_node to hold a reference to the ksm page. 527 * You would expect the stable_node to hold a reference to the ksm page.
477 * But if it increments the page's count, swapping out has to wait for 528 * But if it increments the page's count, swapping out has to wait for
@@ -482,40 +533,77 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
482 * pointing back to this stable node. This relies on freeing a PageAnon 533 * pointing back to this stable node. This relies on freeing a PageAnon
483 * page to reset its page->mapping to NULL, and relies on no other use of 534 * page to reset its page->mapping to NULL, and relies on no other use of
484 * a page to put something that might look like our key in page->mapping. 535 * a page to put something that might look like our key in page->mapping.
485 *
486 * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
487 * but this is different - made simpler by ksm_thread_mutex being held, but
488 * interesting for assuming that no other use of the struct page could ever
489 * put our expected_mapping into page->mapping (or a field of the union which
490 * coincides with page->mapping). The RCU calls are not for KSM at all, but
491 * to keep the page_count protocol described with page_cache_get_speculative.
492 *
493 * Note: it is possible that get_ksm_page() will return NULL one moment,
494 * then page the next, if the page is in between page_freeze_refs() and
495 * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
496 * is on its way to being freed; but it is an anomaly to bear in mind. 536 * is on its way to being freed; but it is an anomaly to bear in mind.
497 */ 537 */
498static struct page *get_ksm_page(struct stable_node *stable_node) 538static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
499{ 539{
500 struct page *page; 540 struct page *page;
501 void *expected_mapping; 541 void *expected_mapping;
542 unsigned long kpfn;
502 543
503 page = pfn_to_page(stable_node->kpfn);
504 expected_mapping = (void *)stable_node + 544 expected_mapping = (void *)stable_node +
505 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 545 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
506 rcu_read_lock(); 546again:
507 if (page->mapping != expected_mapping) 547 kpfn = ACCESS_ONCE(stable_node->kpfn);
508 goto stale; 548 page = pfn_to_page(kpfn);
509 if (!get_page_unless_zero(page)) 549
550 /*
551 * page is computed from kpfn, so on most architectures reading
552 * page->mapping is naturally ordered after reading node->kpfn,
553 * but on Alpha we need to be more careful.
554 */
555 smp_read_barrier_depends();
556 if (ACCESS_ONCE(page->mapping) != expected_mapping)
510 goto stale; 557 goto stale;
511 if (page->mapping != expected_mapping) { 558
559 /*
560 * We cannot do anything with the page while its refcount is 0.
561 * Usually 0 means free, or tail of a higher-order page: in which
562 * case this node is no longer referenced, and should be freed;
563 * however, it might mean that the page is under page_freeze_refs().
564 * The __remove_mapping() case is easy, again the node is now stale;
565 * but if page is swapcache in migrate_page_move_mapping(), it might
566 * still be our page, in which case it's essential to keep the node.
567 */
568 while (!get_page_unless_zero(page)) {
569 /*
570 * Another check for page->mapping != expected_mapping would
571 * work here too. We have chosen the !PageSwapCache test to
572 * optimize the common case, when the page is or is about to
573 * be freed: PageSwapCache is cleared (under spin_lock_irq)
574 * in the freeze_refs section of __remove_mapping(); but Anon
575 * page->mapping reset to NULL later, in free_pages_prepare().
576 */
577 if (!PageSwapCache(page))
578 goto stale;
579 cpu_relax();
580 }
581
582 if (ACCESS_ONCE(page->mapping) != expected_mapping) {
512 put_page(page); 583 put_page(page);
513 goto stale; 584 goto stale;
514 } 585 }
515 rcu_read_unlock(); 586
587 if (lock_it) {
588 lock_page(page);
589 if (ACCESS_ONCE(page->mapping) != expected_mapping) {
590 unlock_page(page);
591 put_page(page);
592 goto stale;
593 }
594 }
516 return page; 595 return page;
596
517stale: 597stale:
518 rcu_read_unlock(); 598 /*
599 * We come here from above when page->mapping or !PageSwapCache
600 * suggests that the node is stale; but it might be under migration.
601 * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
602 * before checking whether node->kpfn has been changed.
603 */
604 smp_rmb();
605 if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
606 goto again;
519 remove_node_from_stable_tree(stable_node); 607 remove_node_from_stable_tree(stable_node);
520 return NULL; 608 return NULL;
521} 609}
@@ -531,11 +619,10 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
531 struct page *page; 619 struct page *page;
532 620
533 stable_node = rmap_item->head; 621 stable_node = rmap_item->head;
534 page = get_ksm_page(stable_node); 622 page = get_ksm_page(stable_node, true);
535 if (!page) 623 if (!page)
536 goto out; 624 goto out;
537 625
538 lock_page(page);
539 hlist_del(&rmap_item->hlist); 626 hlist_del(&rmap_item->hlist);
540 unlock_page(page); 627 unlock_page(page);
541 put_page(page); 628 put_page(page);
@@ -560,8 +647,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
560 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); 647 age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
561 BUG_ON(age > 1); 648 BUG_ON(age > 1);
562 if (!age) 649 if (!age)
563 rb_erase(&rmap_item->node, &root_unstable_tree); 650 rb_erase(&rmap_item->node,
564 651 root_unstable_tree + NUMA(rmap_item->nid));
565 ksm_pages_unshared--; 652 ksm_pages_unshared--;
566 rmap_item->address &= PAGE_MASK; 653 rmap_item->address &= PAGE_MASK;
567 } 654 }
@@ -581,7 +668,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
581} 668}
582 669
583/* 670/*
584 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather 671 * Though it's very tempting to unmerge rmap_items from stable tree rather
585 * than check every pte of a given vma, the locking doesn't quite work for 672 * than check every pte of a given vma, the locking doesn't quite work for
586 * that - an rmap_item is assigned to the stable tree after inserting ksm 673 * that - an rmap_item is assigned to the stable tree after inserting ksm
587 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing 674 * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
@@ -614,6 +701,71 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
614/* 701/*
615 * Only called through the sysfs control interface: 702 * Only called through the sysfs control interface:
616 */ 703 */
704static int remove_stable_node(struct stable_node *stable_node)
705{
706 struct page *page;
707 int err;
708
709 page = get_ksm_page(stable_node, true);
710 if (!page) {
711 /*
712 * get_ksm_page did remove_node_from_stable_tree itself.
713 */
714 return 0;
715 }
716
717 if (WARN_ON_ONCE(page_mapped(page))) {
718 /*
719 * This should not happen: but if it does, just refuse to let
720 * merge_across_nodes be switched - there is no need to panic.
721 */
722 err = -EBUSY;
723 } else {
724 /*
725 * The stable node did not yet appear stale to get_ksm_page(),
726 * since that allows for an unmapped ksm page to be recognized
727 * right up until it is freed; but the node is safe to remove.
728 * This page might be in a pagevec waiting to be freed,
729 * or it might be PageSwapCache (perhaps under writeback),
730 * or it might have been removed from swapcache a moment ago.
731 */
732 set_page_stable_node(page, NULL);
733 remove_node_from_stable_tree(stable_node);
734 err = 0;
735 }
736
737 unlock_page(page);
738 put_page(page);
739 return err;
740}
741
742static int remove_all_stable_nodes(void)
743{
744 struct stable_node *stable_node;
745 struct list_head *this, *next;
746 int nid;
747 int err = 0;
748
749 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
750 while (root_stable_tree[nid].rb_node) {
751 stable_node = rb_entry(root_stable_tree[nid].rb_node,
752 struct stable_node, node);
753 if (remove_stable_node(stable_node)) {
754 err = -EBUSY;
755 break; /* proceed to next nid */
756 }
757 cond_resched();
758 }
759 }
760 list_for_each_safe(this, next, &migrate_nodes) {
761 stable_node = list_entry(this, struct stable_node, list);
762 if (remove_stable_node(stable_node))
763 err = -EBUSY;
764 cond_resched();
765 }
766 return err;
767}
768
617static int unmerge_and_remove_all_rmap_items(void) 769static int unmerge_and_remove_all_rmap_items(void)
618{ 770{
619 struct mm_slot *mm_slot; 771 struct mm_slot *mm_slot;
@@ -647,7 +799,7 @@ static int unmerge_and_remove_all_rmap_items(void)
647 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, 799 ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
648 struct mm_slot, mm_list); 800 struct mm_slot, mm_list);
649 if (ksm_test_exit(mm)) { 801 if (ksm_test_exit(mm)) {
650 hlist_del(&mm_slot->link); 802 hash_del(&mm_slot->link);
651 list_del(&mm_slot->mm_list); 803 list_del(&mm_slot->mm_list);
652 spin_unlock(&ksm_mmlist_lock); 804 spin_unlock(&ksm_mmlist_lock);
653 805
@@ -661,6 +813,8 @@ static int unmerge_and_remove_all_rmap_items(void)
661 } 813 }
662 } 814 }
663 815
816 /* Clean up stable nodes, but don't worry if some are still busy */
817 remove_all_stable_nodes();
664 ksm_scan.seqnr = 0; 818 ksm_scan.seqnr = 0;
665 return 0; 819 return 0;
666 820
@@ -946,6 +1100,9 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
946 if (err) 1100 if (err)
947 goto out; 1101 goto out;
948 1102
1103 /* Unstable nid is in union with stable anon_vma: remove first */
1104 remove_rmap_item_from_tree(rmap_item);
1105
949 /* Must get reference to anon_vma while still holding mmap_sem */ 1106 /* Must get reference to anon_vma while still holding mmap_sem */
950 rmap_item->anon_vma = vma->anon_vma; 1107 rmap_item->anon_vma = vma->anon_vma;
951 get_anon_vma(vma->anon_vma); 1108 get_anon_vma(vma->anon_vma);
@@ -996,42 +1153,99 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
996 */ 1153 */
997static struct page *stable_tree_search(struct page *page) 1154static struct page *stable_tree_search(struct page *page)
998{ 1155{
999 struct rb_node *node = root_stable_tree.rb_node; 1156 int nid;
1157 struct rb_root *root;
1158 struct rb_node **new;
1159 struct rb_node *parent;
1000 struct stable_node *stable_node; 1160 struct stable_node *stable_node;
1161 struct stable_node *page_node;
1001 1162
1002 stable_node = page_stable_node(page); 1163 page_node = page_stable_node(page);
1003 if (stable_node) { /* ksm page forked */ 1164 if (page_node && page_node->head != &migrate_nodes) {
1165 /* ksm page forked */
1004 get_page(page); 1166 get_page(page);
1005 return page; 1167 return page;
1006 } 1168 }
1007 1169
1008 while (node) { 1170 nid = get_kpfn_nid(page_to_pfn(page));
1171 root = root_stable_tree + nid;
1172again:
1173 new = &root->rb_node;
1174 parent = NULL;
1175
1176 while (*new) {
1009 struct page *tree_page; 1177 struct page *tree_page;
1010 int ret; 1178 int ret;
1011 1179
1012 cond_resched(); 1180 cond_resched();
1013 stable_node = rb_entry(node, struct stable_node, node); 1181 stable_node = rb_entry(*new, struct stable_node, node);
1014 tree_page = get_ksm_page(stable_node); 1182 tree_page = get_ksm_page(stable_node, false);
1015 if (!tree_page) 1183 if (!tree_page)
1016 return NULL; 1184 return NULL;
1017 1185
1018 ret = memcmp_pages(page, tree_page); 1186 ret = memcmp_pages(page, tree_page);
1187 put_page(tree_page);
1019 1188
1020 if (ret < 0) { 1189 parent = *new;
1021 put_page(tree_page); 1190 if (ret < 0)
1022 node = node->rb_left; 1191 new = &parent->rb_left;
1023 } else if (ret > 0) { 1192 else if (ret > 0)
1024 put_page(tree_page); 1193 new = &parent->rb_right;
1025 node = node->rb_right; 1194 else {
1026 } else 1195 /*
1027 return tree_page; 1196 * Lock and unlock the stable_node's page (which
1197 * might already have been migrated) so that page
1198 * migration is sure to notice its raised count.
1199 * It would be more elegant to return stable_node
1200 * than kpage, but that involves more changes.
1201 */
1202 tree_page = get_ksm_page(stable_node, true);
1203 if (tree_page) {
1204 unlock_page(tree_page);
1205 if (get_kpfn_nid(stable_node->kpfn) !=
1206 NUMA(stable_node->nid)) {
1207 put_page(tree_page);
1208 goto replace;
1209 }
1210 return tree_page;
1211 }
1212 /*
1213 * There is now a place for page_node, but the tree may
1214 * have been rebalanced, so re-evaluate parent and new.
1215 */
1216 if (page_node)
1217 goto again;
1218 return NULL;
1219 }
1028 } 1220 }
1029 1221
1030 return NULL; 1222 if (!page_node)
1223 return NULL;
1224
1225 list_del(&page_node->list);
1226 DO_NUMA(page_node->nid = nid);
1227 rb_link_node(&page_node->node, parent, new);
1228 rb_insert_color(&page_node->node, root);
1229 get_page(page);
1230 return page;
1231
1232replace:
1233 if (page_node) {
1234 list_del(&page_node->list);
1235 DO_NUMA(page_node->nid = nid);
1236 rb_replace_node(&stable_node->node, &page_node->node, root);
1237 get_page(page);
1238 } else {
1239 rb_erase(&stable_node->node, root);
1240 page = NULL;
1241 }
1242 stable_node->head = &migrate_nodes;
1243 list_add(&stable_node->list, stable_node->head);
1244 return page;
1031} 1245}
1032 1246
1033/* 1247/*
1034 * stable_tree_insert - insert rmap_item pointing to new ksm page 1248 * stable_tree_insert - insert stable tree node pointing to new ksm page
1035 * into the stable tree. 1249 * into the stable tree.
1036 * 1250 *
1037 * This function returns the stable tree node just allocated on success, 1251 * This function returns the stable tree node just allocated on success,
@@ -1039,17 +1253,25 @@ static struct page *stable_tree_search(struct page *page)
1039 */ 1253 */
1040static struct stable_node *stable_tree_insert(struct page *kpage) 1254static struct stable_node *stable_tree_insert(struct page *kpage)
1041{ 1255{
1042 struct rb_node **new = &root_stable_tree.rb_node; 1256 int nid;
1257 unsigned long kpfn;
1258 struct rb_root *root;
1259 struct rb_node **new;
1043 struct rb_node *parent = NULL; 1260 struct rb_node *parent = NULL;
1044 struct stable_node *stable_node; 1261 struct stable_node *stable_node;
1045 1262
1263 kpfn = page_to_pfn(kpage);
1264 nid = get_kpfn_nid(kpfn);
1265 root = root_stable_tree + nid;
1266 new = &root->rb_node;
1267
1046 while (*new) { 1268 while (*new) {
1047 struct page *tree_page; 1269 struct page *tree_page;
1048 int ret; 1270 int ret;
1049 1271
1050 cond_resched(); 1272 cond_resched();
1051 stable_node = rb_entry(*new, struct stable_node, node); 1273 stable_node = rb_entry(*new, struct stable_node, node);
1052 tree_page = get_ksm_page(stable_node); 1274 tree_page = get_ksm_page(stable_node, false);
1053 if (!tree_page) 1275 if (!tree_page)
1054 return NULL; 1276 return NULL;
1055 1277
@@ -1075,13 +1297,12 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
1075 if (!stable_node) 1297 if (!stable_node)
1076 return NULL; 1298 return NULL;
1077 1299
1078 rb_link_node(&stable_node->node, parent, new);
1079 rb_insert_color(&stable_node->node, &root_stable_tree);
1080
1081 INIT_HLIST_HEAD(&stable_node->hlist); 1300 INIT_HLIST_HEAD(&stable_node->hlist);
1082 1301 stable_node->kpfn = kpfn;
1083 stable_node->kpfn = page_to_pfn(kpage);
1084 set_page_stable_node(kpage, stable_node); 1302 set_page_stable_node(kpage, stable_node);
1303 DO_NUMA(stable_node->nid = nid);
1304 rb_link_node(&stable_node->node, parent, new);
1305 rb_insert_color(&stable_node->node, root);
1085 1306
1086 return stable_node; 1307 return stable_node;
1087} 1308}
@@ -1104,10 +1325,15 @@ static
1104struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, 1325struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1105 struct page *page, 1326 struct page *page,
1106 struct page **tree_pagep) 1327 struct page **tree_pagep)
1107
1108{ 1328{
1109 struct rb_node **new = &root_unstable_tree.rb_node; 1329 struct rb_node **new;
1330 struct rb_root *root;
1110 struct rb_node *parent = NULL; 1331 struct rb_node *parent = NULL;
1332 int nid;
1333
1334 nid = get_kpfn_nid(page_to_pfn(page));
1335 root = root_unstable_tree + nid;
1336 new = &root->rb_node;
1111 1337
1112 while (*new) { 1338 while (*new) {
1113 struct rmap_item *tree_rmap_item; 1339 struct rmap_item *tree_rmap_item;
@@ -1137,6 +1363,15 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1137 } else if (ret > 0) { 1363 } else if (ret > 0) {
1138 put_page(tree_page); 1364 put_page(tree_page);
1139 new = &parent->rb_right; 1365 new = &parent->rb_right;
1366 } else if (!ksm_merge_across_nodes &&
1367 page_to_nid(tree_page) != nid) {
1368 /*
1369 * If tree_page has been migrated to another NUMA node,
1370 * it will be flushed out and put in the right unstable
1371 * tree next time: only merge with it when across_nodes.
1372 */
1373 put_page(tree_page);
1374 return NULL;
1140 } else { 1375 } else {
1141 *tree_pagep = tree_page; 1376 *tree_pagep = tree_page;
1142 return tree_rmap_item; 1377 return tree_rmap_item;
@@ -1145,8 +1380,9 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
1145 1380
1146 rmap_item->address |= UNSTABLE_FLAG; 1381 rmap_item->address |= UNSTABLE_FLAG;
1147 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); 1382 rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
1383 DO_NUMA(rmap_item->nid = nid);
1148 rb_link_node(&rmap_item->node, parent, new); 1384 rb_link_node(&rmap_item->node, parent, new);
1149 rb_insert_color(&rmap_item->node, &root_unstable_tree); 1385 rb_insert_color(&rmap_item->node, root);
1150 1386
1151 ksm_pages_unshared++; 1387 ksm_pages_unshared++;
1152 return NULL; 1388 return NULL;
@@ -1188,10 +1424,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1188 unsigned int checksum; 1424 unsigned int checksum;
1189 int err; 1425 int err;
1190 1426
1191 remove_rmap_item_from_tree(rmap_item); 1427 stable_node = page_stable_node(page);
1428 if (stable_node) {
1429 if (stable_node->head != &migrate_nodes &&
1430 get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
1431 rb_erase(&stable_node->node,
1432 root_stable_tree + NUMA(stable_node->nid));
1433 stable_node->head = &migrate_nodes;
1434 list_add(&stable_node->list, stable_node->head);
1435 }
1436 if (stable_node->head != &migrate_nodes &&
1437 rmap_item->head == stable_node)
1438 return;
1439 }
1192 1440
1193 /* We first start with searching the page inside the stable tree */ 1441 /* We first start with searching the page inside the stable tree */
1194 kpage = stable_tree_search(page); 1442 kpage = stable_tree_search(page);
1443 if (kpage == page && rmap_item->head == stable_node) {
1444 put_page(kpage);
1445 return;
1446 }
1447
1448 remove_rmap_item_from_tree(rmap_item);
1449
1195 if (kpage) { 1450 if (kpage) {
1196 err = try_to_merge_with_ksm_page(rmap_item, page, kpage); 1451 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
1197 if (!err) { 1452 if (!err) {
@@ -1225,14 +1480,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
1225 kpage = try_to_merge_two_pages(rmap_item, page, 1480 kpage = try_to_merge_two_pages(rmap_item, page,
1226 tree_rmap_item, tree_page); 1481 tree_rmap_item, tree_page);
1227 put_page(tree_page); 1482 put_page(tree_page);
1228 /*
1229 * As soon as we merge this page, we want to remove the
1230 * rmap_item of the page we have merged with from the unstable
1231 * tree, and insert it instead as new node in the stable tree.
1232 */
1233 if (kpage) { 1483 if (kpage) {
1234 remove_rmap_item_from_tree(tree_rmap_item); 1484 /*
1235 1485 * The pages were successfully merged: insert new
1486 * node in the stable tree and add both rmap_items.
1487 */
1236 lock_page(kpage); 1488 lock_page(kpage);
1237 stable_node = stable_tree_insert(kpage); 1489 stable_node = stable_tree_insert(kpage);
1238 if (stable_node) { 1490 if (stable_node) {
@@ -1289,6 +1541,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1289 struct mm_slot *slot; 1541 struct mm_slot *slot;
1290 struct vm_area_struct *vma; 1542 struct vm_area_struct *vma;
1291 struct rmap_item *rmap_item; 1543 struct rmap_item *rmap_item;
1544 int nid;
1292 1545
1293 if (list_empty(&ksm_mm_head.mm_list)) 1546 if (list_empty(&ksm_mm_head.mm_list))
1294 return NULL; 1547 return NULL;
@@ -1307,7 +1560,29 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1307 */ 1560 */
1308 lru_add_drain_all(); 1561 lru_add_drain_all();
1309 1562
1310 root_unstable_tree = RB_ROOT; 1563 /*
1564 * Whereas stale stable_nodes on the stable_tree itself
1565 * get pruned in the regular course of stable_tree_search(),
1566 * those moved out to the migrate_nodes list can accumulate:
1567 * so prune them once before each full scan.
1568 */
1569 if (!ksm_merge_across_nodes) {
1570 struct stable_node *stable_node;
1571 struct list_head *this, *next;
1572 struct page *page;
1573
1574 list_for_each_safe(this, next, &migrate_nodes) {
1575 stable_node = list_entry(this,
1576 struct stable_node, list);
1577 page = get_ksm_page(stable_node, false);
1578 if (page)
1579 put_page(page);
1580 cond_resched();
1581 }
1582 }
1583
1584 for (nid = 0; nid < ksm_nr_node_ids; nid++)
1585 root_unstable_tree[nid] = RB_ROOT;
1311 1586
1312 spin_lock(&ksm_mmlist_lock); 1587 spin_lock(&ksm_mmlist_lock);
1313 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); 1588 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
@@ -1392,7 +1667,7 @@ next_mm:
1392 * or when all VM_MERGEABLE areas have been unmapped (and 1667 * or when all VM_MERGEABLE areas have been unmapped (and
1393 * mmap_sem then protects against race with MADV_MERGEABLE). 1668 * mmap_sem then protects against race with MADV_MERGEABLE).
1394 */ 1669 */
1395 hlist_del(&slot->link); 1670 hash_del(&slot->link);
1396 list_del(&slot->mm_list); 1671 list_del(&slot->mm_list);
1397 spin_unlock(&ksm_mmlist_lock); 1672 spin_unlock(&ksm_mmlist_lock);
1398 1673
@@ -1428,8 +1703,7 @@ static void ksm_do_scan(unsigned int scan_npages)
1428 rmap_item = scan_get_next_rmap_item(&page); 1703 rmap_item = scan_get_next_rmap_item(&page);
1429 if (!rmap_item) 1704 if (!rmap_item)
1430 return; 1705 return;
1431 if (!PageKsm(page) || !in_stable_tree(rmap_item)) 1706 cmp_and_merge_page(page, rmap_item);
1432 cmp_and_merge_page(page, rmap_item);
1433 put_page(page); 1707 put_page(page);
1434 } 1708 }
1435} 1709}
@@ -1446,6 +1720,7 @@ static int ksm_scan_thread(void *nothing)
1446 1720
1447 while (!kthread_should_stop()) { 1721 while (!kthread_should_stop()) {
1448 mutex_lock(&ksm_thread_mutex); 1722 mutex_lock(&ksm_thread_mutex);
1723 wait_while_offlining();
1449 if (ksmd_should_run()) 1724 if (ksmd_should_run())
1450 ksm_do_scan(ksm_thread_pages_to_scan); 1725 ksm_do_scan(ksm_thread_pages_to_scan);
1451 mutex_unlock(&ksm_thread_mutex); 1726 mutex_unlock(&ksm_thread_mutex);
@@ -1525,11 +1800,19 @@ int __ksm_enter(struct mm_struct *mm)
1525 spin_lock(&ksm_mmlist_lock); 1800 spin_lock(&ksm_mmlist_lock);
1526 insert_to_mm_slots_hash(mm, mm_slot); 1801 insert_to_mm_slots_hash(mm, mm_slot);
1527 /* 1802 /*
1528 * Insert just behind the scanning cursor, to let the area settle 1803 * When KSM_RUN_MERGE (or KSM_RUN_STOP),
1804 * insert just behind the scanning cursor, to let the area settle
1529 * down a little; when fork is followed by immediate exec, we don't 1805 * down a little; when fork is followed by immediate exec, we don't
1530 * want ksmd to waste time setting up and tearing down an rmap_list. 1806 * want ksmd to waste time setting up and tearing down an rmap_list.
1807 *
1808 * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
1809 * scanning cursor, otherwise KSM pages in newly forked mms will be
1810 * missed: then we might as well insert at the end of the list.
1531 */ 1811 */
1532 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); 1812 if (ksm_run & KSM_RUN_UNMERGE)
1813 list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
1814 else
1815 list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
1533 spin_unlock(&ksm_mmlist_lock); 1816 spin_unlock(&ksm_mmlist_lock);
1534 1817
1535 set_bit(MMF_VM_MERGEABLE, &mm->flags); 1818 set_bit(MMF_VM_MERGEABLE, &mm->flags);
@@ -1559,7 +1842,7 @@ void __ksm_exit(struct mm_struct *mm)
1559 mm_slot = get_mm_slot(mm); 1842 mm_slot = get_mm_slot(mm);
1560 if (mm_slot && ksm_scan.mm_slot != mm_slot) { 1843 if (mm_slot && ksm_scan.mm_slot != mm_slot) {
1561 if (!mm_slot->rmap_list) { 1844 if (!mm_slot->rmap_list) {
1562 hlist_del(&mm_slot->link); 1845 hash_del(&mm_slot->link);
1563 list_del(&mm_slot->mm_list); 1846 list_del(&mm_slot->mm_list);
1564 easy_to_free = 1; 1847 easy_to_free = 1;
1565 } else { 1848 } else {
@@ -1579,24 +1862,32 @@ void __ksm_exit(struct mm_struct *mm)
1579 } 1862 }
1580} 1863}
1581 1864
1582struct page *ksm_does_need_to_copy(struct page *page, 1865struct page *ksm_might_need_to_copy(struct page *page,
1583 struct vm_area_struct *vma, unsigned long address) 1866 struct vm_area_struct *vma, unsigned long address)
1584{ 1867{
1868 struct anon_vma *anon_vma = page_anon_vma(page);
1585 struct page *new_page; 1869 struct page *new_page;
1586 1870
1871 if (PageKsm(page)) {
1872 if (page_stable_node(page) &&
1873 !(ksm_run & KSM_RUN_UNMERGE))
1874 return page; /* no need to copy it */
1875 } else if (!anon_vma) {
1876 return page; /* no need to copy it */
1877 } else if (anon_vma->root == vma->anon_vma->root &&
1878 page->index == linear_page_index(vma, address)) {
1879 return page; /* still no need to copy it */
1880 }
1881 if (!PageUptodate(page))
1882 return page; /* let do_swap_page report the error */
1883
1587 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1884 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1588 if (new_page) { 1885 if (new_page) {
1589 copy_user_highpage(new_page, page, address, vma); 1886 copy_user_highpage(new_page, page, address, vma);
1590 1887
1591 SetPageDirty(new_page); 1888 SetPageDirty(new_page);
1592 __SetPageUptodate(new_page); 1889 __SetPageUptodate(new_page);
1593 SetPageSwapBacked(new_page);
1594 __set_page_locked(new_page); 1890 __set_page_locked(new_page);
1595
1596 if (!mlocked_vma_newpage(vma, new_page))
1597 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
1598 else
1599 add_page_to_unevictable_list(new_page);
1600 } 1891 }
1601 1892
1602 return new_page; 1893 return new_page;
@@ -1773,64 +2064,115 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage)
1773 if (stable_node) { 2064 if (stable_node) {
1774 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); 2065 VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
1775 stable_node->kpfn = page_to_pfn(newpage); 2066 stable_node->kpfn = page_to_pfn(newpage);
2067 /*
2068 * newpage->mapping was set in advance; now we need smp_wmb()
2069 * to make sure that the new stable_node->kpfn is visible
2070 * to get_ksm_page() before it can see that oldpage->mapping
2071 * has gone stale (or that PageSwapCache has been cleared).
2072 */
2073 smp_wmb();
2074 set_page_stable_node(oldpage, NULL);
1776 } 2075 }
1777} 2076}
1778#endif /* CONFIG_MIGRATION */ 2077#endif /* CONFIG_MIGRATION */
1779 2078
1780#ifdef CONFIG_MEMORY_HOTREMOVE 2079#ifdef CONFIG_MEMORY_HOTREMOVE
1781static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, 2080static int just_wait(void *word)
1782 unsigned long end_pfn)
1783{ 2081{
1784 struct rb_node *node; 2082 schedule();
2083 return 0;
2084}
1785 2085
1786 for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { 2086static void wait_while_offlining(void)
1787 struct stable_node *stable_node; 2087{
2088 while (ksm_run & KSM_RUN_OFFLINE) {
2089 mutex_unlock(&ksm_thread_mutex);
2090 wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
2091 just_wait, TASK_UNINTERRUPTIBLE);
2092 mutex_lock(&ksm_thread_mutex);
2093 }
2094}
1788 2095
1789 stable_node = rb_entry(node, struct stable_node, node); 2096static void ksm_check_stable_tree(unsigned long start_pfn,
2097 unsigned long end_pfn)
2098{
2099 struct stable_node *stable_node;
2100 struct list_head *this, *next;
2101 struct rb_node *node;
2102 int nid;
2103
2104 for (nid = 0; nid < ksm_nr_node_ids; nid++) {
2105 node = rb_first(root_stable_tree + nid);
2106 while (node) {
2107 stable_node = rb_entry(node, struct stable_node, node);
2108 if (stable_node->kpfn >= start_pfn &&
2109 stable_node->kpfn < end_pfn) {
2110 /*
2111 * Don't get_ksm_page, page has already gone:
2112 * which is why we keep kpfn instead of page*
2113 */
2114 remove_node_from_stable_tree(stable_node);
2115 node = rb_first(root_stable_tree + nid);
2116 } else
2117 node = rb_next(node);
2118 cond_resched();
2119 }
2120 }
2121 list_for_each_safe(this, next, &migrate_nodes) {
2122 stable_node = list_entry(this, struct stable_node, list);
1790 if (stable_node->kpfn >= start_pfn && 2123 if (stable_node->kpfn >= start_pfn &&
1791 stable_node->kpfn < end_pfn) 2124 stable_node->kpfn < end_pfn)
1792 return stable_node; 2125 remove_node_from_stable_tree(stable_node);
2126 cond_resched();
1793 } 2127 }
1794 return NULL;
1795} 2128}
1796 2129
1797static int ksm_memory_callback(struct notifier_block *self, 2130static int ksm_memory_callback(struct notifier_block *self,
1798 unsigned long action, void *arg) 2131 unsigned long action, void *arg)
1799{ 2132{
1800 struct memory_notify *mn = arg; 2133 struct memory_notify *mn = arg;
1801 struct stable_node *stable_node;
1802 2134
1803 switch (action) { 2135 switch (action) {
1804 case MEM_GOING_OFFLINE: 2136 case MEM_GOING_OFFLINE:
1805 /* 2137 /*
1806 * Keep it very simple for now: just lock out ksmd and 2138 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
1807 * MADV_UNMERGEABLE while any memory is going offline. 2139 * and remove_all_stable_nodes() while memory is going offline:
1808 * mutex_lock_nested() is necessary because lockdep was alarmed 2140 * it is unsafe for them to touch the stable tree at this time.
1809 * that here we take ksm_thread_mutex inside notifier chain 2141 * But unmerge_ksm_pages(), rmap lookups and other entry points
1810 * mutex, and later take notifier chain mutex inside 2142 * which do not need the ksm_thread_mutex are all safe.
1811 * ksm_thread_mutex to unlock it. But that's safe because both
1812 * are inside mem_hotplug_mutex.
1813 */ 2143 */
1814 mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); 2144 mutex_lock(&ksm_thread_mutex);
2145 ksm_run |= KSM_RUN_OFFLINE;
2146 mutex_unlock(&ksm_thread_mutex);
1815 break; 2147 break;
1816 2148
1817 case MEM_OFFLINE: 2149 case MEM_OFFLINE:
1818 /* 2150 /*
1819 * Most of the work is done by page migration; but there might 2151 * Most of the work is done by page migration; but there might
1820 * be a few stable_nodes left over, still pointing to struct 2152 * be a few stable_nodes left over, still pointing to struct
1821 * pages which have been offlined: prune those from the tree. 2153 * pages which have been offlined: prune those from the tree,
2154 * otherwise get_ksm_page() might later try to access a
2155 * non-existent struct page.
1822 */ 2156 */
1823 while ((stable_node = ksm_check_stable_tree(mn->start_pfn, 2157 ksm_check_stable_tree(mn->start_pfn,
1824 mn->start_pfn + mn->nr_pages)) != NULL) 2158 mn->start_pfn + mn->nr_pages);
1825 remove_node_from_stable_tree(stable_node);
1826 /* fallthrough */ 2159 /* fallthrough */
1827 2160
1828 case MEM_CANCEL_OFFLINE: 2161 case MEM_CANCEL_OFFLINE:
2162 mutex_lock(&ksm_thread_mutex);
2163 ksm_run &= ~KSM_RUN_OFFLINE;
1829 mutex_unlock(&ksm_thread_mutex); 2164 mutex_unlock(&ksm_thread_mutex);
2165
2166 smp_mb(); /* wake_up_bit advises this */
2167 wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));
1830 break; 2168 break;
1831 } 2169 }
1832 return NOTIFY_OK; 2170 return NOTIFY_OK;
1833} 2171}
2172#else
2173static void wait_while_offlining(void)
2174{
2175}
1834#endif /* CONFIG_MEMORY_HOTREMOVE */ 2176#endif /* CONFIG_MEMORY_HOTREMOVE */
1835 2177
1836#ifdef CONFIG_SYSFS 2178#ifdef CONFIG_SYSFS
@@ -1893,7 +2235,7 @@ KSM_ATTR(pages_to_scan);
1893static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, 2235static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
1894 char *buf) 2236 char *buf)
1895{ 2237{
1896 return sprintf(buf, "%u\n", ksm_run); 2238 return sprintf(buf, "%lu\n", ksm_run);
1897} 2239}
1898 2240
1899static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, 2241static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1916,6 +2258,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1916 */ 2258 */
1917 2259
1918 mutex_lock(&ksm_thread_mutex); 2260 mutex_lock(&ksm_thread_mutex);
2261 wait_while_offlining();
1919 if (ksm_run != flags) { 2262 if (ksm_run != flags) {
1920 ksm_run = flags; 2263 ksm_run = flags;
1921 if (flags & KSM_RUN_UNMERGE) { 2264 if (flags & KSM_RUN_UNMERGE) {
@@ -1937,6 +2280,64 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1937} 2280}
1938KSM_ATTR(run); 2281KSM_ATTR(run);
1939 2282
2283#ifdef CONFIG_NUMA
2284static ssize_t merge_across_nodes_show(struct kobject *kobj,
2285 struct kobj_attribute *attr, char *buf)
2286{
2287 return sprintf(buf, "%u\n", ksm_merge_across_nodes);
2288}
2289
2290static ssize_t merge_across_nodes_store(struct kobject *kobj,
2291 struct kobj_attribute *attr,
2292 const char *buf, size_t count)
2293{
2294 int err;
2295 unsigned long knob;
2296
2297 err = kstrtoul(buf, 10, &knob);
2298 if (err)
2299 return err;
2300 if (knob > 1)
2301 return -EINVAL;
2302
2303 mutex_lock(&ksm_thread_mutex);
2304 wait_while_offlining();
2305 if (ksm_merge_across_nodes != knob) {
2306 if (ksm_pages_shared || remove_all_stable_nodes())
2307 err = -EBUSY;
2308 else if (root_stable_tree == one_stable_tree) {
2309 struct rb_root *buf;
2310 /*
2311 * This is the first time that we switch away from the
2312 * default of merging across nodes: must now allocate
2313 * a buffer to hold as many roots as may be needed.
2314 * Allocate stable and unstable together:
2315 * MAXSMP NODES_SHIFT 10 will use 16kB.
2316 */
2317 buf = kcalloc(nr_node_ids + nr_node_ids,
2318 sizeof(*buf), GFP_KERNEL | __GFP_ZERO);
2319 /* Let us assume that RB_ROOT is NULL, and that NULL is zero */
2320 if (!buf)
2321 err = -ENOMEM;
2322 else {
2323 root_stable_tree = buf;
2324 root_unstable_tree = buf + nr_node_ids;
2325 /* Stable tree is empty but not the unstable */
2326 root_unstable_tree[0] = one_unstable_tree[0];
2327 }
2328 }
2329 if (!err) {
2330 ksm_merge_across_nodes = knob;
2331 ksm_nr_node_ids = knob ? 1 : nr_node_ids;
2332 }
2333 }
2334 mutex_unlock(&ksm_thread_mutex);
2335
2336 return err ? err : count;
2337}
2338KSM_ATTR(merge_across_nodes);
2339#endif
2340
1940static ssize_t pages_shared_show(struct kobject *kobj, 2341static ssize_t pages_shared_show(struct kobject *kobj,
1941 struct kobj_attribute *attr, char *buf) 2342 struct kobj_attribute *attr, char *buf)
1942{ 2343{
@@ -1991,6 +2392,9 @@ static struct attribute *ksm_attrs[] = {
1991 &pages_unshared_attr.attr, 2392 &pages_unshared_attr.attr,
1992 &pages_volatile_attr.attr, 2393 &pages_volatile_attr.attr,
1993 &full_scans_attr.attr, 2394 &full_scans_attr.attr,
2395#ifdef CONFIG_NUMA
2396 &merge_across_nodes_attr.attr,
2397#endif
1994 NULL, 2398 NULL,
1995}; 2399};
1996 2400
@@ -2029,10 +2433,7 @@ static int __init ksm_init(void)
2029#endif /* CONFIG_SYSFS */ 2433#endif /* CONFIG_SYSFS */
2030 2434
2031#ifdef CONFIG_MEMORY_HOTREMOVE 2435#ifdef CONFIG_MEMORY_HOTREMOVE
2032 /* 2436 /* There is no significance to this priority 100 */
2033 * Choose a high priority since the callback takes ksm_thread_mutex:
2034 * later callbacks could only be taking locks which nest within that.
2035 */
2036 hotplug_memory_notifier(ksm_memory_callback, 100); 2437 hotplug_memory_notifier(ksm_memory_callback, 100);
2037#endif 2438#endif
2038 return 0; 2439 return 0;
diff --git a/mm/madvise.c b/mm/madvise.c
index 03dfa5c7adb3..c58c94b56c3d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -16,6 +16,9 @@
16#include <linux/ksm.h> 16#include <linux/ksm.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/file.h> 18#include <linux/file.h>
19#include <linux/blkdev.h>
20#include <linux/swap.h>
21#include <linux/swapops.h>
19 22
20/* 23/*
21 * Any behaviour which results in changes to the vma->vm_flags needs to 24 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -131,6 +134,84 @@ out:
131 return error; 134 return error;
132} 135}
133 136
137#ifdef CONFIG_SWAP
138static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
139 unsigned long end, struct mm_walk *walk)
140{
141 pte_t *orig_pte;
142 struct vm_area_struct *vma = walk->private;
143 unsigned long index;
144
145 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
146 return 0;
147
148 for (index = start; index != end; index += PAGE_SIZE) {
149 pte_t pte;
150 swp_entry_t entry;
151 struct page *page;
152 spinlock_t *ptl;
153
154 orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
155 pte = *(orig_pte + ((index - start) / PAGE_SIZE));
156 pte_unmap_unlock(orig_pte, ptl);
157
158 if (pte_present(pte) || pte_none(pte) || pte_file(pte))
159 continue;
160 entry = pte_to_swp_entry(pte);
161 if (unlikely(non_swap_entry(entry)))
162 continue;
163
164 page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
165 vma, index);
166 if (page)
167 page_cache_release(page);
168 }
169
170 return 0;
171}
172
173static void force_swapin_readahead(struct vm_area_struct *vma,
174 unsigned long start, unsigned long end)
175{
176 struct mm_walk walk = {
177 .mm = vma->vm_mm,
178 .pmd_entry = swapin_walk_pmd_entry,
179 .private = vma,
180 };
181
182 walk_page_range(start, end, &walk);
183
184 lru_add_drain(); /* Push any new pages onto the LRU now */
185}
186
187static void force_shm_swapin_readahead(struct vm_area_struct *vma,
188 unsigned long start, unsigned long end,
189 struct address_space *mapping)
190{
191 pgoff_t index;
192 struct page *page;
193 swp_entry_t swap;
194
195 for (; start < end; start += PAGE_SIZE) {
196 index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
197
198 page = find_get_page(mapping, index);
199 if (!radix_tree_exceptional_entry(page)) {
200 if (page)
201 page_cache_release(page);
202 continue;
203 }
204 swap = radix_to_swp_entry(page);
205 page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
206 NULL, 0);
207 if (page)
208 page_cache_release(page);
209 }
210
211 lru_add_drain(); /* Push any new pages onto the LRU now */
212}
213#endif /* CONFIG_SWAP */
214
134/* 215/*
135 * Schedule all required I/O operations. Do not wait for completion. 216 * Schedule all required I/O operations. Do not wait for completion.
136 */ 217 */
@@ -140,6 +221,18 @@ static long madvise_willneed(struct vm_area_struct * vma,
140{ 221{
141 struct file *file = vma->vm_file; 222 struct file *file = vma->vm_file;
142 223
224#ifdef CONFIG_SWAP
225 if (!file || mapping_cap_swap_backed(file->f_mapping)) {
226 *prev = vma;
227 if (!file)
228 force_swapin_readahead(vma, start, end);
229 else
230 force_shm_swapin_readahead(vma, start, end,
231 file->f_mapping);
232 return 0;
233 }
234#endif
235
143 if (!file) 236 if (!file)
144 return -EBADF; 237 return -EBADF;
145 238
@@ -371,6 +464,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
371 int error = -EINVAL; 464 int error = -EINVAL;
372 int write; 465 int write;
373 size_t len; 466 size_t len;
467 struct blk_plug plug;
374 468
375#ifdef CONFIG_MEMORY_FAILURE 469#ifdef CONFIG_MEMORY_FAILURE
376 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) 470 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -410,18 +504,19 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
410 if (vma && start > vma->vm_start) 504 if (vma && start > vma->vm_start)
411 prev = vma; 505 prev = vma;
412 506
507 blk_start_plug(&plug);
413 for (;;) { 508 for (;;) {
414 /* Still start < end. */ 509 /* Still start < end. */
415 error = -ENOMEM; 510 error = -ENOMEM;
416 if (!vma) 511 if (!vma)
417 goto out; 512 goto out_plug;
418 513
419 /* Here start < (end|vma->vm_end). */ 514 /* Here start < (end|vma->vm_end). */
420 if (start < vma->vm_start) { 515 if (start < vma->vm_start) {
421 unmapped_error = -ENOMEM; 516 unmapped_error = -ENOMEM;
422 start = vma->vm_start; 517 start = vma->vm_start;
423 if (start >= end) 518 if (start >= end)
424 goto out; 519 goto out_plug;
425 } 520 }
426 521
427 /* Here vma->vm_start <= start < (end|vma->vm_end) */ 522 /* Here vma->vm_start <= start < (end|vma->vm_end) */
@@ -432,18 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
432 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ 527 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
433 error = madvise_vma(vma, &prev, start, tmp, behavior); 528 error = madvise_vma(vma, &prev, start, tmp, behavior);
434 if (error) 529 if (error)
435 goto out; 530 goto out_plug;
436 start = tmp; 531 start = tmp;
437 if (prev && start < prev->vm_end) 532 if (prev && start < prev->vm_end)
438 start = prev->vm_end; 533 start = prev->vm_end;
439 error = unmapped_error; 534 error = unmapped_error;
440 if (start >= end) 535 if (start >= end)
441 goto out; 536 goto out_plug;
442 if (prev) 537 if (prev)
443 vma = prev->vm_next; 538 vma = prev->vm_next;
444 else /* madvise_remove dropped mmap_sem */ 539 else /* madvise_remove dropped mmap_sem */
445 vma = find_vma(current->mm, start); 540 vma = find_vma(current->mm, start);
446 } 541 }
542out_plug:
543 blk_finish_plug(&plug);
447out: 544out:
448 if (write) 545 if (write)
449 up_write(&current->mm->mmap_sem); 546 up_write(&current->mm->mmap_sem);
diff --git a/mm/memblock.c b/mm/memblock.c
index b8d9147e5c08..1bcd9b970564 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -92,9 +92,58 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
92 * 92 *
93 * Find @size free area aligned to @align in the specified range and node. 93 * Find @size free area aligned to @align in the specified range and node.
94 * 94 *
95 * If we have CONFIG_HAVE_MEMBLOCK_NODE_MAP defined, we need to check if the
96 * memory we found is not in hotpluggable ranges.
97 *
95 * RETURNS: 98 * RETURNS:
96 * Found address on success, %0 on failure. 99 * Found address on success, %0 on failure.
97 */ 100 */
101#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
102phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
103 phys_addr_t end, phys_addr_t size,
104 phys_addr_t align, int nid)
105{
106 phys_addr_t this_start, this_end, cand;
107 u64 i;
108 int curr = movablemem_map.nr_map - 1;
109
110 /* pump up @end */
111 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
112 end = memblock.current_limit;
113
114 /* avoid allocating the first page */
115 start = max_t(phys_addr_t, start, PAGE_SIZE);
116 end = max(start, end);
117
118 for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
119 this_start = clamp(this_start, start, end);
120 this_end = clamp(this_end, start, end);
121
122restart:
123 if (this_end <= this_start || this_end < size)
124 continue;
125
126 for (; curr >= 0; curr--) {
127 if ((movablemem_map.map[curr].start_pfn << PAGE_SHIFT)
128 < this_end)
129 break;
130 }
131
132 cand = round_down(this_end - size, align);
133 if (curr >= 0 &&
134 cand < movablemem_map.map[curr].end_pfn << PAGE_SHIFT) {
135 this_end = movablemem_map.map[curr].start_pfn
136 << PAGE_SHIFT;
137 goto restart;
138 }
139
140 if (cand >= this_start)
141 return cand;
142 }
143
144 return 0;
145}
146#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
98phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, 147phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
99 phys_addr_t end, phys_addr_t size, 148 phys_addr_t end, phys_addr_t size,
100 phys_addr_t align, int nid) 149 phys_addr_t align, int nid)
@@ -123,6 +172,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
123 } 172 }
124 return 0; 173 return 0;
125} 174}
175#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
126 176
127/** 177/**
128 * memblock_find_in_range - find free area in given range 178 * memblock_find_in_range - find free area in given range
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fbb60b103e64..53b8201b31eb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -120,6 +120,14 @@ static const char * const mem_cgroup_events_names[] = {
120 "pgmajfault", 120 "pgmajfault",
121}; 121};
122 122
123static const char * const mem_cgroup_lru_names[] = {
124 "inactive_anon",
125 "active_anon",
126 "inactive_file",
127 "active_file",
128 "unevictable",
129};
130
123/* 131/*
124 * Per memcg event counter is incremented at every pagein/pageout. With THP, 132 * Per memcg event counter is incremented at every pagein/pageout. With THP,
125 * it will be incremented by the number of pages. This counter is used for 133 * it will be incremented by the number of pages. This counter is used for
@@ -172,7 +180,7 @@ struct mem_cgroup_per_node {
172}; 180};
173 181
174struct mem_cgroup_lru_info { 182struct mem_cgroup_lru_info {
175 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 183 struct mem_cgroup_per_node *nodeinfo[0];
176}; 184};
177 185
178/* 186/*
@@ -276,17 +284,6 @@ struct mem_cgroup {
276 */ 284 */
277 struct res_counter kmem; 285 struct res_counter kmem;
278 /* 286 /*
279 * Per cgroup active and inactive list, similar to the
280 * per zone LRU lists.
281 */
282 struct mem_cgroup_lru_info info;
283 int last_scanned_node;
284#if MAX_NUMNODES > 1
285 nodemask_t scan_nodes;
286 atomic_t numainfo_events;
287 atomic_t numainfo_updating;
288#endif
289 /*
290 * Should the accounting and control be hierarchical, per subtree? 287 * Should the accounting and control be hierarchical, per subtree?
291 */ 288 */
292 bool use_hierarchy; 289 bool use_hierarchy;
@@ -349,8 +346,29 @@ struct mem_cgroup {
349 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 346 /* Index in the kmem_cache->memcg_params->memcg_caches array */
350 int kmemcg_id; 347 int kmemcg_id;
351#endif 348#endif
349
350 int last_scanned_node;
351#if MAX_NUMNODES > 1
352 nodemask_t scan_nodes;
353 atomic_t numainfo_events;
354 atomic_t numainfo_updating;
355#endif
356 /*
357 * Per cgroup active and inactive list, similar to the
358 * per zone LRU lists.
359 *
360 * WARNING: This has to be the last element of the struct. Don't
361 * add new fields after this point.
362 */
363 struct mem_cgroup_lru_info info;
352}; 364};
353 365
366static size_t memcg_size(void)
367{
368 return sizeof(struct mem_cgroup) +
369 nr_node_ids * sizeof(struct mem_cgroup_per_node);
370}
371
354/* internal only representation about the status of kmem accounting. */ 372/* internal only representation about the status of kmem accounting. */
355enum { 373enum {
356 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ 374 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
@@ -398,8 +416,8 @@ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
398 416
399/* Stuffs for move charges at task migration. */ 417/* Stuffs for move charges at task migration. */
400/* 418/*
401 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 419 * Types of charges to be moved. "move_charge_at_immitgrate" and
402 * left-shifted bitmap of these types. 420 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
403 */ 421 */
404enum move_type { 422enum move_type {
405 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 423 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
@@ -412,6 +430,7 @@ static struct move_charge_struct {
412 spinlock_t lock; /* for from, to */ 430 spinlock_t lock; /* for from, to */
413 struct mem_cgroup *from; 431 struct mem_cgroup *from;
414 struct mem_cgroup *to; 432 struct mem_cgroup *to;
433 unsigned long immigrate_flags;
415 unsigned long precharge; 434 unsigned long precharge;
416 unsigned long moved_charge; 435 unsigned long moved_charge;
417 unsigned long moved_swap; 436 unsigned long moved_swap;
@@ -424,14 +443,12 @@ static struct move_charge_struct {
424 443
425static bool move_anon(void) 444static bool move_anon(void)
426{ 445{
427 return test_bit(MOVE_CHARGE_TYPE_ANON, 446 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
428 &mc.to->move_charge_at_immigrate);
429} 447}
430 448
431static bool move_file(void) 449static bool move_file(void)
432{ 450{
433 return test_bit(MOVE_CHARGE_TYPE_FILE, 451 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
434 &mc.to->move_charge_at_immigrate);
435} 452}
436 453
437/* 454/*
@@ -471,6 +488,13 @@ enum res_type {
471#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 488#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
472#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 489#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
473 490
491/*
492 * The memcg_create_mutex will be held whenever a new cgroup is created.
493 * As a consequence, any change that needs to protect against new child cgroups
494 * appearing has to hold it as well.
495 */
496static DEFINE_MUTEX(memcg_create_mutex);
497
474static void mem_cgroup_get(struct mem_cgroup *memcg); 498static void mem_cgroup_get(struct mem_cgroup *memcg);
475static void mem_cgroup_put(struct mem_cgroup *memcg); 499static void mem_cgroup_put(struct mem_cgroup *memcg);
476 500
@@ -627,6 +651,7 @@ static void drain_all_stock_async(struct mem_cgroup *memcg);
627static struct mem_cgroup_per_zone * 651static struct mem_cgroup_per_zone *
628mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) 652mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
629{ 653{
654 VM_BUG_ON((unsigned)nid >= nr_node_ids);
630 return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; 655 return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
631} 656}
632 657
@@ -1371,17 +1396,6 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1371 return inactive * inactive_ratio < active; 1396 return inactive * inactive_ratio < active;
1372} 1397}
1373 1398
1374int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
1375{
1376 unsigned long active;
1377 unsigned long inactive;
1378
1379 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
1380 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
1381
1382 return (active > inactive);
1383}
1384
1385#define mem_cgroup_from_res_counter(counter, member) \ 1399#define mem_cgroup_from_res_counter(counter, member) \
1386 container_of(counter, struct mem_cgroup, member) 1400 container_of(counter, struct mem_cgroup, member)
1387 1401
@@ -1524,8 +1538,9 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1524 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1538 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1525} 1539}
1526 1540
1541#define K(x) ((x) << (PAGE_SHIFT-10))
1527/** 1542/**
1528 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1543 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1529 * @memcg: The memory cgroup that went over limit 1544 * @memcg: The memory cgroup that went over limit
1530 * @p: Task that is going to be killed 1545 * @p: Task that is going to be killed
1531 * 1546 *
@@ -1543,8 +1558,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1543 */ 1558 */
1544 static char memcg_name[PATH_MAX]; 1559 static char memcg_name[PATH_MAX];
1545 int ret; 1560 int ret;
1561 struct mem_cgroup *iter;
1562 unsigned int i;
1546 1563
1547 if (!memcg || !p) 1564 if (!p)
1548 return; 1565 return;
1549 1566
1550 rcu_read_lock(); 1567 rcu_read_lock();
@@ -1563,7 +1580,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1563 } 1580 }
1564 rcu_read_unlock(); 1581 rcu_read_unlock();
1565 1582
1566 printk(KERN_INFO "Task in %s killed", memcg_name); 1583 pr_info("Task in %s killed", memcg_name);
1567 1584
1568 rcu_read_lock(); 1585 rcu_read_lock();
1569 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1586 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
@@ -1576,22 +1593,45 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1576 /* 1593 /*
1577 * Continues from above, so we don't need an KERN_ level 1594 * Continues from above, so we don't need an KERN_ level
1578 */ 1595 */
1579 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1596 pr_cont(" as a result of limit of %s\n", memcg_name);
1580done: 1597done:
1581 1598
1582 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1599 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
1583 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1600 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1584 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1601 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1585 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1602 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1586 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1603 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
1587 "failcnt %llu\n",
1588 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1604 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1589 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1605 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1590 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1606 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1591 printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", 1607 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1592 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, 1608 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1593 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, 1609 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1594 res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); 1610 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1611
1612 for_each_mem_cgroup_tree(iter, memcg) {
1613 pr_info("Memory cgroup stats");
1614
1615 rcu_read_lock();
1616 ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
1617 if (!ret)
1618 pr_cont(" for %s", memcg_name);
1619 rcu_read_unlock();
1620 pr_cont(":");
1621
1622 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1623 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1624 continue;
1625 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1626 K(mem_cgroup_read_stat(iter, i)));
1627 }
1628
1629 for (i = 0; i < NR_LRU_LISTS; i++)
1630 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1631 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1632
1633 pr_cont("\n");
1634 }
1595} 1635}
1596 1636
1597/* 1637/*
@@ -2256,6 +2296,17 @@ static void drain_local_stock(struct work_struct *dummy)
2256 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2296 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2257} 2297}
2258 2298
2299static void __init memcg_stock_init(void)
2300{
2301 int cpu;
2302
2303 for_each_possible_cpu(cpu) {
2304 struct memcg_stock_pcp *stock =
2305 &per_cpu(memcg_stock, cpu);
2306 INIT_WORK(&stock->work, drain_local_stock);
2307 }
2308}
2309
2259/* 2310/*
2260 * Cache charges(val) which is from res_counter, to local per_cpu area. 2311 * Cache charges(val) which is from res_counter, to local per_cpu area.
2261 * This will be consumed by consume_stock() function, later. 2312 * This will be consumed by consume_stock() function, later.
@@ -4391,8 +4442,8 @@ void mem_cgroup_print_bad_page(struct page *page)
4391 4442
4392 pc = lookup_page_cgroup_used(page); 4443 pc = lookup_page_cgroup_used(page);
4393 if (pc) { 4444 if (pc) {
4394 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 4445 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
4395 pc, pc->flags, pc->mem_cgroup); 4446 pc, pc->flags, pc->mem_cgroup);
4396 } 4447 }
4397} 4448}
4398#endif 4449#endif
@@ -4719,6 +4770,33 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4719} 4770}
4720 4771
4721/* 4772/*
4773 * This mainly exists for tests during the setting of use_hierarchy.
4774 * Since this is the very setting we are changing, the current hierarchy value
4775 * is meaningless
4776 */
4777static inline bool __memcg_has_children(struct mem_cgroup *memcg)
4778{
4779 struct cgroup *pos;
4780
4781 /* bounce at first found */
4782 cgroup_for_each_child(pos, memcg->css.cgroup)
4783 return true;
4784 return false;
4785}
4786
4787/*
4788 * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
4789 * to be already dead (as in mem_cgroup_force_empty, for instance). This is
4790 * different from mem_cgroup_count_children(), in the sense that we don't really
4791 * care how many children we have; we only need to know if we have any. It also
4792 * counts any memcg without hierarchy as infertile.
4793 */
4794static inline bool memcg_has_children(struct mem_cgroup *memcg)
4795{
4796 return memcg->use_hierarchy && __memcg_has_children(memcg);
4797}
4798
4799/*
4722 * Reclaims as many pages from the given memcg as possible and moves 4800 * Reclaims as many pages from the given memcg as possible and moves
4723 * the rest to the parent. 4801 * the rest to the parent.
4724 * 4802 *
@@ -4788,7 +4866,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
4788 if (parent) 4866 if (parent)
4789 parent_memcg = mem_cgroup_from_cont(parent); 4867 parent_memcg = mem_cgroup_from_cont(parent);
4790 4868
4791 cgroup_lock(); 4869 mutex_lock(&memcg_create_mutex);
4792 4870
4793 if (memcg->use_hierarchy == val) 4871 if (memcg->use_hierarchy == val)
4794 goto out; 4872 goto out;
@@ -4803,7 +4881,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
4803 */ 4881 */
4804 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 4882 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
4805 (val == 1 || val == 0)) { 4883 (val == 1 || val == 0)) {
4806 if (list_empty(&cont->children)) 4884 if (!__memcg_has_children(memcg))
4807 memcg->use_hierarchy = val; 4885 memcg->use_hierarchy = val;
4808 else 4886 else
4809 retval = -EBUSY; 4887 retval = -EBUSY;
@@ -4811,7 +4889,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
4811 retval = -EINVAL; 4889 retval = -EINVAL;
4812 4890
4813out: 4891out:
4814 cgroup_unlock(); 4892 mutex_unlock(&memcg_create_mutex);
4815 4893
4816 return retval; 4894 return retval;
4817} 4895}
@@ -4896,8 +4974,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4896{ 4974{
4897 int ret = -EINVAL; 4975 int ret = -EINVAL;
4898#ifdef CONFIG_MEMCG_KMEM 4976#ifdef CONFIG_MEMCG_KMEM
4899 bool must_inc_static_branch = false;
4900
4901 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4977 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4902 /* 4978 /*
4903 * For simplicity, we won't allow this to be disabled. It also can't 4979 * For simplicity, we won't allow this to be disabled. It also can't
@@ -4910,18 +4986,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4910 * 4986 *
4911 * After it first became limited, changes in the value of the limit are 4987 * After it first became limited, changes in the value of the limit are
4912 * of course permitted. 4988 * of course permitted.
4913 *
4914 * Taking the cgroup_lock is really offensive, but it is so far the only
4915 * way to guarantee that no children will appear. There are plenty of
4916 * other offenders, and they should all go away. Fine grained locking
4917 * is probably the way to go here. When we are fully hierarchical, we
4918 * can also get rid of the use_hierarchy check.
4919 */ 4989 */
4920 cgroup_lock(); 4990 mutex_lock(&memcg_create_mutex);
4921 mutex_lock(&set_limit_mutex); 4991 mutex_lock(&set_limit_mutex);
4922 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { 4992 if (!memcg->kmem_account_flags && val != RESOURCE_MAX) {
4923 if (cgroup_task_count(cont) || (memcg->use_hierarchy && 4993 if (cgroup_task_count(cont) || memcg_has_children(memcg)) {
4924 !list_empty(&cont->children))) {
4925 ret = -EBUSY; 4994 ret = -EBUSY;
4926 goto out; 4995 goto out;
4927 } 4996 }
@@ -4933,7 +5002,13 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4933 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); 5002 res_counter_set_limit(&memcg->kmem, RESOURCE_MAX);
4934 goto out; 5003 goto out;
4935 } 5004 }
4936 must_inc_static_branch = true; 5005 static_key_slow_inc(&memcg_kmem_enabled_key);
5006 /*
5007 * setting the active bit after the inc will guarantee no one
5008 * starts accounting before all call sites are patched
5009 */
5010 memcg_kmem_set_active(memcg);
5011
4937 /* 5012 /*
4938 * kmem charges can outlive the cgroup. In the case of slab 5013 * kmem charges can outlive the cgroup. In the case of slab
4939 * pages, for instance, a page contain objects from various 5014 * pages, for instance, a page contain objects from various
@@ -4945,32 +5020,12 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
4945 ret = res_counter_set_limit(&memcg->kmem, val); 5020 ret = res_counter_set_limit(&memcg->kmem, val);
4946out: 5021out:
4947 mutex_unlock(&set_limit_mutex); 5022 mutex_unlock(&set_limit_mutex);
4948 cgroup_unlock(); 5023 mutex_unlock(&memcg_create_mutex);
4949
4950 /*
4951 * We are by now familiar with the fact that we can't inc the static
4952 * branch inside cgroup_lock. See disarm functions for details. A
4953 * worker here is overkill, but also wrong: After the limit is set, we
4954 * must start accounting right away. Since this operation can't fail,
4955 * we can safely defer it to here - no rollback will be needed.
4956 *
4957 * The boolean used to control this is also safe, because
4958 * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be
4959 * able to set it to true;
4960 */
4961 if (must_inc_static_branch) {
4962 static_key_slow_inc(&memcg_kmem_enabled_key);
4963 /*
4964 * setting the active bit after the inc will guarantee no one
4965 * starts accounting before all call sites are patched
4966 */
4967 memcg_kmem_set_active(memcg);
4968 }
4969
4970#endif 5024#endif
4971 return ret; 5025 return ret;
4972} 5026}
4973 5027
5028#ifdef CONFIG_MEMCG_KMEM
4974static int memcg_propagate_kmem(struct mem_cgroup *memcg) 5029static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4975{ 5030{
4976 int ret = 0; 5031 int ret = 0;
@@ -4979,7 +5034,6 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4979 goto out; 5034 goto out;
4980 5035
4981 memcg->kmem_account_flags = parent->kmem_account_flags; 5036 memcg->kmem_account_flags = parent->kmem_account_flags;
4982#ifdef CONFIG_MEMCG_KMEM
4983 /* 5037 /*
4984 * When that happen, we need to disable the static branch only on those 5038 * When that happen, we need to disable the static branch only on those
4985 * memcgs that enabled it. To achieve this, we would be forced to 5039 * memcgs that enabled it. To achieve this, we would be forced to
@@ -5005,10 +5059,10 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5005 mutex_lock(&set_limit_mutex); 5059 mutex_lock(&set_limit_mutex);
5006 ret = memcg_update_cache_sizes(memcg); 5060 ret = memcg_update_cache_sizes(memcg);
5007 mutex_unlock(&set_limit_mutex); 5061 mutex_unlock(&set_limit_mutex);
5008#endif
5009out: 5062out:
5010 return ret; 5063 return ret;
5011} 5064}
5065#endif /* CONFIG_MEMCG_KMEM */
5012 5066
5013/* 5067/*
5014 * The user of this function is... 5068 * The user of this function is...
@@ -5148,15 +5202,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
5148 5202
5149 if (val >= (1 << NR_MOVE_TYPE)) 5203 if (val >= (1 << NR_MOVE_TYPE))
5150 return -EINVAL; 5204 return -EINVAL;
5205
5151 /* 5206 /*
5152 * We check this value several times in both in can_attach() and 5207 * No kind of locking is needed in here, because ->can_attach() will
5153 * attach(), so we need cgroup lock to prevent this value from being 5208 * check this value once in the beginning of the process, and then carry
5154 * inconsistent. 5209 * on with stale data. This means that changes to this value will only
5210 * affect task migrations starting after the change.
5155 */ 5211 */
5156 cgroup_lock();
5157 memcg->move_charge_at_immigrate = val; 5212 memcg->move_charge_at_immigrate = val;
5158 cgroup_unlock();
5159
5160 return 0; 5213 return 0;
5161} 5214}
5162#else 5215#else
@@ -5214,14 +5267,6 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
5214} 5267}
5215#endif /* CONFIG_NUMA */ 5268#endif /* CONFIG_NUMA */
5216 5269
5217static const char * const mem_cgroup_lru_names[] = {
5218 "inactive_anon",
5219 "active_anon",
5220 "inactive_file",
5221 "active_file",
5222 "unevictable",
5223};
5224
5225static inline void mem_cgroup_lru_names_not_uptodate(void) 5270static inline void mem_cgroup_lru_names_not_uptodate(void)
5226{ 5271{
5227 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5272 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -5335,18 +5380,17 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
5335 5380
5336 parent = mem_cgroup_from_cont(cgrp->parent); 5381 parent = mem_cgroup_from_cont(cgrp->parent);
5337 5382
5338 cgroup_lock(); 5383 mutex_lock(&memcg_create_mutex);
5339 5384
5340 /* If under hierarchy, only empty-root can set this value */ 5385 /* If under hierarchy, only empty-root can set this value */
5341 if ((parent->use_hierarchy) || 5386 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5342 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 5387 mutex_unlock(&memcg_create_mutex);
5343 cgroup_unlock();
5344 return -EINVAL; 5388 return -EINVAL;
5345 } 5389 }
5346 5390
5347 memcg->swappiness = val; 5391 memcg->swappiness = val;
5348 5392
5349 cgroup_unlock(); 5393 mutex_unlock(&memcg_create_mutex);
5350 5394
5351 return 0; 5395 return 0;
5352} 5396}
@@ -5672,17 +5716,16 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
5672 5716
5673 parent = mem_cgroup_from_cont(cgrp->parent); 5717 parent = mem_cgroup_from_cont(cgrp->parent);
5674 5718
5675 cgroup_lock(); 5719 mutex_lock(&memcg_create_mutex);
5676 /* oom-kill-disable is a flag for subhierarchy. */ 5720 /* oom-kill-disable is a flag for subhierarchy. */
5677 if ((parent->use_hierarchy) || 5721 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5678 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 5722 mutex_unlock(&memcg_create_mutex);
5679 cgroup_unlock();
5680 return -EINVAL; 5723 return -EINVAL;
5681 } 5724 }
5682 memcg->oom_kill_disable = val; 5725 memcg->oom_kill_disable = val;
5683 if (!val) 5726 if (!val)
5684 memcg_oom_recover(memcg); 5727 memcg_oom_recover(memcg);
5685 cgroup_unlock(); 5728 mutex_unlock(&memcg_create_mutex);
5686 return 0; 5729 return 0;
5687} 5730}
5688 5731
@@ -5797,33 +5840,6 @@ static struct cftype mem_cgroup_files[] = {
5797 .read_seq_string = memcg_numa_stat_show, 5840 .read_seq_string = memcg_numa_stat_show,
5798 }, 5841 },
5799#endif 5842#endif
5800#ifdef CONFIG_MEMCG_SWAP
5801 {
5802 .name = "memsw.usage_in_bytes",
5803 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5804 .read = mem_cgroup_read,
5805 .register_event = mem_cgroup_usage_register_event,
5806 .unregister_event = mem_cgroup_usage_unregister_event,
5807 },
5808 {
5809 .name = "memsw.max_usage_in_bytes",
5810 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5811 .trigger = mem_cgroup_reset,
5812 .read = mem_cgroup_read,
5813 },
5814 {
5815 .name = "memsw.limit_in_bytes",
5816 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5817 .write_string = mem_cgroup_write,
5818 .read = mem_cgroup_read,
5819 },
5820 {
5821 .name = "memsw.failcnt",
5822 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5823 .trigger = mem_cgroup_reset,
5824 .read = mem_cgroup_read,
5825 },
5826#endif
5827#ifdef CONFIG_MEMCG_KMEM 5843#ifdef CONFIG_MEMCG_KMEM
5828 { 5844 {
5829 .name = "kmem.limit_in_bytes", 5845 .name = "kmem.limit_in_bytes",
@@ -5858,6 +5874,36 @@ static struct cftype mem_cgroup_files[] = {
5858 { }, /* terminate */ 5874 { }, /* terminate */
5859}; 5875};
5860 5876
5877#ifdef CONFIG_MEMCG_SWAP
5878static struct cftype memsw_cgroup_files[] = {
5879 {
5880 .name = "memsw.usage_in_bytes",
5881 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5882 .read = mem_cgroup_read,
5883 .register_event = mem_cgroup_usage_register_event,
5884 .unregister_event = mem_cgroup_usage_unregister_event,
5885 },
5886 {
5887 .name = "memsw.max_usage_in_bytes",
5888 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5889 .trigger = mem_cgroup_reset,
5890 .read = mem_cgroup_read,
5891 },
5892 {
5893 .name = "memsw.limit_in_bytes",
5894 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5895 .write_string = mem_cgroup_write,
5896 .read = mem_cgroup_read,
5897 },
5898 {
5899 .name = "memsw.failcnt",
5900 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5901 .trigger = mem_cgroup_reset,
5902 .read = mem_cgroup_read,
5903 },
5904 { }, /* terminate */
5905};
5906#endif
5861static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 5907static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5862{ 5908{
5863 struct mem_cgroup_per_node *pn; 5909 struct mem_cgroup_per_node *pn;
@@ -5896,9 +5942,9 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5896static struct mem_cgroup *mem_cgroup_alloc(void) 5942static struct mem_cgroup *mem_cgroup_alloc(void)
5897{ 5943{
5898 struct mem_cgroup *memcg; 5944 struct mem_cgroup *memcg;
5899 int size = sizeof(struct mem_cgroup); 5945 size_t size = memcg_size();
5900 5946
5901 /* Can be very big if MAX_NUMNODES is very big */ 5947 /* Can be very big if nr_node_ids is very big */
5902 if (size < PAGE_SIZE) 5948 if (size < PAGE_SIZE)
5903 memcg = kzalloc(size, GFP_KERNEL); 5949 memcg = kzalloc(size, GFP_KERNEL);
5904 else 5950 else
@@ -5935,7 +5981,7 @@ out_free:
5935static void __mem_cgroup_free(struct mem_cgroup *memcg) 5981static void __mem_cgroup_free(struct mem_cgroup *memcg)
5936{ 5982{
5937 int node; 5983 int node;
5938 int size = sizeof(struct mem_cgroup); 5984 size_t size = memcg_size();
5939 5985
5940 mem_cgroup_remove_from_trees(memcg); 5986 mem_cgroup_remove_from_trees(memcg);
5941 free_css_id(&mem_cgroup_subsys, &memcg->css); 5987 free_css_id(&mem_cgroup_subsys, &memcg->css);
@@ -6017,19 +6063,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
6017} 6063}
6018EXPORT_SYMBOL(parent_mem_cgroup); 6064EXPORT_SYMBOL(parent_mem_cgroup);
6019 6065
6020#ifdef CONFIG_MEMCG_SWAP 6066static void __init mem_cgroup_soft_limit_tree_init(void)
6021static void __init enable_swap_cgroup(void)
6022{
6023 if (!mem_cgroup_disabled() && really_do_swap_account)
6024 do_swap_account = 1;
6025}
6026#else
6027static void __init enable_swap_cgroup(void)
6028{
6029}
6030#endif
6031
6032static int mem_cgroup_soft_limit_tree_init(void)
6033{ 6067{
6034 struct mem_cgroup_tree_per_node *rtpn; 6068 struct mem_cgroup_tree_per_node *rtpn;
6035 struct mem_cgroup_tree_per_zone *rtpz; 6069 struct mem_cgroup_tree_per_zone *rtpz;
@@ -6040,8 +6074,7 @@ static int mem_cgroup_soft_limit_tree_init(void)
6040 if (!node_state(node, N_NORMAL_MEMORY)) 6074 if (!node_state(node, N_NORMAL_MEMORY))
6041 tmp = -1; 6075 tmp = -1;
6042 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 6076 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
6043 if (!rtpn) 6077 BUG_ON(!rtpn);
6044 goto err_cleanup;
6045 6078
6046 soft_limit_tree.rb_tree_per_node[node] = rtpn; 6079 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6047 6080
@@ -6051,23 +6084,12 @@ static int mem_cgroup_soft_limit_tree_init(void)
6051 spin_lock_init(&rtpz->lock); 6084 spin_lock_init(&rtpz->lock);
6052 } 6085 }
6053 } 6086 }
6054 return 0;
6055
6056err_cleanup:
6057 for_each_node(node) {
6058 if (!soft_limit_tree.rb_tree_per_node[node])
6059 break;
6060 kfree(soft_limit_tree.rb_tree_per_node[node]);
6061 soft_limit_tree.rb_tree_per_node[node] = NULL;
6062 }
6063 return 1;
6064
6065} 6087}
6066 6088
6067static struct cgroup_subsys_state * __ref 6089static struct cgroup_subsys_state * __ref
6068mem_cgroup_css_alloc(struct cgroup *cont) 6090mem_cgroup_css_alloc(struct cgroup *cont)
6069{ 6091{
6070 struct mem_cgroup *memcg, *parent; 6092 struct mem_cgroup *memcg;
6071 long error = -ENOMEM; 6093 long error = -ENOMEM;
6072 int node; 6094 int node;
6073 6095
@@ -6081,24 +6103,44 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6081 6103
6082 /* root ? */ 6104 /* root ? */
6083 if (cont->parent == NULL) { 6105 if (cont->parent == NULL) {
6084 int cpu;
6085 enable_swap_cgroup();
6086 parent = NULL;
6087 if (mem_cgroup_soft_limit_tree_init())
6088 goto free_out;
6089 root_mem_cgroup = memcg; 6106 root_mem_cgroup = memcg;
6090 for_each_possible_cpu(cpu) { 6107 res_counter_init(&memcg->res, NULL);
6091 struct memcg_stock_pcp *stock = 6108 res_counter_init(&memcg->memsw, NULL);
6092 &per_cpu(memcg_stock, cpu); 6109 res_counter_init(&memcg->kmem, NULL);
6093 INIT_WORK(&stock->work, drain_local_stock);
6094 }
6095 } else {
6096 parent = mem_cgroup_from_cont(cont->parent);
6097 memcg->use_hierarchy = parent->use_hierarchy;
6098 memcg->oom_kill_disable = parent->oom_kill_disable;
6099 } 6110 }
6100 6111
6101 if (parent && parent->use_hierarchy) { 6112 memcg->last_scanned_node = MAX_NUMNODES;
6113 INIT_LIST_HEAD(&memcg->oom_notify);
6114 atomic_set(&memcg->refcnt, 1);
6115 memcg->move_charge_at_immigrate = 0;
6116 mutex_init(&memcg->thresholds_lock);
6117 spin_lock_init(&memcg->move_lock);
6118
6119 return &memcg->css;
6120
6121free_out:
6122 __mem_cgroup_free(memcg);
6123 return ERR_PTR(error);
6124}
6125
6126static int
6127mem_cgroup_css_online(struct cgroup *cont)
6128{
6129 struct mem_cgroup *memcg, *parent;
6130 int error = 0;
6131
6132 if (!cont->parent)
6133 return 0;
6134
6135 mutex_lock(&memcg_create_mutex);
6136 memcg = mem_cgroup_from_cont(cont);
6137 parent = mem_cgroup_from_cont(cont->parent);
6138
6139 memcg->use_hierarchy = parent->use_hierarchy;
6140 memcg->oom_kill_disable = parent->oom_kill_disable;
6141 memcg->swappiness = mem_cgroup_swappiness(parent);
6142
6143 if (parent->use_hierarchy) {
6102 res_counter_init(&memcg->res, &parent->res); 6144 res_counter_init(&memcg->res, &parent->res);
6103 res_counter_init(&memcg->memsw, &parent->memsw); 6145 res_counter_init(&memcg->memsw, &parent->memsw);
6104 res_counter_init(&memcg->kmem, &parent->kmem); 6146 res_counter_init(&memcg->kmem, &parent->kmem);
@@ -6119,20 +6161,12 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6119 * much sense so let cgroup subsystem know about this 6161 * much sense so let cgroup subsystem know about this
6120 * unfortunate state in our controller. 6162 * unfortunate state in our controller.
6121 */ 6163 */
6122 if (parent && parent != root_mem_cgroup) 6164 if (parent != root_mem_cgroup)
6123 mem_cgroup_subsys.broken_hierarchy = true; 6165 mem_cgroup_subsys.broken_hierarchy = true;
6124 } 6166 }
6125 memcg->last_scanned_node = MAX_NUMNODES;
6126 INIT_LIST_HEAD(&memcg->oom_notify);
6127
6128 if (parent)
6129 memcg->swappiness = mem_cgroup_swappiness(parent);
6130 atomic_set(&memcg->refcnt, 1);
6131 memcg->move_charge_at_immigrate = 0;
6132 mutex_init(&memcg->thresholds_lock);
6133 spin_lock_init(&memcg->move_lock);
6134 6167
6135 error = memcg_init_kmem(memcg, &mem_cgroup_subsys); 6168 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
6169 mutex_unlock(&memcg_create_mutex);
6136 if (error) { 6170 if (error) {
6137 /* 6171 /*
6138 * We call put now because our (and parent's) refcnts 6172 * We call put now because our (and parent's) refcnts
@@ -6140,12 +6174,10 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6140 * call __mem_cgroup_free, so return directly 6174 * call __mem_cgroup_free, so return directly
6141 */ 6175 */
6142 mem_cgroup_put(memcg); 6176 mem_cgroup_put(memcg);
6143 return ERR_PTR(error); 6177 if (parent->use_hierarchy)
6178 mem_cgroup_put(parent);
6144 } 6179 }
6145 return &memcg->css; 6180 return error;
6146free_out:
6147 __mem_cgroup_free(memcg);
6148 return ERR_PTR(error);
6149} 6181}
6150 6182
6151static void mem_cgroup_css_offline(struct cgroup *cont) 6183static void mem_cgroup_css_offline(struct cgroup *cont)
@@ -6281,7 +6313,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6281 * Because lookup_swap_cache() updates some statistics counter, 6313 * Because lookup_swap_cache() updates some statistics counter,
6282 * we call find_get_page() with swapper_space directly. 6314 * we call find_get_page() with swapper_space directly.
6283 */ 6315 */
6284 page = find_get_page(&swapper_space, ent.val); 6316 page = find_get_page(swap_address_space(ent), ent.val);
6285 if (do_swap_account) 6317 if (do_swap_account)
6286 entry->val = ent.val; 6318 entry->val = ent.val;
6287 6319
@@ -6322,7 +6354,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
6322 swp_entry_t swap = radix_to_swp_entry(page); 6354 swp_entry_t swap = radix_to_swp_entry(page);
6323 if (do_swap_account) 6355 if (do_swap_account)
6324 *entry = swap; 6356 *entry = swap;
6325 page = find_get_page(&swapper_space, swap.val); 6357 page = find_get_page(swap_address_space(swap), swap.val);
6326 } 6358 }
6327#endif 6359#endif
6328 return page; 6360 return page;
@@ -6532,8 +6564,15 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
6532 struct task_struct *p = cgroup_taskset_first(tset); 6564 struct task_struct *p = cgroup_taskset_first(tset);
6533 int ret = 0; 6565 int ret = 0;
6534 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); 6566 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
6567 unsigned long move_charge_at_immigrate;
6535 6568
6536 if (memcg->move_charge_at_immigrate) { 6569 /*
6570 * We are now committed to this value whatever it is. Changes in this
6571 * tunable will only affect upcoming migrations, not the current one.
6572 * So we need to save it, and keep it going.
6573 */
6574 move_charge_at_immigrate = memcg->move_charge_at_immigrate;
6575 if (move_charge_at_immigrate) {
6537 struct mm_struct *mm; 6576 struct mm_struct *mm;
6538 struct mem_cgroup *from = mem_cgroup_from_task(p); 6577 struct mem_cgroup *from = mem_cgroup_from_task(p);
6539 6578
@@ -6553,6 +6592,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup,
6553 spin_lock(&mc.lock); 6592 spin_lock(&mc.lock);
6554 mc.from = from; 6593 mc.from = from;
6555 mc.to = memcg; 6594 mc.to = memcg;
6595 mc.immigrate_flags = move_charge_at_immigrate;
6556 spin_unlock(&mc.lock); 6596 spin_unlock(&mc.lock);
6557 /* We set mc.moving_task later */ 6597 /* We set mc.moving_task later */
6558 6598
@@ -6747,6 +6787,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
6747 .name = "memory", 6787 .name = "memory",
6748 .subsys_id = mem_cgroup_subsys_id, 6788 .subsys_id = mem_cgroup_subsys_id,
6749 .css_alloc = mem_cgroup_css_alloc, 6789 .css_alloc = mem_cgroup_css_alloc,
6790 .css_online = mem_cgroup_css_online,
6750 .css_offline = mem_cgroup_css_offline, 6791 .css_offline = mem_cgroup_css_offline,
6751 .css_free = mem_cgroup_css_free, 6792 .css_free = mem_cgroup_css_free,
6752 .can_attach = mem_cgroup_can_attach, 6793 .can_attach = mem_cgroup_can_attach,
@@ -6757,19 +6798,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
6757 .use_id = 1, 6798 .use_id = 1,
6758}; 6799};
6759 6800
6760/*
6761 * The rest of init is performed during ->css_alloc() for root css which
6762 * happens before initcalls. hotcpu_notifier() can't be done together as
6763 * it would introduce circular locking by adding cgroup_lock -> cpu hotplug
6764 * dependency. Do it from a subsys_initcall().
6765 */
6766static int __init mem_cgroup_init(void)
6767{
6768 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6769 return 0;
6770}
6771subsys_initcall(mem_cgroup_init);
6772
6773#ifdef CONFIG_MEMCG_SWAP 6801#ifdef CONFIG_MEMCG_SWAP
6774static int __init enable_swap_account(char *s) 6802static int __init enable_swap_account(char *s)
6775{ 6803{
@@ -6782,4 +6810,39 @@ static int __init enable_swap_account(char *s)
6782} 6810}
6783__setup("swapaccount=", enable_swap_account); 6811__setup("swapaccount=", enable_swap_account);
6784 6812
6813static void __init memsw_file_init(void)
6814{
6815 WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
6816}
6817
6818static void __init enable_swap_cgroup(void)
6819{
6820 if (!mem_cgroup_disabled() && really_do_swap_account) {
6821 do_swap_account = 1;
6822 memsw_file_init();
6823 }
6824}
6825
6826#else
6827static void __init enable_swap_cgroup(void)
6828{
6829}
6785#endif 6830#endif
6831
6832/*
6833 * subsys_initcall() for memory controller.
6834 *
6835 * Some parts like hotcpu_notifier() have to be initialized from this context
6836 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
6837 * everything that doesn't depend on a specific mem_cgroup structure should
6838 * be initialized from here.
6839 */
6840static int __init mem_cgroup_init(void)
6841{
6842 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6843 enable_swap_cgroup();
6844 mem_cgroup_soft_limit_tree_init();
6845 memcg_stock_init();
6846 return 0;
6847}
6848subsys_initcall(mem_cgroup_init);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c6e4dd3e1c08..df0694c6adef 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0;
61 61
62int sysctl_memory_failure_recovery __read_mostly = 1; 62int sysctl_memory_failure_recovery __read_mostly = 1;
63 63
64atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); 64atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
65 65
66#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) 66#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
67 67
@@ -784,12 +784,12 @@ static struct page_state {
784 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, 784 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, 785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
786 786
787 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
788 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
789
790 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, 787 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
791 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, 788 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
792 789
790 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
791 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
792
793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, 793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
794 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 794 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
795 795
@@ -1021,6 +1021,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1021 struct page *hpage; 1021 struct page *hpage;
1022 int res; 1022 int res;
1023 unsigned int nr_pages; 1023 unsigned int nr_pages;
1024 unsigned long page_flags;
1024 1025
1025 if (!sysctl_memory_failure_recovery) 1026 if (!sysctl_memory_failure_recovery)
1026 panic("Memory failure from trap %d on page %lx", trapno, pfn); 1027 panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -1039,8 +1040,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1039 return 0; 1040 return 0;
1040 } 1041 }
1041 1042
1042 nr_pages = 1 << compound_trans_order(hpage); 1043 /*
1043 atomic_long_add(nr_pages, &mce_bad_pages); 1044 * Currently errors on hugetlbfs pages are measured in hugepage units,
1045 * so nr_pages should be 1 << compound_order. OTOH when errors are on
1046 * transparent hugepages, they are supposed to be split and error
1047 * measurement is done in normal page units. So nr_pages should be one
1048 * in this case.
1049 */
1050 if (PageHuge(p))
1051 nr_pages = 1 << compound_order(hpage);
1052 else /* normal page or thp */
1053 nr_pages = 1;
1054 atomic_long_add(nr_pages, &num_poisoned_pages);
1044 1055
1045 /* 1056 /*
1046 * We need/can do nothing about count=0 pages. 1057 * We need/can do nothing about count=0 pages.
@@ -1070,7 +1081,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1070 if (!PageHWPoison(hpage) 1081 if (!PageHWPoison(hpage)
1071 || (hwpoison_filter(p) && TestClearPageHWPoison(p)) 1082 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
1072 || (p != hpage && TestSetPageHWPoison(hpage))) { 1083 || (p != hpage && TestSetPageHWPoison(hpage))) {
1073 atomic_long_sub(nr_pages, &mce_bad_pages); 1084 atomic_long_sub(nr_pages, &num_poisoned_pages);
1074 return 0; 1085 return 0;
1075 } 1086 }
1076 set_page_hwpoison_huge_page(hpage); 1087 set_page_hwpoison_huge_page(hpage);
@@ -1119,6 +1130,15 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1119 lock_page(hpage); 1130 lock_page(hpage);
1120 1131
1121 /* 1132 /*
1133 * We use page flags to determine what action should be taken, but
1134 * the flags can be modified by the error containment action. One
1135 * example is an mlocked page, where PG_mlocked is cleared by
1136 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
1137 * correctly, we save a copy of the page flags at this time.
1138 */
1139 page_flags = p->flags;
1140
1141 /*
1122 * unpoison always clear PG_hwpoison inside page lock 1142 * unpoison always clear PG_hwpoison inside page lock
1123 */ 1143 */
1124 if (!PageHWPoison(p)) { 1144 if (!PageHWPoison(p)) {
@@ -1128,7 +1148,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1128 } 1148 }
1129 if (hwpoison_filter(p)) { 1149 if (hwpoison_filter(p)) {
1130 if (TestClearPageHWPoison(p)) 1150 if (TestClearPageHWPoison(p))
1131 atomic_long_sub(nr_pages, &mce_bad_pages); 1151 atomic_long_sub(nr_pages, &num_poisoned_pages);
1132 unlock_page(hpage); 1152 unlock_page(hpage);
1133 put_page(hpage); 1153 put_page(hpage);
1134 return 0; 1154 return 0;
@@ -1176,12 +1196,19 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1176 } 1196 }
1177 1197
1178 res = -EBUSY; 1198 res = -EBUSY;
1179 for (ps = error_states;; ps++) { 1199 /*
1180 if ((p->flags & ps->mask) == ps->res) { 1200 * The first check uses the current page flags which may not have any
1181 res = page_action(ps, p, pfn); 1201 * relevant information. The second check with the saved page flagss is
1202 * carried out only if the first check can't determine the page status.
1203 */
1204 for (ps = error_states;; ps++)
1205 if ((p->flags & ps->mask) == ps->res)
1182 break; 1206 break;
1183 } 1207 if (!ps->mask)
1184 } 1208 for (ps = error_states;; ps++)
1209 if ((page_flags & ps->mask) == ps->res)
1210 break;
1211 res = page_action(ps, p, pfn);
1185out: 1212out:
1186 unlock_page(hpage); 1213 unlock_page(hpage);
1187 return res; 1214 return res;
@@ -1323,7 +1350,7 @@ int unpoison_memory(unsigned long pfn)
1323 return 0; 1350 return 0;
1324 } 1351 }
1325 if (TestClearPageHWPoison(p)) 1352 if (TestClearPageHWPoison(p))
1326 atomic_long_sub(nr_pages, &mce_bad_pages); 1353 atomic_long_sub(nr_pages, &num_poisoned_pages);
1327 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); 1354 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1328 return 0; 1355 return 0;
1329 } 1356 }
@@ -1337,7 +1364,7 @@ int unpoison_memory(unsigned long pfn)
1337 */ 1364 */
1338 if (TestClearPageHWPoison(page)) { 1365 if (TestClearPageHWPoison(page)) {
1339 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); 1366 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1340 atomic_long_sub(nr_pages, &mce_bad_pages); 1367 atomic_long_sub(nr_pages, &num_poisoned_pages);
1341 freeit = 1; 1368 freeit = 1;
1342 if (PageHuge(page)) 1369 if (PageHuge(page))
1343 clear_page_hwpoison_huge_page(page); 1370 clear_page_hwpoison_huge_page(page);
@@ -1368,7 +1395,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x)
1368 * that is not free, and 1 for any other page type. 1395 * that is not free, and 1 for any other page type.
1369 * For 1 the page is returned with increased page count, otherwise not. 1396 * For 1 the page is returned with increased page count, otherwise not.
1370 */ 1397 */
1371static int get_any_page(struct page *p, unsigned long pfn, int flags) 1398static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1372{ 1399{
1373 int ret; 1400 int ret;
1374 1401
@@ -1393,11 +1420,9 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1393 if (!get_page_unless_zero(compound_head(p))) { 1420 if (!get_page_unless_zero(compound_head(p))) {
1394 if (PageHuge(p)) { 1421 if (PageHuge(p)) {
1395 pr_info("%s: %#lx free huge page\n", __func__, pfn); 1422 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1396 ret = dequeue_hwpoisoned_huge_page(compound_head(p)); 1423 ret = 0;
1397 } else if (is_free_buddy_page(p)) { 1424 } else if (is_free_buddy_page(p)) {
1398 pr_info("%s: %#lx free buddy page\n", __func__, pfn); 1425 pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1399 /* Set hwpoison bit while page is still isolated */
1400 SetPageHWPoison(p);
1401 ret = 0; 1426 ret = 0;
1402 } else { 1427 } else {
1403 pr_info("%s: %#lx: unknown zero refcount page type %lx\n", 1428 pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
@@ -1413,43 +1438,68 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1413 return ret; 1438 return ret;
1414} 1439}
1415 1440
1441static int get_any_page(struct page *page, unsigned long pfn, int flags)
1442{
1443 int ret = __get_any_page(page, pfn, flags);
1444
1445 if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
1446 /*
1447 * Try to free it.
1448 */
1449 put_page(page);
1450 shake_page(page, 1);
1451
1452 /*
1453 * Did it turn free?
1454 */
1455 ret = __get_any_page(page, pfn, 0);
1456 if (!PageLRU(page)) {
1457 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1458 pfn, page->flags);
1459 return -EIO;
1460 }
1461 }
1462 return ret;
1463}
1464
1416static int soft_offline_huge_page(struct page *page, int flags) 1465static int soft_offline_huge_page(struct page *page, int flags)
1417{ 1466{
1418 int ret; 1467 int ret;
1419 unsigned long pfn = page_to_pfn(page); 1468 unsigned long pfn = page_to_pfn(page);
1420 struct page *hpage = compound_head(page); 1469 struct page *hpage = compound_head(page);
1421 1470
1422 ret = get_any_page(page, pfn, flags); 1471 /*
1423 if (ret < 0) 1472 * This double-check of PageHWPoison is to avoid the race with
1424 return ret; 1473 * memory_failure(). See also comment in __soft_offline_page().
1425 if (ret == 0) 1474 */
1426 goto done; 1475 lock_page(hpage);
1427
1428 if (PageHWPoison(hpage)) { 1476 if (PageHWPoison(hpage)) {
1477 unlock_page(hpage);
1429 put_page(hpage); 1478 put_page(hpage);
1430 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); 1479 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1431 return -EBUSY; 1480 return -EBUSY;
1432 } 1481 }
1482 unlock_page(hpage);
1433 1483
1434 /* Keep page count to indicate a given hugepage is isolated. */ 1484 /* Keep page count to indicate a given hugepage is isolated. */
1435 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, 1485 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL,
1436 MIGRATE_SYNC); 1486 MIGRATE_SYNC);
1437 put_page(hpage); 1487 put_page(hpage);
1438 if (ret) { 1488 if (ret) {
1439 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1489 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1440 pfn, ret, page->flags); 1490 pfn, ret, page->flags);
1441 return ret; 1491 } else {
1442 } 1492 set_page_hwpoison_huge_page(hpage);
1443done: 1493 dequeue_hwpoisoned_huge_page(hpage);
1444 if (!PageHWPoison(hpage))
1445 atomic_long_add(1 << compound_trans_order(hpage), 1494 atomic_long_add(1 << compound_trans_order(hpage),
1446 &mce_bad_pages); 1495 &num_poisoned_pages);
1447 set_page_hwpoison_huge_page(hpage); 1496 }
1448 dequeue_hwpoisoned_huge_page(hpage);
1449 /* keep elevated page count for bad page */ 1497 /* keep elevated page count for bad page */
1450 return ret; 1498 return ret;
1451} 1499}
1452 1500
1501static int __soft_offline_page(struct page *page, int flags);
1502
1453/** 1503/**
1454 * soft_offline_page - Soft offline a page. 1504 * soft_offline_page - Soft offline a page.
1455 * @page: page to offline 1505 * @page: page to offline
@@ -1478,9 +1528,11 @@ int soft_offline_page(struct page *page, int flags)
1478 unsigned long pfn = page_to_pfn(page); 1528 unsigned long pfn = page_to_pfn(page);
1479 struct page *hpage = compound_trans_head(page); 1529 struct page *hpage = compound_trans_head(page);
1480 1530
1481 if (PageHuge(page)) 1531 if (PageHWPoison(page)) {
1482 return soft_offline_huge_page(page, flags); 1532 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1483 if (PageTransHuge(hpage)) { 1533 return -EBUSY;
1534 }
1535 if (!PageHuge(page) && PageTransHuge(hpage)) {
1484 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { 1536 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1485 pr_info("soft offline: %#lx: failed to split THP\n", 1537 pr_info("soft offline: %#lx: failed to split THP\n",
1486 pfn); 1538 pfn);
@@ -1491,47 +1543,45 @@ int soft_offline_page(struct page *page, int flags)
1491 ret = get_any_page(page, pfn, flags); 1543 ret = get_any_page(page, pfn, flags);
1492 if (ret < 0) 1544 if (ret < 0)
1493 return ret; 1545 return ret;
1494 if (ret == 0) 1546 if (ret) { /* for in-use pages */
1495 goto done; 1547 if (PageHuge(page))
1496 1548 ret = soft_offline_huge_page(page, flags);
1497 /* 1549 else
1498 * Page cache page we can handle? 1550 ret = __soft_offline_page(page, flags);
1499 */ 1551 } else { /* for free pages */
1500 if (!PageLRU(page)) { 1552 if (PageHuge(page)) {
1501 /* 1553 set_page_hwpoison_huge_page(hpage);
1502 * Try to free it. 1554 dequeue_hwpoisoned_huge_page(hpage);
1503 */ 1555 atomic_long_add(1 << compound_trans_order(hpage),
1504 put_page(page); 1556 &num_poisoned_pages);
1505 shake_page(page, 1); 1557 } else {
1506 1558 SetPageHWPoison(page);
1507 /* 1559 atomic_long_inc(&num_poisoned_pages);
1508 * Did it turn free? 1560 }
1509 */
1510 ret = get_any_page(page, pfn, 0);
1511 if (ret < 0)
1512 return ret;
1513 if (ret == 0)
1514 goto done;
1515 }
1516 if (!PageLRU(page)) {
1517 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1518 pfn, page->flags);
1519 return -EIO;
1520 } 1561 }
1562 /* keep elevated page count for bad page */
1563 return ret;
1564}
1521 1565
1522 lock_page(page); 1566static int __soft_offline_page(struct page *page, int flags)
1523 wait_on_page_writeback(page); 1567{
1568 int ret;
1569 unsigned long pfn = page_to_pfn(page);
1524 1570
1525 /* 1571 /*
1526 * Synchronized using the page lock with memory_failure() 1572 * Check PageHWPoison again inside page lock because PageHWPoison
1573 * is set by memory_failure() outside page lock. Note that
1574 * memory_failure() also double-checks PageHWPoison inside page lock,
1575 * so there's no race between soft_offline_page() and memory_failure().
1527 */ 1576 */
1577 lock_page(page);
1578 wait_on_page_writeback(page);
1528 if (PageHWPoison(page)) { 1579 if (PageHWPoison(page)) {
1529 unlock_page(page); 1580 unlock_page(page);
1530 put_page(page); 1581 put_page(page);
1531 pr_info("soft offline: %#lx page already poisoned\n", pfn); 1582 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1532 return -EBUSY; 1583 return -EBUSY;
1533 } 1584 }
1534
1535 /* 1585 /*
1536 * Try to invalidate first. This should work for 1586 * Try to invalidate first. This should work for
1537 * non dirty unmapped page cache pages. 1587 * non dirty unmapped page cache pages.
@@ -1544,9 +1594,10 @@ int soft_offline_page(struct page *page, int flags)
1544 */ 1594 */
1545 if (ret == 1) { 1595 if (ret == 1) {
1546 put_page(page); 1596 put_page(page);
1547 ret = 0;
1548 pr_info("soft_offline: %#lx: invalidated\n", pfn); 1597 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1549 goto done; 1598 SetPageHWPoison(page);
1599 atomic_long_inc(&num_poisoned_pages);
1600 return 0;
1550 } 1601 }
1551 1602
1552 /* 1603 /*
@@ -1563,28 +1614,23 @@ int soft_offline_page(struct page *page, int flags)
1563 if (!ret) { 1614 if (!ret) {
1564 LIST_HEAD(pagelist); 1615 LIST_HEAD(pagelist);
1565 inc_zone_page_state(page, NR_ISOLATED_ANON + 1616 inc_zone_page_state(page, NR_ISOLATED_ANON +
1566 page_is_file_cache(page)); 1617 page_is_file_cache(page));
1567 list_add(&page->lru, &pagelist); 1618 list_add(&page->lru, &pagelist);
1568 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1619 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1569 false, MIGRATE_SYNC, 1620 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1570 MR_MEMORY_FAILURE);
1571 if (ret) { 1621 if (ret) {
1572 putback_lru_pages(&pagelist); 1622 putback_lru_pages(&pagelist);
1573 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1623 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1574 pfn, ret, page->flags); 1624 pfn, ret, page->flags);
1575 if (ret > 0) 1625 if (ret > 0)
1576 ret = -EIO; 1626 ret = -EIO;
1627 } else {
1628 SetPageHWPoison(page);
1629 atomic_long_inc(&num_poisoned_pages);
1577 } 1630 }
1578 } else { 1631 } else {
1579 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1632 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1580 pfn, ret, page_count(page), page->flags); 1633 pfn, ret, page_count(page), page->flags);
1581 } 1634 }
1582 if (ret)
1583 return ret;
1584
1585done:
1586 atomic_long_add(1, &mce_bad_pages);
1587 SetPageHWPoison(page);
1588 /* keep elevated page count for bad page */
1589 return ret; 1635 return ret;
1590} 1636}
diff --git a/mm/memory.c b/mm/memory.c
index bb1369f7b9b4..705473afc1f4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,6 +69,10 @@
69 69
70#include "internal.h" 70#include "internal.h"
71 71
72#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
73#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
74#endif
75
72#ifndef CONFIG_NEED_MULTIPLE_NODES 76#ifndef CONFIG_NEED_MULTIPLE_NODES
73/* use the per-pgdat data instead for discontigmem - mbligh */ 77/* use the per-pgdat data instead for discontigmem - mbligh */
74unsigned long max_mapnr; 78unsigned long max_mapnr;
@@ -1458,10 +1462,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1458EXPORT_SYMBOL_GPL(zap_vma_ptes); 1462EXPORT_SYMBOL_GPL(zap_vma_ptes);
1459 1463
1460/** 1464/**
1461 * follow_page - look up a page descriptor from a user-virtual address 1465 * follow_page_mask - look up a page descriptor from a user-virtual address
1462 * @vma: vm_area_struct mapping @address 1466 * @vma: vm_area_struct mapping @address
1463 * @address: virtual address to look up 1467 * @address: virtual address to look up
1464 * @flags: flags modifying lookup behaviour 1468 * @flags: flags modifying lookup behaviour
1469 * @page_mask: on output, *page_mask is set according to the size of the page
1465 * 1470 *
1466 * @flags can have FOLL_ flags set, defined in <linux/mm.h> 1471 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
1467 * 1472 *
@@ -1469,8 +1474,9 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes);
1469 * an error pointer if there is a mapping to something not represented 1474 * an error pointer if there is a mapping to something not represented
1470 * by a page descriptor (see also vm_normal_page()). 1475 * by a page descriptor (see also vm_normal_page()).
1471 */ 1476 */
1472struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1477struct page *follow_page_mask(struct vm_area_struct *vma,
1473 unsigned int flags) 1478 unsigned long address, unsigned int flags,
1479 unsigned int *page_mask)
1474{ 1480{
1475 pgd_t *pgd; 1481 pgd_t *pgd;
1476 pud_t *pud; 1482 pud_t *pud;
@@ -1480,6 +1486,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1480 struct page *page; 1486 struct page *page;
1481 struct mm_struct *mm = vma->vm_mm; 1487 struct mm_struct *mm = vma->vm_mm;
1482 1488
1489 *page_mask = 0;
1490
1483 page = follow_huge_addr(mm, address, flags & FOLL_WRITE); 1491 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1484 if (!IS_ERR(page)) { 1492 if (!IS_ERR(page)) {
1485 BUG_ON(flags & FOLL_GET); 1493 BUG_ON(flags & FOLL_GET);
@@ -1526,6 +1534,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1526 page = follow_trans_huge_pmd(vma, address, 1534 page = follow_trans_huge_pmd(vma, address,
1527 pmd, flags); 1535 pmd, flags);
1528 spin_unlock(&mm->page_table_lock); 1536 spin_unlock(&mm->page_table_lock);
1537 *page_mask = HPAGE_PMD_NR - 1;
1529 goto out; 1538 goto out;
1530 } 1539 }
1531 } else 1540 } else
@@ -1539,8 +1548,24 @@ split_fallthrough:
1539 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 1548 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1540 1549
1541 pte = *ptep; 1550 pte = *ptep;
1542 if (!pte_present(pte)) 1551 if (!pte_present(pte)) {
1543 goto no_page; 1552 swp_entry_t entry;
1553 /*
1554 * KSM's break_ksm() relies upon recognizing a ksm page
1555 * even while it is being migrated, so for that case we
1556 * need migration_entry_wait().
1557 */
1558 if (likely(!(flags & FOLL_MIGRATION)))
1559 goto no_page;
1560 if (pte_none(pte) || pte_file(pte))
1561 goto no_page;
1562 entry = pte_to_swp_entry(pte);
1563 if (!is_migration_entry(entry))
1564 goto no_page;
1565 pte_unmap_unlock(ptep, ptl);
1566 migration_entry_wait(mm, pmd, address);
1567 goto split_fallthrough;
1568 }
1544 if ((flags & FOLL_NUMA) && pte_numa(pte)) 1569 if ((flags & FOLL_NUMA) && pte_numa(pte))
1545 goto no_page; 1570 goto no_page;
1546 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1571 if ((flags & FOLL_WRITE) && !pte_write(pte))
@@ -1673,15 +1698,16 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
1673 * instead of __get_user_pages. __get_user_pages should be used only if 1698 * instead of __get_user_pages. __get_user_pages should be used only if
1674 * you need some special @gup_flags. 1699 * you need some special @gup_flags.
1675 */ 1700 */
1676int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1701long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1677 unsigned long start, int nr_pages, unsigned int gup_flags, 1702 unsigned long start, unsigned long nr_pages,
1678 struct page **pages, struct vm_area_struct **vmas, 1703 unsigned int gup_flags, struct page **pages,
1679 int *nonblocking) 1704 struct vm_area_struct **vmas, int *nonblocking)
1680{ 1705{
1681 int i; 1706 long i;
1682 unsigned long vm_flags; 1707 unsigned long vm_flags;
1708 unsigned int page_mask;
1683 1709
1684 if (nr_pages <= 0) 1710 if (!nr_pages)
1685 return 0; 1711 return 0;
1686 1712
1687 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); 1713 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
@@ -1757,6 +1783,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1757 get_page(page); 1783 get_page(page);
1758 } 1784 }
1759 pte_unmap(pte); 1785 pte_unmap(pte);
1786 page_mask = 0;
1760 goto next_page; 1787 goto next_page;
1761 } 1788 }
1762 1789
@@ -1774,6 +1801,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1774 do { 1801 do {
1775 struct page *page; 1802 struct page *page;
1776 unsigned int foll_flags = gup_flags; 1803 unsigned int foll_flags = gup_flags;
1804 unsigned int page_increm;
1777 1805
1778 /* 1806 /*
1779 * If we have a pending SIGKILL, don't keep faulting 1807 * If we have a pending SIGKILL, don't keep faulting
@@ -1783,7 +1811,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1783 return i ? i : -ERESTARTSYS; 1811 return i ? i : -ERESTARTSYS;
1784 1812
1785 cond_resched(); 1813 cond_resched();
1786 while (!(page = follow_page(vma, start, foll_flags))) { 1814 while (!(page = follow_page_mask(vma, start,
1815 foll_flags, &page_mask))) {
1787 int ret; 1816 int ret;
1788 unsigned int fault_flags = 0; 1817 unsigned int fault_flags = 0;
1789 1818
@@ -1857,13 +1886,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1857 1886
1858 flush_anon_page(vma, page, start); 1887 flush_anon_page(vma, page, start);
1859 flush_dcache_page(page); 1888 flush_dcache_page(page);
1889 page_mask = 0;
1860 } 1890 }
1861next_page: 1891next_page:
1862 if (vmas) 1892 if (vmas) {
1863 vmas[i] = vma; 1893 vmas[i] = vma;
1864 i++; 1894 page_mask = 0;
1865 start += PAGE_SIZE; 1895 }
1866 nr_pages--; 1896 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
1897 if (page_increm > nr_pages)
1898 page_increm = nr_pages;
1899 i += page_increm;
1900 start += page_increm * PAGE_SIZE;
1901 nr_pages -= page_increm;
1867 } while (nr_pages && start < vma->vm_end); 1902 } while (nr_pages && start < vma->vm_end);
1868 } while (nr_pages); 1903 } while (nr_pages);
1869 return i; 1904 return i;
@@ -1977,9 +2012,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1977 * 2012 *
1978 * See also get_user_pages_fast, for performance critical applications. 2013 * See also get_user_pages_fast, for performance critical applications.
1979 */ 2014 */
1980int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 2015long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1981 unsigned long start, int nr_pages, int write, int force, 2016 unsigned long start, unsigned long nr_pages, int write,
1982 struct page **pages, struct vm_area_struct **vmas) 2017 int force, struct page **pages, struct vm_area_struct **vmas)
1983{ 2018{
1984 int flags = FOLL_TOUCH; 2019 int flags = FOLL_TOUCH;
1985 2020
@@ -2919,7 +2954,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2919 unsigned int flags, pte_t orig_pte) 2954 unsigned int flags, pte_t orig_pte)
2920{ 2955{
2921 spinlock_t *ptl; 2956 spinlock_t *ptl;
2922 struct page *page, *swapcache = NULL; 2957 struct page *page, *swapcache;
2923 swp_entry_t entry; 2958 swp_entry_t entry;
2924 pte_t pte; 2959 pte_t pte;
2925 int locked; 2960 int locked;
@@ -2970,9 +3005,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2970 */ 3005 */
2971 ret = VM_FAULT_HWPOISON; 3006 ret = VM_FAULT_HWPOISON;
2972 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 3007 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
3008 swapcache = page;
2973 goto out_release; 3009 goto out_release;
2974 } 3010 }
2975 3011
3012 swapcache = page;
2976 locked = lock_page_or_retry(page, mm, flags); 3013 locked = lock_page_or_retry(page, mm, flags);
2977 3014
2978 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 3015 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2990,16 +3027,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2990 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 3027 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2991 goto out_page; 3028 goto out_page;
2992 3029
2993 if (ksm_might_need_to_copy(page, vma, address)) { 3030 page = ksm_might_need_to_copy(page, vma, address);
2994 swapcache = page; 3031 if (unlikely(!page)) {
2995 page = ksm_does_need_to_copy(page, vma, address); 3032 ret = VM_FAULT_OOM;
2996 3033 page = swapcache;
2997 if (unlikely(!page)) { 3034 goto out_page;
2998 ret = VM_FAULT_OOM;
2999 page = swapcache;
3000 swapcache = NULL;
3001 goto out_page;
3002 }
3003 } 3035 }
3004 3036
3005 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 3037 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -3044,7 +3076,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
3044 } 3076 }
3045 flush_icache_page(vma, page); 3077 flush_icache_page(vma, page);
3046 set_pte_at(mm, address, page_table, pte); 3078 set_pte_at(mm, address, page_table, pte);
3047 do_page_add_anon_rmap(page, vma, address, exclusive); 3079 if (page == swapcache)
3080 do_page_add_anon_rmap(page, vma, address, exclusive);
3081 else /* ksm created a completely new copy */
3082 page_add_new_anon_rmap(page, vma, address);
3048 /* It's better to call commit-charge after rmap is established */ 3083 /* It's better to call commit-charge after rmap is established */
3049 mem_cgroup_commit_charge_swapin(page, ptr); 3084 mem_cgroup_commit_charge_swapin(page, ptr);
3050 3085
@@ -3052,7 +3087,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
3052 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 3087 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
3053 try_to_free_swap(page); 3088 try_to_free_swap(page);
3054 unlock_page(page); 3089 unlock_page(page);
3055 if (swapcache) { 3090 if (page != swapcache) {
3056 /* 3091 /*
3057 * Hold the lock to avoid the swap entry to be reused 3092 * Hold the lock to avoid the swap entry to be reused
3058 * until we take the PT lock for the pte_same() check 3093 * until we take the PT lock for the pte_same() check
@@ -3085,7 +3120,7 @@ out_page:
3085 unlock_page(page); 3120 unlock_page(page);
3086out_release: 3121out_release:
3087 page_cache_release(page); 3122 page_cache_release(page);
3088 if (swapcache) { 3123 if (page != swapcache) {
3089 unlock_page(swapcache); 3124 unlock_page(swapcache);
3090 page_cache_release(swapcache); 3125 page_cache_release(swapcache);
3091 } 3126 }
@@ -3821,30 +3856,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3821} 3856}
3822#endif /* __PAGETABLE_PMD_FOLDED */ 3857#endif /* __PAGETABLE_PMD_FOLDED */
3823 3858
3824int make_pages_present(unsigned long addr, unsigned long end)
3825{
3826 int ret, len, write;
3827 struct vm_area_struct * vma;
3828
3829 vma = find_vma(current->mm, addr);
3830 if (!vma)
3831 return -ENOMEM;
3832 /*
3833 * We want to touch writable mappings with a write fault in order
3834 * to break COW, except for shared mappings because these don't COW
3835 * and we would not want to dirty them for nothing.
3836 */
3837 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3838 BUG_ON(addr >= end);
3839 BUG_ON(end > vma->vm_end);
3840 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3841 ret = get_user_pages(current, current->mm, addr,
3842 len, write, 0, NULL, NULL);
3843 if (ret < 0)
3844 return ret;
3845 return ret == len ? 0 : -EFAULT;
3846}
3847
3848#if !defined(__HAVE_ARCH_GATE_AREA) 3859#if !defined(__HAVE_ARCH_GATE_AREA)
3849 3860
3850#if defined(AT_SYSINFO_EHDR) 3861#if defined(AT_SYSINFO_EHDR)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d04ed87bfacb..b81a367b9f39 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -29,6 +29,7 @@
29#include <linux/suspend.h> 29#include <linux/suspend.h>
30#include <linux/mm_inline.h> 30#include <linux/mm_inline.h>
31#include <linux/firmware-map.h> 31#include <linux/firmware-map.h>
32#include <linux/stop_machine.h>
32 33
33#include <asm/tlbflush.h> 34#include <asm/tlbflush.h>
34 35
@@ -91,9 +92,8 @@ static void release_memory_resource(struct resource *res)
91} 92}
92 93
93#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 94#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
94#ifndef CONFIG_SPARSEMEM_VMEMMAP 95void get_page_bootmem(unsigned long info, struct page *page,
95static void get_page_bootmem(unsigned long info, struct page *page, 96 unsigned long type)
96 unsigned long type)
97{ 97{
98 page->lru.next = (struct list_head *) type; 98 page->lru.next = (struct list_head *) type;
99 SetPagePrivate(page); 99 SetPagePrivate(page);
@@ -124,10 +124,13 @@ void __ref put_page_bootmem(struct page *page)
124 mutex_lock(&ppb_lock); 124 mutex_lock(&ppb_lock);
125 __free_pages_bootmem(page, 0); 125 __free_pages_bootmem(page, 0);
126 mutex_unlock(&ppb_lock); 126 mutex_unlock(&ppb_lock);
127 totalram_pages++;
127 } 128 }
128 129
129} 130}
130 131
132#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
133#ifndef CONFIG_SPARSEMEM_VMEMMAP
131static void register_page_bootmem_info_section(unsigned long start_pfn) 134static void register_page_bootmem_info_section(unsigned long start_pfn)
132{ 135{
133 unsigned long *usemap, mapsize, section_nr, i; 136 unsigned long *usemap, mapsize, section_nr, i;
@@ -161,6 +164,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
161 get_page_bootmem(section_nr, page, MIX_SECTION_INFO); 164 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
162 165
163} 166}
167#else /* CONFIG_SPARSEMEM_VMEMMAP */
168static void register_page_bootmem_info_section(unsigned long start_pfn)
169{
170 unsigned long *usemap, mapsize, section_nr, i;
171 struct mem_section *ms;
172 struct page *page, *memmap;
173
174 if (!pfn_valid(start_pfn))
175 return;
176
177 section_nr = pfn_to_section_nr(start_pfn);
178 ms = __nr_to_section(section_nr);
179
180 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
181
182 register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
183
184 usemap = __nr_to_section(section_nr)->pageblock_flags;
185 page = virt_to_page(usemap);
186
187 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
188
189 for (i = 0; i < mapsize; i++, page++)
190 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
191}
192#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
164 193
165void register_page_bootmem_info_node(struct pglist_data *pgdat) 194void register_page_bootmem_info_node(struct pglist_data *pgdat)
166{ 195{
@@ -189,7 +218,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
189 } 218 }
190 219
191 pfn = pgdat->node_start_pfn; 220 pfn = pgdat->node_start_pfn;
192 end_pfn = pfn + pgdat->node_spanned_pages; 221 end_pfn = pgdat_end_pfn(pgdat);
193 222
194 /* register_section info */ 223 /* register_section info */
195 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 224 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
@@ -203,7 +232,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
203 register_page_bootmem_info_section(pfn); 232 register_page_bootmem_info_section(pfn);
204 } 233 }
205} 234}
206#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 235#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
207 236
208static void grow_zone_span(struct zone *zone, unsigned long start_pfn, 237static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
209 unsigned long end_pfn) 238 unsigned long end_pfn)
@@ -253,6 +282,17 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
253 set_page_links(pfn_to_page(pfn), zid, nid, pfn); 282 set_page_links(pfn_to_page(pfn), zid, nid, pfn);
254} 283}
255 284
285/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or
286 * alloc_bootmem_node_nopanic() */
287static int __ref ensure_zone_is_initialized(struct zone *zone,
288 unsigned long start_pfn, unsigned long num_pages)
289{
290 if (!zone_is_initialized(zone))
291 return init_currently_empty_zone(zone, start_pfn, num_pages,
292 MEMMAP_HOTPLUG);
293 return 0;
294}
295
256static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, 296static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
257 unsigned long start_pfn, unsigned long end_pfn) 297 unsigned long start_pfn, unsigned long end_pfn)
258{ 298{
@@ -260,17 +300,14 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
260 unsigned long flags; 300 unsigned long flags;
261 unsigned long z1_start_pfn; 301 unsigned long z1_start_pfn;
262 302
263 if (!z1->wait_table) { 303 ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn);
264 ret = init_currently_empty_zone(z1, start_pfn, 304 if (ret)
265 end_pfn - start_pfn, MEMMAP_HOTPLUG); 305 return ret;
266 if (ret)
267 return ret;
268 }
269 306
270 pgdat_resize_lock(z1->zone_pgdat, &flags); 307 pgdat_resize_lock(z1->zone_pgdat, &flags);
271 308
272 /* can't move pfns which are higher than @z2 */ 309 /* can't move pfns which are higher than @z2 */
273 if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) 310 if (end_pfn > zone_end_pfn(z2))
274 goto out_fail; 311 goto out_fail;
275 /* the move out part must be at the left most of @z2 */ 312 /* the move out part must be at the left most of @z2 */
276 if (start_pfn > z2->zone_start_pfn) 313 if (start_pfn > z2->zone_start_pfn)
@@ -286,7 +323,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
286 z1_start_pfn = start_pfn; 323 z1_start_pfn = start_pfn;
287 324
288 resize_zone(z1, z1_start_pfn, end_pfn); 325 resize_zone(z1, z1_start_pfn, end_pfn);
289 resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); 326 resize_zone(z2, end_pfn, zone_end_pfn(z2));
290 327
291 pgdat_resize_unlock(z1->zone_pgdat, &flags); 328 pgdat_resize_unlock(z1->zone_pgdat, &flags);
292 329
@@ -305,12 +342,9 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
305 unsigned long flags; 342 unsigned long flags;
306 unsigned long z2_end_pfn; 343 unsigned long z2_end_pfn;
307 344
308 if (!z2->wait_table) { 345 ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn);
309 ret = init_currently_empty_zone(z2, start_pfn, 346 if (ret)
310 end_pfn - start_pfn, MEMMAP_HOTPLUG); 347 return ret;
311 if (ret)
312 return ret;
313 }
314 348
315 pgdat_resize_lock(z1->zone_pgdat, &flags); 349 pgdat_resize_lock(z1->zone_pgdat, &flags);
316 350
@@ -318,15 +352,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
318 if (z1->zone_start_pfn > start_pfn) 352 if (z1->zone_start_pfn > start_pfn)
319 goto out_fail; 353 goto out_fail;
320 /* the move out part must be at the right most of @z1 */ 354 /* the move out part must be at the right most of @z1 */
321 if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) 355 if (zone_end_pfn(z1) > end_pfn)
322 goto out_fail; 356 goto out_fail;
323 /* must included/overlap */ 357 /* must included/overlap */
324 if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) 358 if (start_pfn >= zone_end_pfn(z1))
325 goto out_fail; 359 goto out_fail;
326 360
327 /* use end_pfn for z2's end_pfn if z2 is empty */ 361 /* use end_pfn for z2's end_pfn if z2 is empty */
328 if (z2->spanned_pages) 362 if (z2->spanned_pages)
329 z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; 363 z2_end_pfn = zone_end_pfn(z2);
330 else 364 else
331 z2_end_pfn = end_pfn; 365 z2_end_pfn = end_pfn;
332 366
@@ -363,16 +397,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
363 int nid = pgdat->node_id; 397 int nid = pgdat->node_id;
364 int zone_type; 398 int zone_type;
365 unsigned long flags; 399 unsigned long flags;
400 int ret;
366 401
367 zone_type = zone - pgdat->node_zones; 402 zone_type = zone - pgdat->node_zones;
368 if (!zone->wait_table) { 403 ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages);
369 int ret; 404 if (ret)
405 return ret;
370 406
371 ret = init_currently_empty_zone(zone, phys_start_pfn,
372 nr_pages, MEMMAP_HOTPLUG);
373 if (ret)
374 return ret;
375 }
376 pgdat_resize_lock(zone->zone_pgdat, &flags); 407 pgdat_resize_lock(zone->zone_pgdat, &flags);
377 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); 408 grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
378 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, 409 grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
@@ -405,20 +436,211 @@ static int __meminit __add_section(int nid, struct zone *zone,
405 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 436 return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
406} 437}
407 438
408#ifdef CONFIG_SPARSEMEM_VMEMMAP 439/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
409static int __remove_section(struct zone *zone, struct mem_section *ms) 440static int find_smallest_section_pfn(int nid, struct zone *zone,
441 unsigned long start_pfn,
442 unsigned long end_pfn)
443{
444 struct mem_section *ms;
445
446 for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
447 ms = __pfn_to_section(start_pfn);
448
449 if (unlikely(!valid_section(ms)))
450 continue;
451
452 if (unlikely(pfn_to_nid(start_pfn) != nid))
453 continue;
454
455 if (zone && zone != page_zone(pfn_to_page(start_pfn)))
456 continue;
457
458 return start_pfn;
459 }
460
461 return 0;
462}
463
464/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
465static int find_biggest_section_pfn(int nid, struct zone *zone,
466 unsigned long start_pfn,
467 unsigned long end_pfn)
468{
469 struct mem_section *ms;
470 unsigned long pfn;
471
472 /* pfn is the end pfn of a memory section. */
473 pfn = end_pfn - 1;
474 for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
475 ms = __pfn_to_section(pfn);
476
477 if (unlikely(!valid_section(ms)))
478 continue;
479
480 if (unlikely(pfn_to_nid(pfn) != nid))
481 continue;
482
483 if (zone && zone != page_zone(pfn_to_page(pfn)))
484 continue;
485
486 return pfn;
487 }
488
489 return 0;
490}
491
492static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
493 unsigned long end_pfn)
410{ 494{
495 unsigned long zone_start_pfn = zone->zone_start_pfn;
496 unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
497 unsigned long pfn;
498 struct mem_section *ms;
499 int nid = zone_to_nid(zone);
500
501 zone_span_writelock(zone);
502 if (zone_start_pfn == start_pfn) {
503 /*
504 * If the section is the smallest section in the zone, we need to
505 * shrink zone->zone_start_pfn and zone->spanned_pages.
506 * In this case, we find the second smallest valid mem_section
507 * for shrinking the zone.
508 */
509 pfn = find_smallest_section_pfn(nid, zone, end_pfn,
510 zone_end_pfn);
511 if (pfn) {
512 zone->zone_start_pfn = pfn;
513 zone->spanned_pages = zone_end_pfn - pfn;
514 }
515 } else if (zone_end_pfn == end_pfn) {
516 /*
517 * If the section is biggest section in the zone, it need
518 * shrink zone->spanned_pages.
519 * In this case, we find second biggest valid mem_section for
520 * shrinking zone.
521 */
522 pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
523 start_pfn);
524 if (pfn)
525 zone->spanned_pages = pfn - zone_start_pfn + 1;
526 }
527
411 /* 528 /*
412 * XXX: Freeing memmap with vmemmap is not implement yet. 529 * The section is not biggest or smallest mem_section in the zone, it
413 * This should be removed later. 530 * only creates a hole in the zone. So in this case, we need not
531 * change the zone. But perhaps, the zone has only hole data. Thus
532 * it check the zone has only hole or not.
414 */ 533 */
415 return -EBUSY; 534 pfn = zone_start_pfn;
535 for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
536 ms = __pfn_to_section(pfn);
537
538 if (unlikely(!valid_section(ms)))
539 continue;
540
541 if (page_zone(pfn_to_page(pfn)) != zone)
542 continue;
543
544 /* If the section is current section, it continues the loop */
545 if (start_pfn == pfn)
546 continue;
547
548 /* If we find valid section, we have nothing to do */
549 zone_span_writeunlock(zone);
550 return;
551 }
552
553 /* The zone has no valid section */
554 zone->zone_start_pfn = 0;
555 zone->spanned_pages = 0;
556 zone_span_writeunlock(zone);
416} 557}
417#else 558
418static int __remove_section(struct zone *zone, struct mem_section *ms) 559static void shrink_pgdat_span(struct pglist_data *pgdat,
560 unsigned long start_pfn, unsigned long end_pfn)
561{
562 unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
563 unsigned long pgdat_end_pfn =
564 pgdat->node_start_pfn + pgdat->node_spanned_pages;
565 unsigned long pfn;
566 struct mem_section *ms;
567 int nid = pgdat->node_id;
568
569 if (pgdat_start_pfn == start_pfn) {
570 /*
571 * If the section is the smallest section in the pgdat, we need to
572 * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
573 * In this case, we find the second smallest valid mem_section
574 * for shrinking the pgdat.
575 */
576 pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
577 pgdat_end_pfn);
578 if (pfn) {
579 pgdat->node_start_pfn = pfn;
580 pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
581 }
582 } else if (pgdat_end_pfn == end_pfn) {
583 /*
584 * If the section is the biggest section in the pgdat, we need to
585 * shrink pgdat->node_spanned_pages.
586 * In this case, we find the second biggest valid mem_section for
587 * shrinking the pgdat.
588 */
589 pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
590 start_pfn);
591 if (pfn)
592 pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
593 }
594
595 /*
596 * If the section is not biggest or smallest mem_section in the pgdat,
597 * it only creates a hole in the pgdat. So in this case, we need not
598 * change the pgdat.
599 * But perhaps, the pgdat has only hole data. Thus it check the pgdat
600 * has only hole or not.
601 */
602 pfn = pgdat_start_pfn;
603 for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
604 ms = __pfn_to_section(pfn);
605
606 if (unlikely(!valid_section(ms)))
607 continue;
608
609 if (pfn_to_nid(pfn) != nid)
610 continue;
611
612 /* If the section is current section, it continues the loop */
613 if (start_pfn == pfn)
614 continue;
615
616 /* If we find valid section, we have nothing to do */
617 return;
618 }
619
620 /* The pgdat has no valid section */
621 pgdat->node_start_pfn = 0;
622 pgdat->node_spanned_pages = 0;
623}
624
625static void __remove_zone(struct zone *zone, unsigned long start_pfn)
419{ 626{
420 unsigned long flags;
421 struct pglist_data *pgdat = zone->zone_pgdat; 627 struct pglist_data *pgdat = zone->zone_pgdat;
628 int nr_pages = PAGES_PER_SECTION;
629 int zone_type;
630 unsigned long flags;
631
632 zone_type = zone - pgdat->node_zones;
633
634 pgdat_resize_lock(zone->zone_pgdat, &flags);
635 shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
636 shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
637 pgdat_resize_unlock(zone->zone_pgdat, &flags);
638}
639
640static int __remove_section(struct zone *zone, struct mem_section *ms)
641{
642 unsigned long start_pfn;
643 int scn_nr;
422 int ret = -EINVAL; 644 int ret = -EINVAL;
423 645
424 if (!valid_section(ms)) 646 if (!valid_section(ms))
@@ -428,12 +650,13 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
428 if (ret) 650 if (ret)
429 return ret; 651 return ret;
430 652
431 pgdat_resize_lock(pgdat, &flags); 653 scn_nr = __section_nr(ms);
654 start_pfn = section_nr_to_pfn(scn_nr);
655 __remove_zone(zone, start_pfn);
656
432 sparse_remove_one_section(zone, ms); 657 sparse_remove_one_section(zone, ms);
433 pgdat_resize_unlock(pgdat, &flags);
434 return 0; 658 return 0;
435} 659}
436#endif
437 660
438/* 661/*
439 * Reasonably generic function for adding memory. It is 662 * Reasonably generic function for adding memory. It is
@@ -797,11 +1020,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
797 unsigned long zholes_size[MAX_NR_ZONES] = {0}; 1020 unsigned long zholes_size[MAX_NR_ZONES] = {0};
798 unsigned long start_pfn = start >> PAGE_SHIFT; 1021 unsigned long start_pfn = start >> PAGE_SHIFT;
799 1022
800 pgdat = arch_alloc_nodedata(nid); 1023 pgdat = NODE_DATA(nid);
801 if (!pgdat) 1024 if (!pgdat) {
802 return NULL; 1025 pgdat = arch_alloc_nodedata(nid);
1026 if (!pgdat)
1027 return NULL;
803 1028
804 arch_refresh_nodedata(nid, pgdat); 1029 arch_refresh_nodedata(nid, pgdat);
1030 }
805 1031
806 /* we can use NODE_DATA(nid) from here */ 1032 /* we can use NODE_DATA(nid) from here */
807 1033
@@ -854,7 +1080,8 @@ out:
854int __ref add_memory(int nid, u64 start, u64 size) 1080int __ref add_memory(int nid, u64 start, u64 size)
855{ 1081{
856 pg_data_t *pgdat = NULL; 1082 pg_data_t *pgdat = NULL;
857 int new_pgdat = 0; 1083 bool new_pgdat;
1084 bool new_node;
858 struct resource *res; 1085 struct resource *res;
859 int ret; 1086 int ret;
860 1087
@@ -865,12 +1092,16 @@ int __ref add_memory(int nid, u64 start, u64 size)
865 if (!res) 1092 if (!res)
866 goto out; 1093 goto out;
867 1094
868 if (!node_online(nid)) { 1095 { /* Stupid hack to suppress address-never-null warning */
1096 void *p = NODE_DATA(nid);
1097 new_pgdat = !p;
1098 }
1099 new_node = !node_online(nid);
1100 if (new_node) {
869 pgdat = hotadd_new_pgdat(nid, start); 1101 pgdat = hotadd_new_pgdat(nid, start);
870 ret = -ENOMEM; 1102 ret = -ENOMEM;
871 if (!pgdat) 1103 if (!pgdat)
872 goto error; 1104 goto error;
873 new_pgdat = 1;
874 } 1105 }
875 1106
876 /* call arch's memory hotadd */ 1107 /* call arch's memory hotadd */
@@ -882,7 +1113,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
882 /* we online node here. we can't roll back from here. */ 1113 /* we online node here. we can't roll back from here. */
883 node_set_online(nid); 1114 node_set_online(nid);
884 1115
885 if (new_pgdat) { 1116 if (new_node) {
886 ret = register_one_node(nid); 1117 ret = register_one_node(nid);
887 /* 1118 /*
888 * If sysfs file of new node can't create, cpu on the node 1119 * If sysfs file of new node can't create, cpu on the node
@@ -901,8 +1132,7 @@ error:
901 /* rollback pgdat allocation and others */ 1132 /* rollback pgdat allocation and others */
902 if (new_pgdat) 1133 if (new_pgdat)
903 rollback_node_hotadd(nid, pgdat); 1134 rollback_node_hotadd(nid, pgdat);
904 if (res) 1135 release_memory_resource(res);
905 release_memory_resource(res);
906 1136
907out: 1137out:
908 unlock_memory_hotplug(); 1138 unlock_memory_hotplug();
@@ -1058,8 +1288,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1058 * migrate_pages returns # of failed pages. 1288 * migrate_pages returns # of failed pages.
1059 */ 1289 */
1060 ret = migrate_pages(&source, alloc_migrate_target, 0, 1290 ret = migrate_pages(&source, alloc_migrate_target, 0,
1061 true, MIGRATE_SYNC, 1291 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1062 MR_MEMORY_HOTPLUG);
1063 if (ret) 1292 if (ret)
1064 putback_lru_pages(&source); 1293 putback_lru_pages(&source);
1065 } 1294 }
@@ -1381,17 +1610,26 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1381 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1610 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1382} 1611}
1383 1612
1384int remove_memory(u64 start, u64 size) 1613/**
1614 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
1615 * @start_pfn: start pfn of the memory range
1616 * @end_pfn: end pfn of the memory range
1617 * @arg: argument passed to func
1618 * @func: callback for each memory section walked
1619 *
1620 * This function walks through all present mem sections in range
1621 * [start_pfn, end_pfn) and call func on each mem section.
1622 *
1623 * Returns the return value of func.
1624 */
1625static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1626 void *arg, int (*func)(struct memory_block *, void *))
1385{ 1627{
1386 struct memory_block *mem = NULL; 1628 struct memory_block *mem = NULL;
1387 struct mem_section *section; 1629 struct mem_section *section;
1388 unsigned long start_pfn, end_pfn;
1389 unsigned long pfn, section_nr; 1630 unsigned long pfn, section_nr;
1390 int ret; 1631 int ret;
1391 1632
1392 start_pfn = PFN_DOWN(start);
1393 end_pfn = start_pfn + PFN_DOWN(size);
1394
1395 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 1633 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1396 section_nr = pfn_to_section_nr(pfn); 1634 section_nr = pfn_to_section_nr(pfn);
1397 if (!present_section_nr(section_nr)) 1635 if (!present_section_nr(section_nr))
@@ -1408,7 +1646,7 @@ int remove_memory(u64 start, u64 size)
1408 if (!mem) 1646 if (!mem)
1409 continue; 1647 continue;
1410 1648
1411 ret = offline_memory_block(mem); 1649 ret = func(mem, arg);
1412 if (ret) { 1650 if (ret) {
1413 kobject_put(&mem->dev.kobj); 1651 kobject_put(&mem->dev.kobj);
1414 return ret; 1652 return ret;
@@ -1420,12 +1658,209 @@ int remove_memory(u64 start, u64 size)
1420 1658
1421 return 0; 1659 return 0;
1422} 1660}
1661
1662/**
1663 * offline_memory_block_cb - callback function for offlining memory block
1664 * @mem: the memory block to be offlined
1665 * @arg: pointer to an int that receives the first non-zero error code
1666 *
1667 * Always returns 0; the first offlining error, if any, is stored via @arg.
1668 */
1669static int offline_memory_block_cb(struct memory_block *mem, void *arg)
1670{
1671 int *ret = arg;
1672 int error = offline_memory_block(mem);
1673
1674 if (error != 0 && *ret == 0)
1675 *ret = error;
1676
1677 return 0;
1678}
1679
1680static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
1681{
1682 int ret = !is_memblock_offlined(mem);
1683
1684 if (unlikely(ret))
1685 pr_warn("removing memory fails, because memory "
1686 "[%#010llx-%#010llx] is onlined\n",
1687 PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
1688 PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1);
1689
1690 return ret;
1691}
1692
1693static int check_cpu_on_node(void *data)
1694{
1695 struct pglist_data *pgdat = data;
1696 int cpu;
1697
1698 for_each_present_cpu(cpu) {
1699 if (cpu_to_node(cpu) == pgdat->node_id)
1700 /*
1701 * the cpu on this node isn't removed, and we can't
1702 * offline this node.
1703 */
1704 return -EBUSY;
1705 }
1706
1707 return 0;
1708}
1709
1710static void unmap_cpu_on_node(void *data)
1711{
1712#ifdef CONFIG_ACPI_NUMA
1713 struct pglist_data *pgdat = data;
1714 int cpu;
1715
1716 for_each_possible_cpu(cpu)
1717 if (cpu_to_node(cpu) == pgdat->node_id)
1718 numa_clear_node(cpu);
1719#endif
1720}
1721
1722static int check_and_unmap_cpu_on_node(void *data)
1723{
1724 int ret = check_cpu_on_node(data);
1725
1726 if (ret)
1727 return ret;
1728
1729 /*
1730 * the node will be offlined when we come here, so we can clear
1731 * the cpu_to_node() now.
1732 */
1733
1734 unmap_cpu_on_node(data);
1735 return 0;
1736}
1737
1738/* offline the node if all memory sections of this node are removed */
1739void try_offline_node(int nid)
1740{
1741 pg_data_t *pgdat = NODE_DATA(nid);
1742 unsigned long start_pfn = pgdat->node_start_pfn;
1743 unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
1744 unsigned long pfn;
1745 struct page *pgdat_page = virt_to_page(pgdat);
1746 int i;
1747
1748 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1749 unsigned long section_nr = pfn_to_section_nr(pfn);
1750
1751 if (!present_section_nr(section_nr))
1752 continue;
1753
1754 if (pfn_to_nid(pfn) != nid)
1755 continue;
1756
1757 /*
1758 * some memory sections of this node are not removed, and we
1759 * can't offline node now.
1760 */
1761 return;
1762 }
1763
1764 if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL))
1765 return;
1766
1767 /*
1768 * all memory/cpu of this node are removed, we can offline this
1769 * node now.
1770 */
1771 node_set_offline(nid);
1772 unregister_one_node(nid);
1773
1774 if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page))
1775 /* node data is allocated from boot memory */
1776 return;
1777
1778 /* free waittable in each zone */
1779 for (i = 0; i < MAX_NR_ZONES; i++) {
1780 struct zone *zone = pgdat->node_zones + i;
1781
1782 if (zone->wait_table)
1783 vfree(zone->wait_table);
1784 }
1785
1786 /*
1787 * Since there is no way to guarantee the address of pgdat/zone is not
1788 * on stack of any kernel threads or used by other kernel objects
1789 * without reference counting or other synchronizing method, do not
1790 * reset node_data and free pgdat here. Just reset it to 0 and reuse
1791 * the memory when the node is online again.
1792 */
1793 memset(pgdat, 0, sizeof(*pgdat));
1794}
1795EXPORT_SYMBOL(try_offline_node);
1796
1797int __ref remove_memory(int nid, u64 start, u64 size)
1798{
1799 unsigned long start_pfn, end_pfn;
1800 int ret = 0;
1801 int retry = 1;
1802
1803 start_pfn = PFN_DOWN(start);
1804 end_pfn = start_pfn + PFN_DOWN(size);
1805
1806 /*
1807 * When CONFIG_MEMCG is on, one memory block may be used by other
1808 * blocks to store page cgroup when onlining pages. But we don't know
1809 * in what order pages are onlined. So we iterate twice to offline
1810 * memory:
1811 * 1st iterate: offline every non primary memory block.
1812 * 2nd iterate: offline primary (i.e. first added) memory block.
1813 */
1814repeat:
1815 walk_memory_range(start_pfn, end_pfn, &ret,
1816 offline_memory_block_cb);
1817 if (ret) {
1818 if (!retry)
1819 return ret;
1820
1821 retry = 0;
1822 ret = 0;
1823 goto repeat;
1824 }
1825
1826 lock_memory_hotplug();
1827
1828 /*
1829 * we have offlined all memory blocks like this:
1830 * 1. lock memory hotplug
1831 * 2. offline a memory block
1832 * 3. unlock memory hotplug
1833 *
1834 * repeat step1-3 to offline the memory block. All memory blocks
1835 * must be offlined before removing memory. But we don't hold the
1836 * lock in the whole operation. So we should check whether all
1837 * memory blocks are offlined.
1838 */
1839
1840 ret = walk_memory_range(start_pfn, end_pfn, NULL,
1841 is_memblock_offlined_cb);
1842 if (ret) {
1843 unlock_memory_hotplug();
1844 return ret;
1845 }
1846
1847 /* remove memmap entry */
1848 firmware_map_remove(start, start + size, "System RAM");
1849
1850 arch_remove_memory(start, size);
1851
1852 try_offline_node(nid);
1853
1854 unlock_memory_hotplug();
1855
1856 return 0;
1857}
1423#else 1858#else
1424int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1859int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1425{ 1860{
1426 return -EINVAL; 1861 return -EINVAL;
1427} 1862}
1428int remove_memory(u64 start, u64 size) 1863int remove_memory(int nid, u64 start, u64 size)
1429{ 1864{
1430 return -EINVAL; 1865 return -EINVAL;
1431} 1866}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e2df1c1fb41f..31d26637b658 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -26,7 +26,7 @@
26 * the allocation to memory nodes instead 26 * the allocation to memory nodes instead
27 * 27 *
28 * preferred Try a specific node first before normal fallback. 28 * preferred Try a specific node first before normal fallback.
29 * As a special case node -1 here means do the allocation 29 * As a special case NUMA_NO_NODE here means do the allocation
30 * on the local CPU. This is normally identical to default, 30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default 31 * but useful to set in a VMA when you have a non default
32 * process policy. 32 * process policy.
@@ -127,7 +127,7 @@ static struct mempolicy *get_task_policy(struct task_struct *p)
127 127
128 if (!pol) { 128 if (!pol) {
129 node = numa_node_id(); 129 node = numa_node_id();
130 if (node != -1) 130 if (node != NUMA_NO_NODE)
131 pol = &preferred_node_policy[node]; 131 pol = &preferred_node_policy[node];
132 132
133 /* preferred_node_policy is not initialised early in boot */ 133 /* preferred_node_policy is not initialised early in boot */
@@ -161,19 +161,7 @@ static const struct mempolicy_operations {
161/* Check that the nodemask contains at least one populated zone */ 161/* Check that the nodemask contains at least one populated zone */
162static int is_valid_nodemask(const nodemask_t *nodemask) 162static int is_valid_nodemask(const nodemask_t *nodemask)
163{ 163{
164 int nd, k; 164 return nodes_intersects(*nodemask, node_states[N_MEMORY]);
165
166 for_each_node_mask(nd, *nodemask) {
167 struct zone *z;
168
169 for (k = 0; k <= policy_zone; k++) {
170 z = &NODE_DATA(nd)->node_zones[k];
171 if (z->present_pages > 0)
172 return 1;
173 }
174 }
175
176 return 0;
177} 165}
178 166
179static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 167static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
@@ -270,7 +258,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
270 struct mempolicy *policy; 258 struct mempolicy *policy;
271 259
272 pr_debug("setting mode %d flags %d nodes[0] %lx\n", 260 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
273 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); 261 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
274 262
275 if (mode == MPOL_DEFAULT) { 263 if (mode == MPOL_DEFAULT) {
276 if (nodes && !nodes_empty(*nodes)) 264 if (nodes && !nodes_empty(*nodes))
@@ -508,9 +496,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
508 /* 496 /*
509 * vm_normal_page() filters out zero pages, but there might 497 * vm_normal_page() filters out zero pages, but there might
510 * still be PageReserved pages to skip, perhaps in a VDSO. 498 * still be PageReserved pages to skip, perhaps in a VDSO.
511 * And we cannot move PageKsm pages sensibly or safely yet.
512 */ 499 */
513 if (PageReserved(page) || PageKsm(page)) 500 if (PageReserved(page))
514 continue; 501 continue;
515 nid = page_to_nid(page); 502 nid = page_to_nid(page);
516 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 503 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -1027,8 +1014,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1027 1014
1028 if (!list_empty(&pagelist)) { 1015 if (!list_empty(&pagelist)) {
1029 err = migrate_pages(&pagelist, new_node_page, dest, 1016 err = migrate_pages(&pagelist, new_node_page, dest,
1030 false, MIGRATE_SYNC, 1017 MIGRATE_SYNC, MR_SYSCALL);
1031 MR_SYSCALL);
1032 if (err) 1018 if (err)
1033 putback_lru_pages(&pagelist); 1019 putback_lru_pages(&pagelist);
1034 } 1020 }
@@ -1235,7 +1221,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1235 1221
1236 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", 1222 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1237 start, start + len, mode, mode_flags, 1223 start, start + len, mode, mode_flags,
1238 nmask ? nodes_addr(*nmask)[0] : -1); 1224 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1239 1225
1240 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { 1226 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1241 1227
@@ -1272,9 +1258,8 @@ static long do_mbind(unsigned long start, unsigned long len,
1272 if (!list_empty(&pagelist)) { 1258 if (!list_empty(&pagelist)) {
1273 WARN_ON_ONCE(flags & MPOL_MF_LAZY); 1259 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1274 nr_failed = migrate_pages(&pagelist, new_vma_page, 1260 nr_failed = migrate_pages(&pagelist, new_vma_page,
1275 (unsigned long)vma, 1261 (unsigned long)vma,
1276 false, MIGRATE_SYNC, 1262 MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1277 MR_MEMPOLICY_MBIND);
1278 if (nr_failed) 1263 if (nr_failed)
1279 putback_lru_pages(&pagelist); 1264 putback_lru_pages(&pagelist);
1280 } 1265 }
@@ -1644,6 +1629,26 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1644 return pol; 1629 return pol;
1645} 1630}
1646 1631
1632static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1633{
1634 enum zone_type dynamic_policy_zone = policy_zone;
1635
1636 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1637
1638 /*
1639 * if policy->v.nodes has movable memory only,
1640 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1641 *
1642 * policy->v.nodes is intersected with node_states[N_MEMORY],
1643 * so if the following test fails, it implies
1644 * policy->v.nodes has movable memory only.
1645 */
1646 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1647 dynamic_policy_zone = ZONE_MOVABLE;
1648
1649 return zone >= dynamic_policy_zone;
1650}
1651
1647/* 1652/*
1648 * Return a nodemask representing a mempolicy for filtering nodes for 1653 * Return a nodemask representing a mempolicy for filtering nodes for
1649 * page allocation 1654 * page allocation
@@ -1652,7 +1657,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1652{ 1657{
1653 /* Lower zones don't get a nodemask applied for MPOL_BIND */ 1658 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1654 if (unlikely(policy->mode == MPOL_BIND) && 1659 if (unlikely(policy->mode == MPOL_BIND) &&
1655 gfp_zone(gfp) >= policy_zone && 1660 apply_policy_zone(policy, gfp_zone(gfp)) &&
1656 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) 1661 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1657 return &policy->v.nodes; 1662 return &policy->v.nodes;
1658 1663
@@ -2308,7 +2313,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2308 * it less likely we act on an unlikely task<->page 2313 * it less likely we act on an unlikely task<->page
2309 * relation. 2314 * relation.
2310 */ 2315 */
2311 last_nid = page_xchg_last_nid(page, polnid); 2316 last_nid = page_nid_xchg_last(page, polnid);
2312 if (last_nid != polnid) 2317 if (last_nid != polnid)
2313 goto out; 2318 goto out;
2314 } 2319 }
@@ -2483,7 +2488,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
2483 vma->vm_pgoff, 2488 vma->vm_pgoff,
2484 sz, npol ? npol->mode : -1, 2489 sz, npol ? npol->mode : -1,
2485 npol ? npol->flags : -1, 2490 npol ? npol->flags : -1,
2486 npol ? nodes_addr(npol->v.nodes)[0] : -1); 2491 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2487 2492
2488 if (npol) { 2493 if (npol) {
2489 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); 2494 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
diff --git a/mm/migrate.c b/mm/migrate.c
index 2fd8b4af4744..3bbaf5d230b0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -464,7 +464,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
464 464
465 mlock_migrate_page(newpage, page); 465 mlock_migrate_page(newpage, page);
466 ksm_migrate_page(newpage, page); 466 ksm_migrate_page(newpage, page);
467 467 /*
468 * Please do not reorder this without considering how mm/ksm.c's
469 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
470 */
468 ClearPageSwapCache(page); 471 ClearPageSwapCache(page);
469 ClearPagePrivate(page); 472 ClearPagePrivate(page);
470 set_page_private(page, 0); 473 set_page_private(page, 0);
@@ -698,7 +701,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
698} 701}
699 702
700static int __unmap_and_move(struct page *page, struct page *newpage, 703static int __unmap_and_move(struct page *page, struct page *newpage,
701 int force, bool offlining, enum migrate_mode mode) 704 int force, enum migrate_mode mode)
702{ 705{
703 int rc = -EAGAIN; 706 int rc = -EAGAIN;
704 int remap_swapcache = 1; 707 int remap_swapcache = 1;
@@ -728,20 +731,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
728 lock_page(page); 731 lock_page(page);
729 } 732 }
730 733
731 /*
732 * Only memory hotplug's offline_pages() caller has locked out KSM,
733 * and can safely migrate a KSM page. The other cases have skipped
734 * PageKsm along with PageReserved - but it is only now when we have
735 * the page lock that we can be certain it will not go KSM beneath us
736 * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
737 * its pagecount raised, but only here do we take the page lock which
738 * serializes that).
739 */
740 if (PageKsm(page) && !offlining) {
741 rc = -EBUSY;
742 goto unlock;
743 }
744
745 /* charge against new page */ 734 /* charge against new page */
746 mem_cgroup_prepare_migration(page, newpage, &mem); 735 mem_cgroup_prepare_migration(page, newpage, &mem);
747 736
@@ -768,7 +757,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
768 * File Caches may use write_page() or lock_page() in migration, then, 757 * File Caches may use write_page() or lock_page() in migration, then,
769 * just care Anon page here. 758 * just care Anon page here.
770 */ 759 */
771 if (PageAnon(page)) { 760 if (PageAnon(page) && !PageKsm(page)) {
772 /* 761 /*
773 * Only page_lock_anon_vma_read() understands the subtleties of 762 * Only page_lock_anon_vma_read() understands the subtleties of
774 * getting a hold on an anon_vma from outside one of its mms. 763 * getting a hold on an anon_vma from outside one of its mms.
@@ -848,7 +837,6 @@ uncharge:
848 mem_cgroup_end_migration(mem, page, newpage, 837 mem_cgroup_end_migration(mem, page, newpage,
849 (rc == MIGRATEPAGE_SUCCESS || 838 (rc == MIGRATEPAGE_SUCCESS ||
850 rc == MIGRATEPAGE_BALLOON_SUCCESS)); 839 rc == MIGRATEPAGE_BALLOON_SUCCESS));
851unlock:
852 unlock_page(page); 840 unlock_page(page);
853out: 841out:
854 return rc; 842 return rc;
@@ -859,8 +847,7 @@ out:
859 * to the newly allocated page in newpage. 847 * to the newly allocated page in newpage.
860 */ 848 */
861static int unmap_and_move(new_page_t get_new_page, unsigned long private, 849static int unmap_and_move(new_page_t get_new_page, unsigned long private,
862 struct page *page, int force, bool offlining, 850 struct page *page, int force, enum migrate_mode mode)
863 enum migrate_mode mode)
864{ 851{
865 int rc = 0; 852 int rc = 0;
866 int *result = NULL; 853 int *result = NULL;
@@ -878,7 +865,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
878 if (unlikely(split_huge_page(page))) 865 if (unlikely(split_huge_page(page)))
879 goto out; 866 goto out;
880 867
881 rc = __unmap_and_move(page, newpage, force, offlining, mode); 868 rc = __unmap_and_move(page, newpage, force, mode);
882 869
883 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { 870 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
884 /* 871 /*
@@ -938,8 +925,7 @@ out:
938 */ 925 */
939static int unmap_and_move_huge_page(new_page_t get_new_page, 926static int unmap_and_move_huge_page(new_page_t get_new_page,
940 unsigned long private, struct page *hpage, 927 unsigned long private, struct page *hpage,
941 int force, bool offlining, 928 int force, enum migrate_mode mode)
942 enum migrate_mode mode)
943{ 929{
944 int rc = 0; 930 int rc = 0;
945 int *result = NULL; 931 int *result = NULL;
@@ -1001,9 +987,8 @@ out:
1001 * 987 *
1002 * Return: Number of pages not migrated or error code. 988 * Return: Number of pages not migrated or error code.
1003 */ 989 */
1004int migrate_pages(struct list_head *from, 990int migrate_pages(struct list_head *from, new_page_t get_new_page,
1005 new_page_t get_new_page, unsigned long private, bool offlining, 991 unsigned long private, enum migrate_mode mode, int reason)
1006 enum migrate_mode mode, int reason)
1007{ 992{
1008 int retry = 1; 993 int retry = 1;
1009 int nr_failed = 0; 994 int nr_failed = 0;
@@ -1024,8 +1009,7 @@ int migrate_pages(struct list_head *from,
1024 cond_resched(); 1009 cond_resched();
1025 1010
1026 rc = unmap_and_move(get_new_page, private, 1011 rc = unmap_and_move(get_new_page, private,
1027 page, pass > 2, offlining, 1012 page, pass > 2, mode);
1028 mode);
1029 1013
1030 switch(rc) { 1014 switch(rc) {
1031 case -ENOMEM: 1015 case -ENOMEM:
@@ -1058,15 +1042,13 @@ out:
1058} 1042}
1059 1043
1060int migrate_huge_page(struct page *hpage, new_page_t get_new_page, 1044int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1061 unsigned long private, bool offlining, 1045 unsigned long private, enum migrate_mode mode)
1062 enum migrate_mode mode)
1063{ 1046{
1064 int pass, rc; 1047 int pass, rc;
1065 1048
1066 for (pass = 0; pass < 10; pass++) { 1049 for (pass = 0; pass < 10; pass++) {
1067 rc = unmap_and_move_huge_page(get_new_page, 1050 rc = unmap_and_move_huge_page(get_new_page, private,
1068 private, hpage, pass > 2, offlining, 1051 hpage, pass > 2, mode);
1069 mode);
1070 switch (rc) { 1052 switch (rc) {
1071 case -ENOMEM: 1053 case -ENOMEM:
1072 goto out; 1054 goto out;
@@ -1152,7 +1134,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1152 goto set_status; 1134 goto set_status;
1153 1135
1154 /* Use PageReserved to check for zero page */ 1136 /* Use PageReserved to check for zero page */
1155 if (PageReserved(page) || PageKsm(page)) 1137 if (PageReserved(page))
1156 goto put_and_set; 1138 goto put_and_set;
1157 1139
1158 pp->page = page; 1140 pp->page = page;
@@ -1189,8 +1171,7 @@ set_status:
1189 err = 0; 1171 err = 0;
1190 if (!list_empty(&pagelist)) { 1172 if (!list_empty(&pagelist)) {
1191 err = migrate_pages(&pagelist, new_page_node, 1173 err = migrate_pages(&pagelist, new_page_node,
1192 (unsigned long)pm, 0, MIGRATE_SYNC, 1174 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1193 MR_SYSCALL);
1194 if (err) 1175 if (err)
1195 putback_lru_pages(&pagelist); 1176 putback_lru_pages(&pagelist);
1196 } 1177 }
@@ -1314,7 +1295,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1314 1295
1315 err = -ENOENT; 1296 err = -ENOENT;
1316 /* Use PageReserved to check for zero page */ 1297 /* Use PageReserved to check for zero page */
1317 if (!page || PageReserved(page) || PageKsm(page)) 1298 if (!page || PageReserved(page))
1318 goto set_status; 1299 goto set_status;
1319 1300
1320 err = page_to_nid(page); 1301 err = page_to_nid(page);
@@ -1461,7 +1442,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1461 * pages. Currently it only checks the watermarks which is crude 1442 * pages. Currently it only checks the watermarks which is crude
1462 */ 1443 */
1463static bool migrate_balanced_pgdat(struct pglist_data *pgdat, 1444static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1464 int nr_migrate_pages) 1445 unsigned long nr_migrate_pages)
1465{ 1446{
1466 int z; 1447 int z;
1467 for (z = pgdat->nr_zones - 1; z >= 0; z--) { 1448 for (z = pgdat->nr_zones - 1; z >= 0; z--) {
@@ -1497,7 +1478,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
1497 __GFP_NOWARN) & 1478 __GFP_NOWARN) &
1498 ~GFP_IOFS, 0); 1479 ~GFP_IOFS, 0);
1499 if (newpage) 1480 if (newpage)
1500 page_xchg_last_nid(newpage, page_last_nid(page)); 1481 page_nid_xchg_last(newpage, page_nid_last(page));
1501 1482
1502 return newpage; 1483 return newpage;
1503} 1484}
@@ -1557,39 +1538,40 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
1557 1538
1558int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) 1539int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1559{ 1540{
1560 int ret = 0; 1541 int page_lru;
1542
1543 VM_BUG_ON(compound_order(page) && !PageTransHuge(page));
1561 1544
1562 /* Avoid migrating to a node that is nearly full */ 1545 /* Avoid migrating to a node that is nearly full */
1563 if (migrate_balanced_pgdat(pgdat, 1)) { 1546 if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
1564 int page_lru; 1547 return 0;
1565 1548
1566 if (isolate_lru_page(page)) { 1549 if (isolate_lru_page(page))
1567 put_page(page); 1550 return 0;
1568 return 0;
1569 }
1570 1551
1571 /* Page is isolated */ 1552 /*
1572 ret = 1; 1553 * migrate_misplaced_transhuge_page() skips page migration's usual
1573 page_lru = page_is_file_cache(page); 1554 * check on page_count(), so we must do it here, now that the page
1574 if (!PageTransHuge(page)) 1555 * has been isolated: a GUP pin, or any other pin, prevents migration.
1575 inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); 1556 * The expected page count is 3: 1 for page's mapcount and 1 for the
1576 else 1557 * caller's pin and 1 for the reference taken by isolate_lru_page().
1577 mod_zone_page_state(page_zone(page), 1558 */
1578 NR_ISOLATED_ANON + page_lru, 1559 if (PageTransHuge(page) && page_count(page) != 3) {
1579 HPAGE_PMD_NR); 1560 putback_lru_page(page);
1561 return 0;
1580 } 1562 }
1581 1563
1564 page_lru = page_is_file_cache(page);
1565 mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru,
1566 hpage_nr_pages(page));
1567
1582 /* 1568 /*
1583 * Page is either isolated or there is not enough space on the target 1569 * Isolating the page has taken another reference, so the
1584 * node. If isolated, then it has taken a reference count and the 1570 * caller's reference can be safely dropped without the page
1585 * callers reference can be safely dropped without the page 1571 * disappearing underneath us during migration.
1586 * disappearing underneath us during migration. Otherwise the page is
1587 * not to be migrated but the callers reference should still be
1588 * dropped so it does not leak.
1589 */ 1572 */
1590 put_page(page); 1573 put_page(page);
1591 1574 return 1;
1592 return ret;
1593} 1575}
1594 1576
1595/* 1577/*
@@ -1600,7 +1582,7 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1600int migrate_misplaced_page(struct page *page, int node) 1582int migrate_misplaced_page(struct page *page, int node)
1601{ 1583{
1602 pg_data_t *pgdat = NODE_DATA(node); 1584 pg_data_t *pgdat = NODE_DATA(node);
1603 int isolated = 0; 1585 int isolated;
1604 int nr_remaining; 1586 int nr_remaining;
1605 LIST_HEAD(migratepages); 1587 LIST_HEAD(migratepages);
1606 1588
@@ -1608,42 +1590,43 @@ int migrate_misplaced_page(struct page *page, int node)
1608 * Don't migrate pages that are mapped in multiple processes. 1590 * Don't migrate pages that are mapped in multiple processes.
1609 * TODO: Handle false sharing detection instead of this hammer 1591 * TODO: Handle false sharing detection instead of this hammer
1610 */ 1592 */
1611 if (page_mapcount(page) != 1) { 1593 if (page_mapcount(page) != 1)
1612 put_page(page);
1613 goto out; 1594 goto out;
1614 }
1615 1595
1616 /* 1596 /*
1617 * Rate-limit the amount of data that is being migrated to a node. 1597 * Rate-limit the amount of data that is being migrated to a node.
1618 * Optimal placement is no good if the memory bus is saturated and 1598 * Optimal placement is no good if the memory bus is saturated and
1619 * all the time is being spent migrating! 1599 * all the time is being spent migrating!
1620 */ 1600 */
1621 if (numamigrate_update_ratelimit(pgdat, 1)) { 1601 if (numamigrate_update_ratelimit(pgdat, 1))
1622 put_page(page);
1623 goto out; 1602 goto out;
1624 }
1625 1603
1626 isolated = numamigrate_isolate_page(pgdat, page); 1604 isolated = numamigrate_isolate_page(pgdat, page);
1627 if (!isolated) 1605 if (!isolated)
1628 goto out; 1606 goto out;
1629 1607
1630 list_add(&page->lru, &migratepages); 1608 list_add(&page->lru, &migratepages);
1631 nr_remaining = migrate_pages(&migratepages, 1609 nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
1632 alloc_misplaced_dst_page, 1610 node, MIGRATE_ASYNC, MR_NUMA_MISPLACED);
1633 node, false, MIGRATE_ASYNC,
1634 MR_NUMA_MISPLACED);
1635 if (nr_remaining) { 1611 if (nr_remaining) {
1636 putback_lru_pages(&migratepages); 1612 putback_lru_pages(&migratepages);
1637 isolated = 0; 1613 isolated = 0;
1638 } else 1614 } else
1639 count_vm_numa_event(NUMA_PAGE_MIGRATE); 1615 count_vm_numa_event(NUMA_PAGE_MIGRATE);
1640 BUG_ON(!list_empty(&migratepages)); 1616 BUG_ON(!list_empty(&migratepages));
1641out:
1642 return isolated; 1617 return isolated;
1618
1619out:
1620 put_page(page);
1621 return 0;
1643} 1622}
1644#endif /* CONFIG_NUMA_BALANCING */ 1623#endif /* CONFIG_NUMA_BALANCING */
1645 1624
1646#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) 1625#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
1626/*
1627 * Migrates a THP to a given target node. page must be locked and is unlocked
1628 * before returning.
1629 */
1647int migrate_misplaced_transhuge_page(struct mm_struct *mm, 1630int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1648 struct vm_area_struct *vma, 1631 struct vm_area_struct *vma,
1649 pmd_t *pmd, pmd_t entry, 1632 pmd_t *pmd, pmd_t entry,
@@ -1674,29 +1657,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1674 1657
1675 new_page = alloc_pages_node(node, 1658 new_page = alloc_pages_node(node,
1676 (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); 1659 (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
1677 if (!new_page) { 1660 if (!new_page)
1678 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 1661 goto out_fail;
1679 goto out_dropref;
1680 }
1681 page_xchg_last_nid(new_page, page_last_nid(page));
1682 1662
1683 isolated = numamigrate_isolate_page(pgdat, page); 1663 page_nid_xchg_last(new_page, page_nid_last(page));
1684 1664
1685 /* 1665 isolated = numamigrate_isolate_page(pgdat, page);
1686 * Failing to isolate or a GUP pin prevents migration. The expected 1666 if (!isolated) {
1687 * page count is 2. 1 for anonymous pages without a mapping and 1
1688 * for the callers pin. If the page was isolated, the page will
1689 * need to be put back on the LRU.
1690 */
1691 if (!isolated || page_count(page) != 2) {
1692 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1693 put_page(new_page); 1667 put_page(new_page);
1694 if (isolated) { 1668 goto out_fail;
1695 putback_lru_page(page);
1696 isolated = 0;
1697 goto out;
1698 }
1699 goto out_keep_locked;
1700 } 1669 }
1701 1670
1702 /* Prepare a page as a migration target */ 1671 /* Prepare a page as a migration target */
@@ -1728,6 +1697,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
1728 putback_lru_page(page); 1697 putback_lru_page(page);
1729 1698
1730 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); 1699 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1700 isolated = 0;
1731 goto out; 1701 goto out;
1732 } 1702 }
1733 1703
@@ -1772,9 +1742,11 @@ out:
1772 -HPAGE_PMD_NR); 1742 -HPAGE_PMD_NR);
1773 return isolated; 1743 return isolated;
1774 1744
1745out_fail:
1746 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1775out_dropref: 1747out_dropref:
1748 unlock_page(page);
1776 put_page(page); 1749 put_page(page);
1777out_keep_locked:
1778 return 0; 1750 return 0;
1779} 1751}
1780#endif /* CONFIG_NUMA_BALANCING */ 1752#endif /* CONFIG_NUMA_BALANCING */
diff --git a/mm/mincore.c b/mm/mincore.c
index 936b4cee8cb1..da2be56a7b8f 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
75 /* shmem/tmpfs may return swap: account for swapcache page too. */ 75 /* shmem/tmpfs may return swap: account for swapcache page too. */
76 if (radix_tree_exceptional_entry(page)) { 76 if (radix_tree_exceptional_entry(page)) {
77 swp_entry_t swap = radix_to_swp_entry(page); 77 swp_entry_t swap = radix_to_swp_entry(page);
78 page = find_get_page(&swapper_space, swap.val); 78 page = find_get_page(swap_address_space(swap), swap.val);
79 } 79 }
80#endif 80#endif
81 if (page) { 81 if (page) {
@@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
135 } else { 135 } else {
136#ifdef CONFIG_SWAP 136#ifdef CONFIG_SWAP
137 pgoff = entry.val; 137 pgoff = entry.val;
138 *vec = mincore_page(&swapper_space, pgoff); 138 *vec = mincore_page(swap_address_space(entry),
139 pgoff);
139#else 140#else
140 WARN_ON(1); 141 WARN_ON(1);
141 *vec = 1; 142 *vec = 1;
diff --git a/mm/mlock.c b/mm/mlock.c
index c9bd528b01d2..e6638f565d42 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -155,13 +155,12 @@ void munlock_vma_page(struct page *page)
155 * 155 *
156 * vma->vm_mm->mmap_sem must be held for at least read. 156 * vma->vm_mm->mmap_sem must be held for at least read.
157 */ 157 */
158static long __mlock_vma_pages_range(struct vm_area_struct *vma, 158long __mlock_vma_pages_range(struct vm_area_struct *vma,
159 unsigned long start, unsigned long end, 159 unsigned long start, unsigned long end, int *nonblocking)
160 int *nonblocking)
161{ 160{
162 struct mm_struct *mm = vma->vm_mm; 161 struct mm_struct *mm = vma->vm_mm;
163 unsigned long addr = start; 162 unsigned long addr = start;
164 int nr_pages = (end - start) / PAGE_SIZE; 163 unsigned long nr_pages = (end - start) / PAGE_SIZE;
165 int gup_flags; 164 int gup_flags;
166 165
167 VM_BUG_ON(start & ~PAGE_MASK); 166 VM_BUG_ON(start & ~PAGE_MASK);
@@ -186,6 +185,10 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
186 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) 185 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
187 gup_flags |= FOLL_FORCE; 186 gup_flags |= FOLL_FORCE;
188 187
188 /*
189 * We made sure addr is within a VMA, so the following will
190 * not result in a stack expansion that recurses back here.
191 */
189 return __get_user_pages(current, mm, addr, nr_pages, gup_flags, 192 return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
190 NULL, NULL, nonblocking); 193 NULL, NULL, nonblocking);
191} 194}
@@ -202,56 +205,6 @@ static int __mlock_posix_error_return(long retval)
202 return retval; 205 return retval;
203} 206}
204 207
205/**
206 * mlock_vma_pages_range() - mlock pages in specified vma range.
207 * @vma - the vma containing the specfied address range
208 * @start - starting address in @vma to mlock
209 * @end - end address [+1] in @vma to mlock
210 *
211 * For mmap()/mremap()/expansion of mlocked vma.
212 *
213 * return 0 on success for "normal" vmas.
214 *
215 * return number of pages [> 0] to be removed from locked_vm on success
216 * of "special" vmas.
217 */
218long mlock_vma_pages_range(struct vm_area_struct *vma,
219 unsigned long start, unsigned long end)
220{
221 int nr_pages = (end - start) / PAGE_SIZE;
222 BUG_ON(!(vma->vm_flags & VM_LOCKED));
223
224 /*
225 * filter unlockable vmas
226 */
227 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
228 goto no_mlock;
229
230 if (!((vma->vm_flags & VM_DONTEXPAND) ||
231 is_vm_hugetlb_page(vma) ||
232 vma == get_gate_vma(current->mm))) {
233
234 __mlock_vma_pages_range(vma, start, end, NULL);
235
236 /* Hide errors from mmap() and other callers */
237 return 0;
238 }
239
240 /*
241 * User mapped kernel pages or huge pages:
242 * make these pages present to populate the ptes, but
243 * fall thru' to reset VM_LOCKED--no need to unlock, and
244 * return nr_pages so these don't get counted against task's
245 * locked limit. huge pages are already counted against
246 * locked vm limit.
247 */
248 make_pages_present(start, end);
249
250no_mlock:
251 vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */
252 return nr_pages; /* error or pages NOT mlocked */
253}
254
255/* 208/*
256 * munlock_vma_pages_range() - munlock all pages in the vma range. 209 * munlock_vma_pages_range() - munlock all pages in the vma range.
257 * @vma - vma containing range to be munlock()ed. 210 * @vma - vma containing range to be munlock()ed.
@@ -303,7 +256,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
303 * 256 *
304 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and 257 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
305 * munlock is a no-op. However, for some special vmas, we go ahead and 258 * munlock is a no-op. However, for some special vmas, we go ahead and
306 * populate the ptes via make_pages_present(). 259 * populate the ptes.
307 * 260 *
308 * For vmas that pass the filters, merge/split as appropriate. 261 * For vmas that pass the filters, merge/split as appropriate.
309 */ 262 */
@@ -391,9 +344,9 @@ static int do_mlock(unsigned long start, size_t len, int on)
391 344
392 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 345 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
393 346
394 newflags = vma->vm_flags | VM_LOCKED; 347 newflags = vma->vm_flags & ~VM_LOCKED;
395 if (!on) 348 if (on)
396 newflags &= ~VM_LOCKED; 349 newflags |= VM_LOCKED | VM_POPULATE;
397 350
398 tmp = vma->vm_end; 351 tmp = vma->vm_end;
399 if (tmp > end) 352 if (tmp > end)
@@ -416,13 +369,20 @@ static int do_mlock(unsigned long start, size_t len, int on)
416 return error; 369 return error;
417} 370}
418 371
419static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) 372/*
373 * __mm_populate - populate and/or mlock pages within a range of address space.
374 *
375 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
376 * flags. VMAs must be already marked with the desired vm_flags, and
377 * mmap_sem must not be held.
378 */
379int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
420{ 380{
421 struct mm_struct *mm = current->mm; 381 struct mm_struct *mm = current->mm;
422 unsigned long end, nstart, nend; 382 unsigned long end, nstart, nend;
423 struct vm_area_struct *vma = NULL; 383 struct vm_area_struct *vma = NULL;
424 int locked = 0; 384 int locked = 0;
425 int ret = 0; 385 long ret = 0;
426 386
427 VM_BUG_ON(start & ~PAGE_MASK); 387 VM_BUG_ON(start & ~PAGE_MASK);
428 VM_BUG_ON(len != PAGE_ALIGN(len)); 388 VM_BUG_ON(len != PAGE_ALIGN(len));
@@ -446,7 +406,8 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
446 * range with the first VMA. Also, skip undesirable VMA types. 406 * range with the first VMA. Also, skip undesirable VMA types.
447 */ 407 */
448 nend = min(end, vma->vm_end); 408 nend = min(end, vma->vm_end);
449 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 409 if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) !=
410 VM_POPULATE)
450 continue; 411 continue;
451 if (nstart < vma->vm_start) 412 if (nstart < vma->vm_start)
452 nstart = vma->vm_start; 413 nstart = vma->vm_start;
@@ -498,7 +459,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
498 error = do_mlock(start, len, 1); 459 error = do_mlock(start, len, 1);
499 up_write(&current->mm->mmap_sem); 460 up_write(&current->mm->mmap_sem);
500 if (!error) 461 if (!error)
501 error = do_mlock_pages(start, len, 0); 462 error = __mm_populate(start, len, 0);
502 return error; 463 return error;
503} 464}
504 465
@@ -519,18 +480,18 @@ static int do_mlockall(int flags)
519 struct vm_area_struct * vma, * prev = NULL; 480 struct vm_area_struct * vma, * prev = NULL;
520 481
521 if (flags & MCL_FUTURE) 482 if (flags & MCL_FUTURE)
522 current->mm->def_flags |= VM_LOCKED; 483 current->mm->def_flags |= VM_LOCKED | VM_POPULATE;
523 else 484 else
524 current->mm->def_flags &= ~VM_LOCKED; 485 current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE);
525 if (flags == MCL_FUTURE) 486 if (flags == MCL_FUTURE)
526 goto out; 487 goto out;
527 488
528 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { 489 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
529 vm_flags_t newflags; 490 vm_flags_t newflags;
530 491
531 newflags = vma->vm_flags | VM_LOCKED; 492 newflags = vma->vm_flags & ~VM_LOCKED;
532 if (!(flags & MCL_CURRENT)) 493 if (flags & MCL_CURRENT)
533 newflags &= ~VM_LOCKED; 494 newflags |= VM_LOCKED | VM_POPULATE;
534 495
535 /* Ignore errors */ 496 /* Ignore errors */
536 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); 497 mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
@@ -564,10 +525,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)
564 capable(CAP_IPC_LOCK)) 525 capable(CAP_IPC_LOCK))
565 ret = do_mlockall(flags); 526 ret = do_mlockall(flags);
566 up_write(&current->mm->mmap_sem); 527 up_write(&current->mm->mmap_sem);
567 if (!ret && (flags & MCL_CURRENT)) { 528 if (!ret && (flags & MCL_CURRENT))
568 /* Ignore errors */ 529 mm_populate(0, TASK_SIZE);
569 do_mlock_pages(0, TASK_SIZE, 1);
570 }
571out: 530out:
572 return ret; 531 return ret;
573} 532}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 1ffd97ae26d7..c280a02ea11e 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -69,34 +69,41 @@ void __init mminit_verify_pageflags_layout(void)
69 unsigned long or_mask, add_mask; 69 unsigned long or_mask, add_mask;
70 70
71 shift = 8 * sizeof(unsigned long); 71 shift = 8 * sizeof(unsigned long);
72 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; 72 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT;
73 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", 73 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
74 "Section %d Node %d Zone %d Flags %d\n", 74 "Section %d Node %d Zone %d Lastnid %d Flags %d\n",
75 SECTIONS_WIDTH, 75 SECTIONS_WIDTH,
76 NODES_WIDTH, 76 NODES_WIDTH,
77 ZONES_WIDTH, 77 ZONES_WIDTH,
78 LAST_NID_WIDTH,
78 NR_PAGEFLAGS); 79 NR_PAGEFLAGS);
79 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", 80 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
80 "Section %d Node %d Zone %d\n", 81 "Section %d Node %d Zone %d Lastnid %d\n",
81 SECTIONS_SHIFT, 82 SECTIONS_SHIFT,
82 NODES_SHIFT, 83 NODES_SHIFT,
83 ZONES_SHIFT); 84 ZONES_SHIFT,
84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", 85 LAST_NID_SHIFT);
85 "Section %lu Node %lu Zone %lu\n", 86 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
87 "Section %lu Node %lu Zone %lu Lastnid %lu\n",
86 (unsigned long)SECTIONS_PGSHIFT, 88 (unsigned long)SECTIONS_PGSHIFT,
87 (unsigned long)NODES_PGSHIFT, 89 (unsigned long)NODES_PGSHIFT,
88 (unsigned long)ZONES_PGSHIFT); 90 (unsigned long)ZONES_PGSHIFT,
89 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", 91 (unsigned long)LAST_NID_PGSHIFT);
90 "Zone ID: %lu -> %lu\n", 92 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
91 (unsigned long)ZONEID_PGOFF, 93 "Node/Zone ID: %lu -> %lu\n",
92 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); 94 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
95 (unsigned long)ZONEID_PGOFF);
93 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", 96 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
94 "location: %d -> %d unused %d -> %d flags %d -> %d\n", 97 "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
95 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); 98 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
96#ifdef NODE_NOT_IN_PAGE_FLAGS 99#ifdef NODE_NOT_IN_PAGE_FLAGS
97 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", 100 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
98 "Node not in page flags"); 101 "Node not in page flags");
99#endif 102#endif
103#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
104 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
105 "Last nid not in page flags");
106#endif
100 107
101 if (SECTIONS_WIDTH) { 108 if (SECTIONS_WIDTH) {
102 shift -= SECTIONS_WIDTH; 109 shift -= SECTIONS_WIDTH;
diff --git a/mm/mmap.c b/mm/mmap.c
index 09da0b264982..318e121affda 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -144,7 +144,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
144 */ 144 */
145 free -= global_page_state(NR_SHMEM); 145 free -= global_page_state(NR_SHMEM);
146 146
147 free += nr_swap_pages; 147 free += get_nr_swap_pages();
148 148
149 /* 149 /*
150 * Any slabs which are created with the 150 * Any slabs which are created with the
@@ -256,6 +256,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
256 unsigned long newbrk, oldbrk; 256 unsigned long newbrk, oldbrk;
257 struct mm_struct *mm = current->mm; 257 struct mm_struct *mm = current->mm;
258 unsigned long min_brk; 258 unsigned long min_brk;
259 bool populate;
259 260
260 down_write(&mm->mmap_sem); 261 down_write(&mm->mmap_sem);
261 262
@@ -305,8 +306,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
305 /* Ok, looks good - let it rip. */ 306 /* Ok, looks good - let it rip. */
306 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) 307 if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
307 goto out; 308 goto out;
309
308set_brk: 310set_brk:
309 mm->brk = brk; 311 mm->brk = brk;
312 populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
313 up_write(&mm->mmap_sem);
314 if (populate)
315 mm_populate(oldbrk, newbrk - oldbrk);
316 return brk;
317
310out: 318out:
311 retval = mm->brk; 319 retval = mm->brk;
312 up_write(&mm->mmap_sem); 320 up_write(&mm->mmap_sem);
@@ -801,7 +809,7 @@ again: remove_next = 1 + (end > next->vm_end);
801 anon_vma_interval_tree_post_update_vma(vma); 809 anon_vma_interval_tree_post_update_vma(vma);
802 if (adjust_next) 810 if (adjust_next)
803 anon_vma_interval_tree_post_update_vma(next); 811 anon_vma_interval_tree_post_update_vma(next);
804 anon_vma_unlock(anon_vma); 812 anon_vma_unlock_write(anon_vma);
805 } 813 }
806 if (mapping) 814 if (mapping)
807 mutex_unlock(&mapping->i_mmap_mutex); 815 mutex_unlock(&mapping->i_mmap_mutex);
@@ -1154,12 +1162,15 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
1154 1162
1155unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 1163unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1156 unsigned long len, unsigned long prot, 1164 unsigned long len, unsigned long prot,
1157 unsigned long flags, unsigned long pgoff) 1165 unsigned long flags, unsigned long pgoff,
1166 unsigned long *populate)
1158{ 1167{
1159 struct mm_struct * mm = current->mm; 1168 struct mm_struct * mm = current->mm;
1160 struct inode *inode; 1169 struct inode *inode;
1161 vm_flags_t vm_flags; 1170 vm_flags_t vm_flags;
1162 1171
1172 *populate = 0;
1173
1163 /* 1174 /*
1164 * Does the application expect PROT_READ to imply PROT_EXEC? 1175 * Does the application expect PROT_READ to imply PROT_EXEC?
1165 * 1176 *
@@ -1280,7 +1291,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1280 } 1291 }
1281 } 1292 }
1282 1293
1283 return mmap_region(file, addr, len, flags, vm_flags, pgoff); 1294 /*
1295 * Set 'VM_NORESERVE' if we should not account for the
1296 * memory use of this mapping.
1297 */
1298 if (flags & MAP_NORESERVE) {
1299 /* We honor MAP_NORESERVE if allowed to overcommit */
1300 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1301 vm_flags |= VM_NORESERVE;
1302
1303 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1304 if (file && is_file_hugepages(file))
1305 vm_flags |= VM_NORESERVE;
1306 }
1307
1308 addr = mmap_region(file, addr, len, vm_flags, pgoff);
1309 if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE))
1310 *populate = len;
1311 return addr;
1284} 1312}
1285 1313
1286SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1314SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
@@ -1395,8 +1423,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1395} 1423}
1396 1424
1397unsigned long mmap_region(struct file *file, unsigned long addr, 1425unsigned long mmap_region(struct file *file, unsigned long addr,
1398 unsigned long len, unsigned long flags, 1426 unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
1399 vm_flags_t vm_flags, unsigned long pgoff)
1400{ 1427{
1401 struct mm_struct *mm = current->mm; 1428 struct mm_struct *mm = current->mm;
1402 struct vm_area_struct *vma, *prev; 1429 struct vm_area_struct *vma, *prev;
@@ -1420,20 +1447,6 @@ munmap_back:
1420 return -ENOMEM; 1447 return -ENOMEM;
1421 1448
1422 /* 1449 /*
1423 * Set 'VM_NORESERVE' if we should not account for the
1424 * memory use of this mapping.
1425 */
1426 if ((flags & MAP_NORESERVE)) {
1427 /* We honor MAP_NORESERVE if allowed to overcommit */
1428 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1429 vm_flags |= VM_NORESERVE;
1430
1431 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1432 if (file && is_file_hugepages(file))
1433 vm_flags |= VM_NORESERVE;
1434 }
1435
1436 /*
1437 * Private writable mapping: check memory availability 1450 * Private writable mapping: check memory availability
1438 */ 1451 */
1439 if (accountable_mapping(file, vm_flags)) { 1452 if (accountable_mapping(file, vm_flags)) {
@@ -1531,10 +1544,12 @@ out:
1531 1544
1532 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1545 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1533 if (vm_flags & VM_LOCKED) { 1546 if (vm_flags & VM_LOCKED) {
1534 if (!mlock_vma_pages_range(vma, addr, addr + len)) 1547 if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
1548 vma == get_gate_vma(current->mm)))
1535 mm->locked_vm += (len >> PAGE_SHIFT); 1549 mm->locked_vm += (len >> PAGE_SHIFT);
1536 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1550 else
1537 make_pages_present(addr, addr + len); 1551 vma->vm_flags &= ~VM_LOCKED;
1552 }
1538 1553
1539 if (file) 1554 if (file)
1540 uprobe_mmap(vma); 1555 uprobe_mmap(vma);
@@ -2187,9 +2202,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
2187 return vma; 2202 return vma;
2188 if (!prev || expand_stack(prev, addr)) 2203 if (!prev || expand_stack(prev, addr))
2189 return NULL; 2204 return NULL;
2190 if (prev->vm_flags & VM_LOCKED) { 2205 if (prev->vm_flags & VM_LOCKED)
2191 mlock_vma_pages_range(prev, addr, prev->vm_end); 2206 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
2192 }
2193 return prev; 2207 return prev;
2194} 2208}
2195#else 2209#else
@@ -2215,9 +2229,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
2215 start = vma->vm_start; 2229 start = vma->vm_start;
2216 if (expand_stack(vma, addr)) 2230 if (expand_stack(vma, addr))
2217 return NULL; 2231 return NULL;
2218 if (vma->vm_flags & VM_LOCKED) { 2232 if (vma->vm_flags & VM_LOCKED)
2219 mlock_vma_pages_range(vma, addr, start); 2233 __mlock_vma_pages_range(vma, addr, start, NULL);
2220 }
2221 return vma; 2234 return vma;
2222} 2235}
2223#endif 2236#endif
@@ -2590,10 +2603,8 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2590out: 2603out:
2591 perf_event_mmap(vma); 2604 perf_event_mmap(vma);
2592 mm->total_vm += len >> PAGE_SHIFT; 2605 mm->total_vm += len >> PAGE_SHIFT;
2593 if (flags & VM_LOCKED) { 2606 if (flags & VM_LOCKED)
2594 if (!mlock_vma_pages_range(vma, addr, addr + len)) 2607 mm->locked_vm += (len >> PAGE_SHIFT);
2595 mm->locked_vm += (len >> PAGE_SHIFT);
2596 }
2597 return addr; 2608 return addr;
2598} 2609}
2599 2610
@@ -2601,10 +2612,14 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
2601{ 2612{
2602 struct mm_struct *mm = current->mm; 2613 struct mm_struct *mm = current->mm;
2603 unsigned long ret; 2614 unsigned long ret;
2615 bool populate;
2604 2616
2605 down_write(&mm->mmap_sem); 2617 down_write(&mm->mmap_sem);
2606 ret = do_brk(addr, len); 2618 ret = do_brk(addr, len);
2619 populate = ((mm->def_flags & VM_LOCKED) != 0);
2607 up_write(&mm->mmap_sem); 2620 up_write(&mm->mmap_sem);
2621 if (populate)
2622 mm_populate(addr, len);
2608 return ret; 2623 return ret;
2609} 2624}
2610EXPORT_SYMBOL(vm_brk); 2625EXPORT_SYMBOL(vm_brk);
@@ -3002,7 +3017,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
3002 if (!__test_and_clear_bit(0, (unsigned long *) 3017 if (!__test_and_clear_bit(0, (unsigned long *)
3003 &anon_vma->root->rb_root.rb_node)) 3018 &anon_vma->root->rb_root.rb_node))
3004 BUG(); 3019 BUG();
3005 anon_vma_unlock(anon_vma); 3020 anon_vma_unlock_write(anon_vma);
3006 } 3021 }
3007} 3022}
3008 3023
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8a5ac8c686b0..2175fb0d501c 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -37,49 +37,51 @@ static struct srcu_struct srcu;
37void __mmu_notifier_release(struct mm_struct *mm) 37void __mmu_notifier_release(struct mm_struct *mm)
38{ 38{
39 struct mmu_notifier *mn; 39 struct mmu_notifier *mn;
40 struct hlist_node *n;
41 int id; 40 int id;
42 41
43 /* 42 /*
44 * SRCU here will block mmu_notifier_unregister until 43 * srcu_read_lock() here will block synchronize_srcu() in
45 * ->release returns. 44 * mmu_notifier_unregister() until all registered
45 * ->release() callouts this function makes have
46 * returned.
46 */ 47 */
47 id = srcu_read_lock(&srcu); 48 id = srcu_read_lock(&srcu);
48 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
49 /*
50 * if ->release runs before mmu_notifier_unregister it
51 * must be handled as it's the only way for the driver
52 * to flush all existing sptes and stop the driver
53 * from establishing any more sptes before all the
54 * pages in the mm are freed.
55 */
56 if (mn->ops->release)
57 mn->ops->release(mn, mm);
58 srcu_read_unlock(&srcu, id);
59
60 spin_lock(&mm->mmu_notifier_mm->lock); 49 spin_lock(&mm->mmu_notifier_mm->lock);
61 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 50 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
62 mn = hlist_entry(mm->mmu_notifier_mm->list.first, 51 mn = hlist_entry(mm->mmu_notifier_mm->list.first,
63 struct mmu_notifier, 52 struct mmu_notifier,
64 hlist); 53 hlist);
54
65 /* 55 /*
66 * We arrived before mmu_notifier_unregister so 56 * Unlink. This will prevent mmu_notifier_unregister()
67 * mmu_notifier_unregister will do nothing other than 57 * from also making the ->release() callout.
68 * to wait ->release to finish and
69 * mmu_notifier_unregister to return.
70 */ 58 */
71 hlist_del_init_rcu(&mn->hlist); 59 hlist_del_init_rcu(&mn->hlist);
60 spin_unlock(&mm->mmu_notifier_mm->lock);
61
62 /*
63 * Clear sptes. (see 'release' description in mmu_notifier.h)
64 */
65 if (mn->ops->release)
66 mn->ops->release(mn, mm);
67
68 spin_lock(&mm->mmu_notifier_mm->lock);
72 } 69 }
73 spin_unlock(&mm->mmu_notifier_mm->lock); 70 spin_unlock(&mm->mmu_notifier_mm->lock);
74 71
75 /* 72 /*
76 * synchronize_srcu here prevents mmu_notifier_release to 73 * All callouts to ->release() which we have done are complete.
77 * return to exit_mmap (which would proceed freeing all pages 74 * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
78 * in the mm) until the ->release method returns, if it was 75 */
79 * invoked by mmu_notifier_unregister. 76 srcu_read_unlock(&srcu, id);
80 * 77
81 * The mmu_notifier_mm can't go away from under us because one 78 /*
82 * mm_count is hold by exit_mmap. 79 * mmu_notifier_unregister() may have unlinked a notifier and may
80 * still be calling out to it. Additionally, other notifiers
81 * may have been active via vmtruncate() et. al. Block here
82 * to ensure that all notifier callouts for this mm have been
83 * completed and the sptes are really cleaned up before returning
84 * to exit_mmap().
83 */ 85 */
84 synchronize_srcu(&srcu); 86 synchronize_srcu(&srcu);
85} 87}
@@ -170,6 +172,7 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
170 } 172 }
171 srcu_read_unlock(&srcu, id); 173 srcu_read_unlock(&srcu, id);
172} 174}
175EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
173 176
174void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, 177void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
175 unsigned long start, unsigned long end) 178 unsigned long start, unsigned long end)
@@ -185,6 +188,7 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
185 } 188 }
186 srcu_read_unlock(&srcu, id); 189 srcu_read_unlock(&srcu, id);
187} 190}
191EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
188 192
189static int do_mmu_notifier_register(struct mmu_notifier *mn, 193static int do_mmu_notifier_register(struct mmu_notifier *mn,
190 struct mm_struct *mm, 194 struct mm_struct *mm,
@@ -294,31 +298,31 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
294{ 298{
295 BUG_ON(atomic_read(&mm->mm_count) <= 0); 299 BUG_ON(atomic_read(&mm->mm_count) <= 0);
296 300
301 spin_lock(&mm->mmu_notifier_mm->lock);
297 if (!hlist_unhashed(&mn->hlist)) { 302 if (!hlist_unhashed(&mn->hlist)) {
298 /*
299 * SRCU here will force exit_mmap to wait ->release to finish
300 * before freeing the pages.
301 */
302 int id; 303 int id;
303 304
304 id = srcu_read_lock(&srcu);
305 /* 305 /*
306 * exit_mmap will block in mmu_notifier_release to 306 * Ensure we synchronize up with __mmu_notifier_release().
307 * guarantee ->release is called before freeing the
308 * pages.
309 */ 307 */
308 id = srcu_read_lock(&srcu);
309
310 hlist_del_rcu(&mn->hlist);
311 spin_unlock(&mm->mmu_notifier_mm->lock);
312
310 if (mn->ops->release) 313 if (mn->ops->release)
311 mn->ops->release(mn, mm); 314 mn->ops->release(mn, mm);
312 srcu_read_unlock(&srcu, id);
313 315
314 spin_lock(&mm->mmu_notifier_mm->lock); 316 /*
315 hlist_del_rcu(&mn->hlist); 317 * Allow __mmu_notifier_release() to complete.
318 */
319 srcu_read_unlock(&srcu, id);
320 } else
316 spin_unlock(&mm->mmu_notifier_mm->lock); 321 spin_unlock(&mm->mmu_notifier_mm->lock);
317 }
318 322
319 /* 323 /*
320 * Wait any running method to finish, of course including 324 * Wait for any running method to finish, including ->release() if it
321 * ->release if it was run by mmu_notifier_relase instead of us. 325 * was run by __mmu_notifier_release() instead of us.
322 */ 326 */
323 synchronize_srcu(&srcu); 327 synchronize_srcu(&srcu);
324 328
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 4596d81b89b1..2ac0afbd68f3 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * linux/mm/mmzone.c 2 * linux/mm/mmzone.c
3 * 3 *
4 * management codes for pgdats and zones. 4 * management codes for pgdats, zones and page flags
5 */ 5 */
6 6
7 7
@@ -96,3 +96,21 @@ void lruvec_init(struct lruvec *lruvec)
96 for_each_lru(lru) 96 for_each_lru(lru)
97 INIT_LIST_HEAD(&lruvec->lists[lru]); 97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98} 98}
99
100#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS)
101int page_nid_xchg_last(struct page *page, int nid)
102{
103 unsigned long old_flags, flags;
104 int last_nid;
105
106 do {
107 old_flags = flags = page->flags;
108 last_nid = page_nid_last(page);
109
110 flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
111 flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
112 } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
113
114 return last_nid;
115}
116#endif
diff --git a/mm/mremap.c b/mm/mremap.c
index f9766f460299..463a25705ac6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -135,7 +135,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
135 pte_unmap(new_pte - 1); 135 pte_unmap(new_pte - 1);
136 pte_unmap_unlock(old_pte - 1, old_ptl); 136 pte_unmap_unlock(old_pte - 1, old_ptl);
137 if (anon_vma) 137 if (anon_vma)
138 anon_vma_unlock(anon_vma); 138 anon_vma_unlock_write(anon_vma);
139 if (mapping) 139 if (mapping)
140 mutex_unlock(&mapping->i_mmap_mutex); 140 mutex_unlock(&mapping->i_mmap_mutex);
141} 141}
@@ -209,7 +209,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
209 209
210static unsigned long move_vma(struct vm_area_struct *vma, 210static unsigned long move_vma(struct vm_area_struct *vma,
211 unsigned long old_addr, unsigned long old_len, 211 unsigned long old_addr, unsigned long old_len,
212 unsigned long new_len, unsigned long new_addr) 212 unsigned long new_len, unsigned long new_addr, bool *locked)
213{ 213{
214 struct mm_struct *mm = vma->vm_mm; 214 struct mm_struct *mm = vma->vm_mm;
215 struct vm_area_struct *new_vma; 215 struct vm_area_struct *new_vma;
@@ -300,9 +300,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
300 300
301 if (vm_flags & VM_LOCKED) { 301 if (vm_flags & VM_LOCKED) {
302 mm->locked_vm += new_len >> PAGE_SHIFT; 302 mm->locked_vm += new_len >> PAGE_SHIFT;
303 if (new_len > old_len) 303 *locked = true;
304 mlock_vma_pages_range(new_vma, new_addr + old_len,
305 new_addr + new_len);
306 } 304 }
307 305
308 return new_addr; 306 return new_addr;
@@ -367,9 +365,8 @@ Eagain:
367 return ERR_PTR(-EAGAIN); 365 return ERR_PTR(-EAGAIN);
368} 366}
369 367
370static unsigned long mremap_to(unsigned long addr, 368static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
371 unsigned long old_len, unsigned long new_addr, 369 unsigned long new_addr, unsigned long new_len, bool *locked)
372 unsigned long new_len)
373{ 370{
374 struct mm_struct *mm = current->mm; 371 struct mm_struct *mm = current->mm;
375 struct vm_area_struct *vma; 372 struct vm_area_struct *vma;
@@ -419,7 +416,7 @@ static unsigned long mremap_to(unsigned long addr,
419 if (ret & ~PAGE_MASK) 416 if (ret & ~PAGE_MASK)
420 goto out1; 417 goto out1;
421 418
422 ret = move_vma(vma, addr, old_len, new_len, new_addr); 419 ret = move_vma(vma, addr, old_len, new_len, new_addr, locked);
423 if (!(ret & ~PAGE_MASK)) 420 if (!(ret & ~PAGE_MASK))
424 goto out; 421 goto out;
425out1: 422out1:
@@ -457,6 +454,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
457 struct vm_area_struct *vma; 454 struct vm_area_struct *vma;
458 unsigned long ret = -EINVAL; 455 unsigned long ret = -EINVAL;
459 unsigned long charged = 0; 456 unsigned long charged = 0;
457 bool locked = false;
460 458
461 down_write(&current->mm->mmap_sem); 459 down_write(&current->mm->mmap_sem);
462 460
@@ -479,7 +477,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
479 477
480 if (flags & MREMAP_FIXED) { 478 if (flags & MREMAP_FIXED) {
481 if (flags & MREMAP_MAYMOVE) 479 if (flags & MREMAP_MAYMOVE)
482 ret = mremap_to(addr, old_len, new_addr, new_len); 480 ret = mremap_to(addr, old_len, new_addr, new_len,
481 &locked);
483 goto out; 482 goto out;
484 } 483 }
485 484
@@ -521,8 +520,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
521 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 520 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
522 if (vma->vm_flags & VM_LOCKED) { 521 if (vma->vm_flags & VM_LOCKED) {
523 mm->locked_vm += pages; 522 mm->locked_vm += pages;
524 mlock_vma_pages_range(vma, addr + old_len, 523 locked = true;
525 addr + new_len); 524 new_addr = addr;
526 } 525 }
527 ret = addr; 526 ret = addr;
528 goto out; 527 goto out;
@@ -548,11 +547,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
548 goto out; 547 goto out;
549 } 548 }
550 549
551 ret = move_vma(vma, addr, old_len, new_len, new_addr); 550 ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked);
552 } 551 }
553out: 552out:
554 if (ret & ~PAGE_MASK) 553 if (ret & ~PAGE_MASK)
555 vm_unacct_memory(charged); 554 vm_unacct_memory(charged);
556 up_write(&current->mm->mmap_sem); 555 up_write(&current->mm->mmap_sem);
556 if (locked && new_len > old_len)
557 mm_populate(new_addr + old_len, new_len - old_len);
557 return ret; 558 return ret;
558} 559}
diff --git a/mm/nommu.c b/mm/nommu.c
index b20db4e22263..da0d210fd403 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -140,10 +140,10 @@ unsigned int kobjsize(const void *objp)
140 return PAGE_SIZE << compound_order(page); 140 return PAGE_SIZE << compound_order(page);
141} 141}
142 142
143int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 143long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
144 unsigned long start, int nr_pages, unsigned int foll_flags, 144 unsigned long start, unsigned long nr_pages,
145 struct page **pages, struct vm_area_struct **vmas, 145 unsigned int foll_flags, struct page **pages,
146 int *retry) 146 struct vm_area_struct **vmas, int *nonblocking)
147{ 147{
148 struct vm_area_struct *vma; 148 struct vm_area_struct *vma;
149 unsigned long vm_flags; 149 unsigned long vm_flags;
@@ -190,9 +190,10 @@ finish_or_fault:
190 * slab page or a secondary page from a compound page 190 * slab page or a secondary page from a compound page
191 * - don't permit access to VMAs that don't support it, such as I/O mappings 191 * - don't permit access to VMAs that don't support it, such as I/O mappings
192 */ 192 */
193int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 193long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
194 unsigned long start, int nr_pages, int write, int force, 194 unsigned long start, unsigned long nr_pages,
195 struct page **pages, struct vm_area_struct **vmas) 195 int write, int force, struct page **pages,
196 struct vm_area_struct **vmas)
196{ 197{
197 int flags = 0; 198 int flags = 0;
198 199
@@ -1250,7 +1251,8 @@ unsigned long do_mmap_pgoff(struct file *file,
1250 unsigned long len, 1251 unsigned long len,
1251 unsigned long prot, 1252 unsigned long prot,
1252 unsigned long flags, 1253 unsigned long flags,
1253 unsigned long pgoff) 1254 unsigned long pgoff,
1255 unsigned long *populate)
1254{ 1256{
1255 struct vm_area_struct *vma; 1257 struct vm_area_struct *vma;
1256 struct vm_region *region; 1258 struct vm_region *region;
@@ -1260,6 +1262,8 @@ unsigned long do_mmap_pgoff(struct file *file,
1260 1262
1261 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); 1263 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1262 1264
1265 *populate = 0;
1266
1263 /* decide whether we should attempt the mapping, and if so what sort of 1267 /* decide whether we should attempt the mapping, and if so what sort of
1264 * mapping */ 1268 * mapping */
1265 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, 1269 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
@@ -1815,9 +1819,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1815 return ret; 1819 return ret;
1816} 1820}
1817 1821
1818struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1822struct page *follow_page_mask(struct vm_area_struct *vma,
1819 unsigned int foll_flags) 1823 unsigned long address, unsigned int flags,
1824 unsigned int *page_mask)
1820{ 1825{
1826 *page_mask = 0;
1821 return NULL; 1827 return NULL;
1822} 1828}
1823 1829
@@ -1904,7 +1910,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1904 */ 1910 */
1905 free -= global_page_state(NR_SHMEM); 1911 free -= global_page_state(NR_SHMEM);
1906 1912
1907 free += nr_swap_pages; 1913 free += get_nr_swap_pages();
1908 1914
1909 /* 1915 /*
1910 * Any slabs which are created with the 1916 * Any slabs which are created with the
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0399f146ae49..79e451a78c9e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -386,8 +386,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
386 cpuset_print_task_mems_allowed(current); 386 cpuset_print_task_mems_allowed(current);
387 task_unlock(current); 387 task_unlock(current);
388 dump_stack(); 388 dump_stack();
389 mem_cgroup_print_oom_info(memcg, p); 389 if (memcg)
390 show_mem(SHOW_MEM_FILTER_NODES); 390 mem_cgroup_print_oom_info(memcg, p);
391 else
392 show_mem(SHOW_MEM_FILTER_NODES);
391 if (sysctl_oom_dump_tasks) 393 if (sysctl_oom_dump_tasks)
392 dump_tasks(memcg, nodemask); 394 dump_tasks(memcg, nodemask);
393} 395}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7300c9d5e1d9..cdc377c456c0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -241,6 +241,9 @@ static unsigned long global_dirtyable_memory(void)
241 if (!vm_highmem_is_dirtyable) 241 if (!vm_highmem_is_dirtyable)
242 x -= highmem_dirtyable_memory(x); 242 x -= highmem_dirtyable_memory(x);
243 243
244 /* Subtract min_free_kbytes */
245 x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10));
246
244 return x + 1; /* Ensure that we never return 0 */ 247 return x + 1; /* Ensure that we never return 0 */
245} 248}
246 249
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d1107adf174a..e9075fdef695 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -202,11 +202,18 @@ static unsigned long __meminitdata nr_all_pages;
202static unsigned long __meminitdata dma_reserve; 202static unsigned long __meminitdata dma_reserve;
203 203
204#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 204#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
205/* Movable memory ranges, will also be used by memblock subsystem. */
206struct movablemem_map movablemem_map = {
207 .acpi = false,
208 .nr_map = 0,
209};
210
205static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 211static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
206static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 212static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
207static unsigned long __initdata required_kernelcore; 213static unsigned long __initdata required_kernelcore;
208static unsigned long __initdata required_movablecore; 214static unsigned long __initdata required_movablecore;
209static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 215static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
216static unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES];
210 217
211/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 218/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
212int movable_zone; 219int movable_zone;
@@ -240,15 +247,20 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
240 int ret = 0; 247 int ret = 0;
241 unsigned seq; 248 unsigned seq;
242 unsigned long pfn = page_to_pfn(page); 249 unsigned long pfn = page_to_pfn(page);
250 unsigned long sp, start_pfn;
243 251
244 do { 252 do {
245 seq = zone_span_seqbegin(zone); 253 seq = zone_span_seqbegin(zone);
246 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 254 start_pfn = zone->zone_start_pfn;
247 ret = 1; 255 sp = zone->spanned_pages;
248 else if (pfn < zone->zone_start_pfn) 256 if (!zone_spans_pfn(zone, pfn))
249 ret = 1; 257 ret = 1;
250 } while (zone_span_seqretry(zone, seq)); 258 } while (zone_span_seqretry(zone, seq));
251 259
260 if (ret)
261 pr_err("page %lu outside zone [ %lu - %lu ]\n",
262 pfn, start_pfn, start_pfn + sp);
263
252 return ret; 264 return ret;
253} 265}
254 266
@@ -288,7 +300,7 @@ static void bad_page(struct page *page)
288 300
289 /* Don't complain about poisoned pages */ 301 /* Don't complain about poisoned pages */
290 if (PageHWPoison(page)) { 302 if (PageHWPoison(page)) {
291 reset_page_mapcount(page); /* remove PageBuddy */ 303 page_mapcount_reset(page); /* remove PageBuddy */
292 return; 304 return;
293 } 305 }
294 306
@@ -320,7 +332,7 @@ static void bad_page(struct page *page)
320 dump_stack(); 332 dump_stack();
321out: 333out:
322 /* Leave bad fields for debug, except PageBuddy could make trouble */ 334 /* Leave bad fields for debug, except PageBuddy could make trouble */
323 reset_page_mapcount(page); /* remove PageBuddy */ 335 page_mapcount_reset(page); /* remove PageBuddy */
324 add_taint(TAINT_BAD_PAGE); 336 add_taint(TAINT_BAD_PAGE);
325} 337}
326 338
@@ -533,6 +545,8 @@ static inline void __free_one_page(struct page *page,
533 unsigned long uninitialized_var(buddy_idx); 545 unsigned long uninitialized_var(buddy_idx);
534 struct page *buddy; 546 struct page *buddy;
535 547
548 VM_BUG_ON(!zone_is_initialized(zone));
549
536 if (unlikely(PageCompound(page))) 550 if (unlikely(PageCompound(page)))
537 if (unlikely(destroy_compound_page(page, order))) 551 if (unlikely(destroy_compound_page(page, order)))
538 return; 552 return;
@@ -606,7 +620,7 @@ static inline int free_pages_check(struct page *page)
606 bad_page(page); 620 bad_page(page);
607 return 1; 621 return 1;
608 } 622 }
609 reset_page_last_nid(page); 623 page_nid_reset_last(page);
610 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) 624 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
611 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 625 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
612 return 0; 626 return 0;
@@ -666,7 +680,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
666 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 680 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
667 __free_one_page(page, zone, 0, mt); 681 __free_one_page(page, zone, 0, mt);
668 trace_mm_page_pcpu_drain(page, 0, mt); 682 trace_mm_page_pcpu_drain(page, 0, mt);
669 if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { 683 if (likely(!is_migrate_isolate_page(page))) {
670 __mod_zone_page_state(zone, NR_FREE_PAGES, 1); 684 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
671 if (is_migrate_cma(mt)) 685 if (is_migrate_cma(mt))
672 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 686 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
@@ -684,7 +698,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
684 zone->pages_scanned = 0; 698 zone->pages_scanned = 0;
685 699
686 __free_one_page(page, zone, order, migratetype); 700 __free_one_page(page, zone, order, migratetype);
687 if (unlikely(migratetype != MIGRATE_ISOLATE)) 701 if (unlikely(!is_migrate_isolate(migratetype)))
688 __mod_zone_freepage_state(zone, 1 << order, migratetype); 702 __mod_zone_freepage_state(zone, 1 << order, migratetype);
689 spin_unlock(&zone->lock); 703 spin_unlock(&zone->lock);
690} 704}
@@ -916,7 +930,9 @@ static int fallbacks[MIGRATE_TYPES][4] = {
916 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 930 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
917#endif 931#endif
918 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 932 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
933#ifdef CONFIG_MEMORY_ISOLATION
919 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ 934 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
935#endif
920}; 936};
921 937
922/* 938/*
@@ -981,9 +997,9 @@ int move_freepages_block(struct zone *zone, struct page *page,
981 end_pfn = start_pfn + pageblock_nr_pages - 1; 997 end_pfn = start_pfn + pageblock_nr_pages - 1;
982 998
983 /* Do not cross zone boundaries */ 999 /* Do not cross zone boundaries */
984 if (start_pfn < zone->zone_start_pfn) 1000 if (!zone_spans_pfn(zone, start_pfn))
985 start_page = page; 1001 start_page = page;
986 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) 1002 if (!zone_spans_pfn(zone, end_pfn))
987 return 0; 1003 return 0;
988 1004
989 return move_freepages(zone, start_page, end_page, migratetype); 1005 return move_freepages(zone, start_page, end_page, migratetype);
@@ -1142,7 +1158,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1142 list_add_tail(&page->lru, list); 1158 list_add_tail(&page->lru, list);
1143 if (IS_ENABLED(CONFIG_CMA)) { 1159 if (IS_ENABLED(CONFIG_CMA)) {
1144 mt = get_pageblock_migratetype(page); 1160 mt = get_pageblock_migratetype(page);
1145 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) 1161 if (!is_migrate_cma(mt) && !is_migrate_isolate(mt))
1146 mt = migratetype; 1162 mt = migratetype;
1147 } 1163 }
1148 set_freepage_migratetype(page, mt); 1164 set_freepage_migratetype(page, mt);
@@ -1277,7 +1293,7 @@ void mark_free_pages(struct zone *zone)
1277 1293
1278 spin_lock_irqsave(&zone->lock, flags); 1294 spin_lock_irqsave(&zone->lock, flags);
1279 1295
1280 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1296 max_zone_pfn = zone_end_pfn(zone);
1281 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1297 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1282 if (pfn_valid(pfn)) { 1298 if (pfn_valid(pfn)) {
1283 struct page *page = pfn_to_page(pfn); 1299 struct page *page = pfn_to_page(pfn);
@@ -1326,7 +1342,7 @@ void free_hot_cold_page(struct page *page, int cold)
1326 * excessively into the page allocator 1342 * excessively into the page allocator
1327 */ 1343 */
1328 if (migratetype >= MIGRATE_PCPTYPES) { 1344 if (migratetype >= MIGRATE_PCPTYPES) {
1329 if (unlikely(migratetype == MIGRATE_ISOLATE)) { 1345 if (unlikely(is_migrate_isolate(migratetype))) {
1330 free_one_page(zone, page, 0, migratetype); 1346 free_one_page(zone, page, 0, migratetype);
1331 goto out; 1347 goto out;
1332 } 1348 }
@@ -1400,7 +1416,7 @@ static int __isolate_free_page(struct page *page, unsigned int order)
1400 zone = page_zone(page); 1416 zone = page_zone(page);
1401 mt = get_pageblock_migratetype(page); 1417 mt = get_pageblock_migratetype(page);
1402 1418
1403 if (mt != MIGRATE_ISOLATE) { 1419 if (!is_migrate_isolate(mt)) {
1404 /* Obey watermarks as if the page was being allocated */ 1420 /* Obey watermarks as if the page was being allocated */
1405 watermark = low_wmark_pages(zone) + (1 << order); 1421 watermark = low_wmark_pages(zone) + (1 << order);
1406 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1422 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
@@ -1419,7 +1435,7 @@ static int __isolate_free_page(struct page *page, unsigned int order)
1419 struct page *endpage = page + (1 << order) - 1; 1435 struct page *endpage = page + (1 << order) - 1;
1420 for (; page < endpage; page += pageblock_nr_pages) { 1436 for (; page < endpage; page += pageblock_nr_pages) {
1421 int mt = get_pageblock_migratetype(page); 1437 int mt = get_pageblock_migratetype(page);
1422 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) 1438 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt))
1423 set_pageblock_migratetype(page, 1439 set_pageblock_migratetype(page,
1424 MIGRATE_MOVABLE); 1440 MIGRATE_MOVABLE);
1425 } 1441 }
@@ -2615,10 +2631,17 @@ retry_cpuset:
2615 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2631 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2616 zonelist, high_zoneidx, alloc_flags, 2632 zonelist, high_zoneidx, alloc_flags,
2617 preferred_zone, migratetype); 2633 preferred_zone, migratetype);
2618 if (unlikely(!page)) 2634 if (unlikely(!page)) {
2635 /*
2636 * Runtime PM, block IO and its error handling path
2637 * can deadlock because I/O on the device might not
2638 * complete.
2639 */
2640 gfp_mask = memalloc_noio_flags(gfp_mask);
2619 page = __alloc_pages_slowpath(gfp_mask, order, 2641 page = __alloc_pages_slowpath(gfp_mask, order,
2620 zonelist, high_zoneidx, nodemask, 2642 zonelist, high_zoneidx, nodemask,
2621 preferred_zone, migratetype); 2643 preferred_zone, migratetype);
2644 }
2622 2645
2623 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2646 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2624 2647
@@ -2790,18 +2813,27 @@ void free_pages_exact(void *virt, size_t size)
2790} 2813}
2791EXPORT_SYMBOL(free_pages_exact); 2814EXPORT_SYMBOL(free_pages_exact);
2792 2815
2793static unsigned int nr_free_zone_pages(int offset) 2816/**
2817 * nr_free_zone_pages - count number of pages beyond high watermark
2818 * @offset: The zone index of the highest zone
2819 *
2820 * nr_free_zone_pages() counts the number of pages which are beyond the
2821 * high watermark within all zones at or below a given zone index. For each
2822 * zone, the number of pages is calculated as:
2823 * managed_pages - high_pages
2824 */
2825static unsigned long nr_free_zone_pages(int offset)
2794{ 2826{
2795 struct zoneref *z; 2827 struct zoneref *z;
2796 struct zone *zone; 2828 struct zone *zone;
2797 2829
2798 /* Just pick one node, since fallback list is circular */ 2830 /* Just pick one node, since fallback list is circular */
2799 unsigned int sum = 0; 2831 unsigned long sum = 0;
2800 2832
2801 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 2833 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
2802 2834
2803 for_each_zone_zonelist(zone, z, zonelist, offset) { 2835 for_each_zone_zonelist(zone, z, zonelist, offset) {
2804 unsigned long size = zone->present_pages; 2836 unsigned long size = zone->managed_pages;
2805 unsigned long high = high_wmark_pages(zone); 2837 unsigned long high = high_wmark_pages(zone);
2806 if (size > high) 2838 if (size > high)
2807 sum += size - high; 2839 sum += size - high;
@@ -2810,19 +2842,25 @@ static unsigned int nr_free_zone_pages(int offset)
2810 return sum; 2842 return sum;
2811} 2843}
2812 2844
2813/* 2845/**
2814 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 2846 * nr_free_buffer_pages - count number of pages beyond high watermark
2847 *
2848 * nr_free_buffer_pages() counts the number of pages which are beyond the high
2849 * watermark within ZONE_DMA and ZONE_NORMAL.
2815 */ 2850 */
2816unsigned int nr_free_buffer_pages(void) 2851unsigned long nr_free_buffer_pages(void)
2817{ 2852{
2818 return nr_free_zone_pages(gfp_zone(GFP_USER)); 2853 return nr_free_zone_pages(gfp_zone(GFP_USER));
2819} 2854}
2820EXPORT_SYMBOL_GPL(nr_free_buffer_pages); 2855EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
2821 2856
2822/* 2857/**
2823 * Amount of free RAM allocatable within all zones 2858 * nr_free_pagecache_pages - count number of pages beyond high watermark
2859 *
2860 * nr_free_pagecache_pages() counts the number of pages which are beyond the
2861 * high watermark within all zones.
2824 */ 2862 */
2825unsigned int nr_free_pagecache_pages(void) 2863unsigned long nr_free_pagecache_pages(void)
2826{ 2864{
2827 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); 2865 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
2828} 2866}
@@ -2854,7 +2892,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2854 val->totalram = pgdat->node_present_pages; 2892 val->totalram = pgdat->node_present_pages;
2855 val->freeram = node_page_state(nid, NR_FREE_PAGES); 2893 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2856#ifdef CONFIG_HIGHMEM 2894#ifdef CONFIG_HIGHMEM
2857 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 2895 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
2858 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 2896 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
2859 NR_FREE_PAGES); 2897 NR_FREE_PAGES);
2860#else 2898#else
@@ -2897,7 +2935,9 @@ static void show_migration_types(unsigned char type)
2897#ifdef CONFIG_CMA 2935#ifdef CONFIG_CMA
2898 [MIGRATE_CMA] = 'C', 2936 [MIGRATE_CMA] = 'C',
2899#endif 2937#endif
2938#ifdef CONFIG_MEMORY_ISOLATION
2900 [MIGRATE_ISOLATE] = 'I', 2939 [MIGRATE_ISOLATE] = 'I',
2940#endif
2901 }; 2941 };
2902 char tmp[MIGRATE_TYPES + 1]; 2942 char tmp[MIGRATE_TYPES + 1];
2903 char *p = tmp; 2943 char *p = tmp;
@@ -3236,7 +3276,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
3236{ 3276{
3237 int n, val; 3277 int n, val;
3238 int min_val = INT_MAX; 3278 int min_val = INT_MAX;
3239 int best_node = -1; 3279 int best_node = NUMA_NO_NODE;
3240 const struct cpumask *tmp = cpumask_of_node(0); 3280 const struct cpumask *tmp = cpumask_of_node(0);
3241 3281
3242 /* Use the local node if we haven't already */ 3282 /* Use the local node if we haven't already */
@@ -3780,7 +3820,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3780 * the block. 3820 * the block.
3781 */ 3821 */
3782 start_pfn = zone->zone_start_pfn; 3822 start_pfn = zone->zone_start_pfn;
3783 end_pfn = start_pfn + zone->spanned_pages; 3823 end_pfn = zone_end_pfn(zone);
3784 start_pfn = roundup(start_pfn, pageblock_nr_pages); 3824 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3785 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3825 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3786 pageblock_order; 3826 pageblock_order;
@@ -3876,8 +3916,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3876 set_page_links(page, zone, nid, pfn); 3916 set_page_links(page, zone, nid, pfn);
3877 mminit_verify_page_links(page, zone, nid, pfn); 3917 mminit_verify_page_links(page, zone, nid, pfn);
3878 init_page_count(page); 3918 init_page_count(page);
3879 reset_page_mapcount(page); 3919 page_mapcount_reset(page);
3880 reset_page_last_nid(page); 3920 page_nid_reset_last(page);
3881 SetPageReserved(page); 3921 SetPageReserved(page);
3882 /* 3922 /*
3883 * Mark the block movable so that blocks are reserved for 3923 * Mark the block movable so that blocks are reserved for
@@ -3894,7 +3934,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
3894 * pfn out of zone. 3934 * pfn out of zone.
3895 */ 3935 */
3896 if ((z->zone_start_pfn <= pfn) 3936 if ((z->zone_start_pfn <= pfn)
3897 && (pfn < z->zone_start_pfn + z->spanned_pages) 3937 && (pfn < zone_end_pfn(z))
3898 && !(pfn & (pageblock_nr_pages - 1))) 3938 && !(pfn & (pageblock_nr_pages - 1)))
3899 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 3939 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
3900 3940
@@ -3932,7 +3972,7 @@ static int __meminit zone_batchsize(struct zone *zone)
3932 * 3972 *
3933 * OK, so we don't know how big the cache is. So guess. 3973 * OK, so we don't know how big the cache is. So guess.
3934 */ 3974 */
3935 batch = zone->present_pages / 1024; 3975 batch = zone->managed_pages / 1024;
3936 if (batch * PAGE_SIZE > 512 * 1024) 3976 if (batch * PAGE_SIZE > 512 * 1024)
3937 batch = (512 * 1024) / PAGE_SIZE; 3977 batch = (512 * 1024) / PAGE_SIZE;
3938 batch /= 4; /* We effectively *= 4 below */ 3978 batch /= 4; /* We effectively *= 4 below */
@@ -4016,7 +4056,7 @@ static void __meminit setup_zone_pageset(struct zone *zone)
4016 4056
4017 if (percpu_pagelist_fraction) 4057 if (percpu_pagelist_fraction)
4018 setup_pagelist_highmark(pcp, 4058 setup_pagelist_highmark(pcp,
4019 (zone->present_pages / 4059 (zone->managed_pages /
4020 percpu_pagelist_fraction)); 4060 percpu_pagelist_fraction));
4021 } 4061 }
4022} 4062}
@@ -4372,6 +4412,77 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
4372 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4412 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4373} 4413}
4374 4414
4415/**
4416 * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array.
4417 *
4418 * zone_movable_limit is initialized as 0. This function will try to get
4419 * the first ZONE_MOVABLE pfn of each node from movablemem_map, and
4420 * assign them to zone_movable_limit.
4421 * zone_movable_limit[nid] == 0 means no limit for the node.
4422 *
4423 * Note: Each range is represented as [start_pfn, end_pfn)
4424 */
4425static void __meminit sanitize_zone_movable_limit(void)
4426{
4427 int map_pos = 0, i, nid;
4428 unsigned long start_pfn, end_pfn;
4429
4430 if (!movablemem_map.nr_map)
4431 return;
4432
4433 /* Iterate all ranges from minimum to maximum */
4434 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4435 /*
4436 * If we have found lowest pfn of ZONE_MOVABLE of the node
4437 * specified by user, just go on to check next range.
4438 */
4439 if (zone_movable_limit[nid])
4440 continue;
4441
4442#ifdef CONFIG_ZONE_DMA
4443 /* Skip DMA memory. */
4444 if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA])
4445 start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA];
4446#endif
4447
4448#ifdef CONFIG_ZONE_DMA32
4449 /* Skip DMA32 memory. */
4450 if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA32])
4451 start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA32];
4452#endif
4453
4454#ifdef CONFIG_HIGHMEM
4455 /* Skip lowmem if ZONE_MOVABLE is highmem. */
4456 if (zone_movable_is_highmem() &&
4457 start_pfn < arch_zone_lowest_possible_pfn[ZONE_HIGHMEM])
4458 start_pfn = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM];
4459#endif
4460
4461 if (start_pfn >= end_pfn)
4462 continue;
4463
4464 while (map_pos < movablemem_map.nr_map) {
4465 if (end_pfn <= movablemem_map.map[map_pos].start_pfn)
4466 break;
4467
4468 if (start_pfn >= movablemem_map.map[map_pos].end_pfn) {
4469 map_pos++;
4470 continue;
4471 }
4472
4473 /*
4474 * The start_pfn of ZONE_MOVABLE is either the minimum
4475 * pfn specified by movablemem_map, or 0, which means
4476 * the node has no ZONE_MOVABLE.
4477 */
4478 zone_movable_limit[nid] = max(start_pfn,
4479 movablemem_map.map[map_pos].start_pfn);
4480
4481 break;
4482 }
4483 }
4484}
4485
4375#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4486#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4376static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4487static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4377 unsigned long zone_type, 4488 unsigned long zone_type,
@@ -4389,7 +4500,6 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4389 4500
4390 return zholes_size[zone_type]; 4501 return zholes_size[zone_type];
4391} 4502}
4392
4393#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4503#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4394 4504
4395static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4505static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
@@ -4573,7 +4683,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4573 nr_all_pages += freesize; 4683 nr_all_pages += freesize;
4574 4684
4575 zone->spanned_pages = size; 4685 zone->spanned_pages = size;
4576 zone->present_pages = freesize; 4686 zone->present_pages = realsize;
4577 /* 4687 /*
4578 * Set an approximate value for lowmem here, it will be adjusted 4688 * Set an approximate value for lowmem here, it will be adjusted
4579 * when the bootmem allocator frees pages into the buddy system. 4689 * when the bootmem allocator frees pages into the buddy system.
@@ -4625,7 +4735,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4625 * for the buddy allocator to function correctly. 4735 * for the buddy allocator to function correctly.
4626 */ 4736 */
4627 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 4737 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
4628 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 4738 end = pgdat_end_pfn(pgdat);
4629 end = ALIGN(end, MAX_ORDER_NR_PAGES); 4739 end = ALIGN(end, MAX_ORDER_NR_PAGES);
4630 size = (end - start) * sizeof(struct page); 4740 size = (end - start) * sizeof(struct page);
4631 map = alloc_remap(pgdat->node_id, size); 4741 map = alloc_remap(pgdat->node_id, size);
@@ -4831,12 +4941,19 @@ static void __init find_zone_movable_pfns_for_nodes(void)
4831 required_kernelcore = max(required_kernelcore, corepages); 4941 required_kernelcore = max(required_kernelcore, corepages);
4832 } 4942 }
4833 4943
4834 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4944 /*
4835 if (!required_kernelcore) 4945 * If neither kernelcore/movablecore nor movablemem_map is specified,
4946 * there is no ZONE_MOVABLE. But if movablemem_map is specified, the
4947 * start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[].
4948 */
4949 if (!required_kernelcore) {
4950 if (movablemem_map.nr_map)
4951 memcpy(zone_movable_pfn, zone_movable_limit,
4952 sizeof(zone_movable_pfn));
4836 goto out; 4953 goto out;
4954 }
4837 4955
4838 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4956 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4839 find_usable_zone_for_movable();
4840 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; 4957 usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
4841 4958
4842restart: 4959restart:
@@ -4864,10 +4981,24 @@ restart:
4864 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { 4981 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4865 unsigned long size_pages; 4982 unsigned long size_pages;
4866 4983
4984 /*
4985 * Find more memory for kernelcore in
4986 * [zone_movable_pfn[nid], zone_movable_limit[nid]).
4987 */
4867 start_pfn = max(start_pfn, zone_movable_pfn[nid]); 4988 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
4868 if (start_pfn >= end_pfn) 4989 if (start_pfn >= end_pfn)
4869 continue; 4990 continue;
4870 4991
4992 if (zone_movable_limit[nid]) {
4993 end_pfn = min(end_pfn, zone_movable_limit[nid]);
4994 /* No range left for kernelcore in this node */
4995 if (start_pfn >= end_pfn) {
4996 zone_movable_pfn[nid] =
4997 zone_movable_limit[nid];
4998 break;
4999 }
5000 }
5001
4871 /* Account for what is only usable for kernelcore */ 5002 /* Account for what is only usable for kernelcore */
4872 if (start_pfn < usable_startpfn) { 5003 if (start_pfn < usable_startpfn) {
4873 unsigned long kernel_pages; 5004 unsigned long kernel_pages;
@@ -4927,12 +5058,12 @@ restart:
4927 if (usable_nodes && required_kernelcore > usable_nodes) 5058 if (usable_nodes && required_kernelcore > usable_nodes)
4928 goto restart; 5059 goto restart;
4929 5060
5061out:
4930 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ 5062 /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
4931 for (nid = 0; nid < MAX_NUMNODES; nid++) 5063 for (nid = 0; nid < MAX_NUMNODES; nid++)
4932 zone_movable_pfn[nid] = 5064 zone_movable_pfn[nid] =
4933 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 5065 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4934 5066
4935out:
4936 /* restore the node_state */ 5067 /* restore the node_state */
4937 node_states[N_MEMORY] = saved_node_state; 5068 node_states[N_MEMORY] = saved_node_state;
4938} 5069}
@@ -4995,6 +5126,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4995 5126
4996 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 5127 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
4997 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 5128 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
5129 find_usable_zone_for_movable();
5130 sanitize_zone_movable_limit();
4998 find_zone_movable_pfns_for_nodes(); 5131 find_zone_movable_pfns_for_nodes();
4999 5132
5000 /* Print out the zone ranges */ 5133 /* Print out the zone ranges */
@@ -5078,6 +5211,181 @@ static int __init cmdline_parse_movablecore(char *p)
5078early_param("kernelcore", cmdline_parse_kernelcore); 5211early_param("kernelcore", cmdline_parse_kernelcore);
5079early_param("movablecore", cmdline_parse_movablecore); 5212early_param("movablecore", cmdline_parse_movablecore);
5080 5213
5214/**
5215 * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[].
5216 * @start_pfn: start pfn of the range to be checked
5217 * @end_pfn: end pfn of the range to be checked (exclusive)
5218 *
5219 * This function checks if a given memory range [start_pfn, end_pfn) overlaps
5220 * the movablemem_map.map[] array.
5221 *
5222 * Return: index of the first overlapped element in movablemem_map.map[]
5223 * or -1 if they don't overlap each other.
5224 */
5225int __init movablemem_map_overlap(unsigned long start_pfn,
5226 unsigned long end_pfn)
5227{
5228 int overlap;
5229
5230 if (!movablemem_map.nr_map)
5231 return -1;
5232
5233 for (overlap = 0; overlap < movablemem_map.nr_map; overlap++)
5234 if (start_pfn < movablemem_map.map[overlap].end_pfn)
5235 break;
5236
5237 if (overlap == movablemem_map.nr_map ||
5238 end_pfn <= movablemem_map.map[overlap].start_pfn)
5239 return -1;
5240
5241 return overlap;
5242}
5243
5244/**
5245 * insert_movablemem_map - Insert a memory range into movablemem_map.map.
5246 * @start_pfn: start pfn of the range
5247 * @end_pfn: end pfn of the range
5248 *
5249 * This function will also merge the overlapped ranges, and sort the array
5250 * by start_pfn in monotonic increasing order.
5251 */
5252void __init insert_movablemem_map(unsigned long start_pfn,
5253 unsigned long end_pfn)
5254{
5255 int pos, overlap;
5256
5257 /*
5258 * pos will be at the 1st overlapped range, or the position
5259 * where the element should be inserted.
5260 */
5261 for (pos = 0; pos < movablemem_map.nr_map; pos++)
5262 if (start_pfn <= movablemem_map.map[pos].end_pfn)
5263 break;
5264
5265 /* If there is no overlapped range, just insert the element. */
5266 if (pos == movablemem_map.nr_map ||
5267 end_pfn < movablemem_map.map[pos].start_pfn) {
5268 /*
5269 * If pos is not the end of array, we need to move all
5270 * the rest elements backward.
5271 */
5272 if (pos < movablemem_map.nr_map)
5273 memmove(&movablemem_map.map[pos+1],
5274 &movablemem_map.map[pos],
5275 sizeof(struct movablemem_entry) *
5276 (movablemem_map.nr_map - pos));
5277 movablemem_map.map[pos].start_pfn = start_pfn;
5278 movablemem_map.map[pos].end_pfn = end_pfn;
5279 movablemem_map.nr_map++;
5280 return;
5281 }
5282
5283 /* overlap will be at the last overlapped range */
5284 for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++)
5285 if (end_pfn < movablemem_map.map[overlap].start_pfn)
5286 break;
5287
5288 /*
5289 * If there are more ranges overlapped, we need to merge them,
5290 * and move the rest elements forward.
5291 */
5292 overlap--;
5293 movablemem_map.map[pos].start_pfn = min(start_pfn,
5294 movablemem_map.map[pos].start_pfn);
5295 movablemem_map.map[pos].end_pfn = max(end_pfn,
5296 movablemem_map.map[overlap].end_pfn);
5297
5298 if (pos != overlap && overlap + 1 != movablemem_map.nr_map)
5299 memmove(&movablemem_map.map[pos+1],
5300 &movablemem_map.map[overlap+1],
5301 sizeof(struct movablemem_entry) *
5302 (movablemem_map.nr_map - overlap - 1));
5303
5304 movablemem_map.nr_map -= overlap - pos;
5305}
5306
5307/**
5308 * movablemem_map_add_region - Add a memory range into movablemem_map.
5309 * @start: physical start address of range
5310 * @size: size of the range, in bytes
5311 *
5312 * This function transforms the physical address into pfn, and then adds the
5313 * range into movablemem_map by calling insert_movablemem_map().
5314 */
5315static void __init movablemem_map_add_region(u64 start, u64 size)
5316{
5317 unsigned long start_pfn, end_pfn;
5318
5319 /* In case size == 0 or start + size overflows */
5320 if (start + size <= start)
5321 return;
5322
5323 if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) {
5324 pr_err("movablemem_map: too many entries;"
5325 " ignoring [mem %#010llx-%#010llx]\n",
5326 (unsigned long long) start,
5327 (unsigned long long) (start + size - 1));
5328 return;
5329 }
5330
5331 start_pfn = PFN_DOWN(start);
5332 end_pfn = PFN_UP(start + size);
5333 insert_movablemem_map(start_pfn, end_pfn);
5334}
5335
5336/*
5337 * cmdline_parse_movablemem_map - Parse boot option movablemem_map.
5338 * @p: The boot option of the following format:
5339 * movablemem_map=nn[KMG]@ss[KMG]
5340 *
5341 * This option sets the memory range [ss, ss+nn) to be used as movable memory.
5342 *
5343 * Return: 0 on success or -EINVAL on failure.
5344 */
5345static int __init cmdline_parse_movablemem_map(char *p)
5346{
5347 char *oldp;
5348 u64 start_at, mem_size;
5349
5350 if (!p)
5351 goto err;
5352
5353 if (!strcmp(p, "acpi"))
5354 movablemem_map.acpi = true;
5355
5356 /*
5357 * If the user decides to use info from BIOS, all the other user specified
5358 * ranges will be ignored.
5359 */
5360 if (movablemem_map.acpi) {
5361 if (movablemem_map.nr_map) {
5362 memset(movablemem_map.map, 0,
5363 sizeof(struct movablemem_entry)
5364 * movablemem_map.nr_map);
5365 movablemem_map.nr_map = 0;
5366 }
5367 return 0;
5368 }
5369
5370 oldp = p;
5371 mem_size = memparse(p, &p);
5372 if (p == oldp)
5373 goto err;
5374
5375 if (*p == '@') {
5376 oldp = ++p;
5377 start_at = memparse(p, &p);
5378 if (p == oldp || *p != '\0')
5379 goto err;
5380
5381 movablemem_map_add_region(start_at, mem_size);
5382 return 0;
5383 }
5384err:
5385 return -EINVAL;
5386}
5387early_param("movablemem_map", cmdline_parse_movablemem_map);
5388
5081#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5389#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5082 5390
5083/** 5391/**
@@ -5160,8 +5468,8 @@ static void calculate_totalreserve_pages(void)
5160 /* we treat the high watermark as reserved pages. */ 5468 /* we treat the high watermark as reserved pages. */
5161 max += high_wmark_pages(zone); 5469 max += high_wmark_pages(zone);
5162 5470
5163 if (max > zone->present_pages) 5471 if (max > zone->managed_pages)
5164 max = zone->present_pages; 5472 max = zone->managed_pages;
5165 reserve_pages += max; 5473 reserve_pages += max;
5166 /* 5474 /*
5167 * Lowmem reserves are not available to 5475 * Lowmem reserves are not available to
@@ -5193,7 +5501,7 @@ static void setup_per_zone_lowmem_reserve(void)
5193 for_each_online_pgdat(pgdat) { 5501 for_each_online_pgdat(pgdat) {
5194 for (j = 0; j < MAX_NR_ZONES; j++) { 5502 for (j = 0; j < MAX_NR_ZONES; j++) {
5195 struct zone *zone = pgdat->node_zones + j; 5503 struct zone *zone = pgdat->node_zones + j;
5196 unsigned long present_pages = zone->present_pages; 5504 unsigned long managed_pages = zone->managed_pages;
5197 5505
5198 zone->lowmem_reserve[j] = 0; 5506 zone->lowmem_reserve[j] = 0;
5199 5507
@@ -5207,9 +5515,9 @@ static void setup_per_zone_lowmem_reserve(void)
5207 sysctl_lowmem_reserve_ratio[idx] = 1; 5515 sysctl_lowmem_reserve_ratio[idx] = 1;
5208 5516
5209 lower_zone = pgdat->node_zones + idx; 5517 lower_zone = pgdat->node_zones + idx;
5210 lower_zone->lowmem_reserve[j] = present_pages / 5518 lower_zone->lowmem_reserve[j] = managed_pages /
5211 sysctl_lowmem_reserve_ratio[idx]; 5519 sysctl_lowmem_reserve_ratio[idx];
5212 present_pages += lower_zone->present_pages; 5520 managed_pages += lower_zone->managed_pages;
5213 } 5521 }
5214 } 5522 }
5215 } 5523 }
@@ -5228,14 +5536,14 @@ static void __setup_per_zone_wmarks(void)
5228 /* Calculate total number of !ZONE_HIGHMEM pages */ 5536 /* Calculate total number of !ZONE_HIGHMEM pages */
5229 for_each_zone(zone) { 5537 for_each_zone(zone) {
5230 if (!is_highmem(zone)) 5538 if (!is_highmem(zone))
5231 lowmem_pages += zone->present_pages; 5539 lowmem_pages += zone->managed_pages;
5232 } 5540 }
5233 5541
5234 for_each_zone(zone) { 5542 for_each_zone(zone) {
5235 u64 tmp; 5543 u64 tmp;
5236 5544
5237 spin_lock_irqsave(&zone->lock, flags); 5545 spin_lock_irqsave(&zone->lock, flags);
5238 tmp = (u64)pages_min * zone->present_pages; 5546 tmp = (u64)pages_min * zone->managed_pages;
5239 do_div(tmp, lowmem_pages); 5547 do_div(tmp, lowmem_pages);
5240 if (is_highmem(zone)) { 5548 if (is_highmem(zone)) {
5241 /* 5549 /*
@@ -5247,13 +5555,10 @@ static void __setup_per_zone_wmarks(void)
5247 * deltas controls asynch page reclaim, and so should 5555 * deltas controls asynch page reclaim, and so should
5248 * not be capped for highmem. 5556 * not be capped for highmem.
5249 */ 5557 */
5250 int min_pages; 5558 unsigned long min_pages;
5251 5559
5252 min_pages = zone->present_pages / 1024; 5560 min_pages = zone->managed_pages / 1024;
5253 if (min_pages < SWAP_CLUSTER_MAX) 5561 min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
5254 min_pages = SWAP_CLUSTER_MAX;
5255 if (min_pages > 128)
5256 min_pages = 128;
5257 zone->watermark[WMARK_MIN] = min_pages; 5562 zone->watermark[WMARK_MIN] = min_pages;
5258 } else { 5563 } else {
5259 /* 5564 /*
@@ -5314,7 +5619,7 @@ static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
5314 unsigned int gb, ratio; 5619 unsigned int gb, ratio;
5315 5620
5316 /* Zone size in gigabytes */ 5621 /* Zone size in gigabytes */
5317 gb = zone->present_pages >> (30 - PAGE_SHIFT); 5622 gb = zone->managed_pages >> (30 - PAGE_SHIFT);
5318 if (gb) 5623 if (gb)
5319 ratio = int_sqrt(10 * gb); 5624 ratio = int_sqrt(10 * gb);
5320 else 5625 else
@@ -5400,7 +5705,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
5400 return rc; 5705 return rc;
5401 5706
5402 for_each_zone(zone) 5707 for_each_zone(zone)
5403 zone->min_unmapped_pages = (zone->present_pages * 5708 zone->min_unmapped_pages = (zone->managed_pages *
5404 sysctl_min_unmapped_ratio) / 100; 5709 sysctl_min_unmapped_ratio) / 100;
5405 return 0; 5710 return 0;
5406} 5711}
@@ -5416,7 +5721,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
5416 return rc; 5721 return rc;
5417 5722
5418 for_each_zone(zone) 5723 for_each_zone(zone)
5419 zone->min_slab_pages = (zone->present_pages * 5724 zone->min_slab_pages = (zone->managed_pages *
5420 sysctl_min_slab_ratio) / 100; 5725 sysctl_min_slab_ratio) / 100;
5421 return 0; 5726 return 0;
5422} 5727}
@@ -5458,7 +5763,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5458 for_each_populated_zone(zone) { 5763 for_each_populated_zone(zone) {
5459 for_each_possible_cpu(cpu) { 5764 for_each_possible_cpu(cpu) {
5460 unsigned long high; 5765 unsigned long high;
5461 high = zone->present_pages / percpu_pagelist_fraction; 5766 high = zone->managed_pages / percpu_pagelist_fraction;
5462 setup_pagelist_highmark( 5767 setup_pagelist_highmark(
5463 per_cpu_ptr(zone->pageset, cpu), high); 5768 per_cpu_ptr(zone->pageset, cpu), high);
5464 } 5769 }
@@ -5645,8 +5950,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5645 pfn = page_to_pfn(page); 5950 pfn = page_to_pfn(page);
5646 bitmap = get_pageblock_bitmap(zone, pfn); 5951 bitmap = get_pageblock_bitmap(zone, pfn);
5647 bitidx = pfn_to_bitidx(zone, pfn); 5952 bitidx = pfn_to_bitidx(zone, pfn);
5648 VM_BUG_ON(pfn < zone->zone_start_pfn); 5953 VM_BUG_ON(!zone_spans_pfn(zone, pfn));
5649 VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages);
5650 5954
5651 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) 5955 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
5652 if (flags & value) 5956 if (flags & value)
@@ -5744,8 +6048,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5744 6048
5745 zone = page_zone(page); 6049 zone = page_zone(page);
5746 pfn = page_to_pfn(page); 6050 pfn = page_to_pfn(page);
5747 if (zone->zone_start_pfn > pfn || 6051 if (!zone_spans_pfn(zone, pfn))
5748 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5749 return false; 6052 return false;
5750 6053
5751 return !has_unmovable_pages(zone, page, 0, true); 6054 return !has_unmovable_pages(zone, page, 0, true);
@@ -5801,14 +6104,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5801 &cc->migratepages); 6104 &cc->migratepages);
5802 cc->nr_migratepages -= nr_reclaimed; 6105 cc->nr_migratepages -= nr_reclaimed;
5803 6106
5804 ret = migrate_pages(&cc->migratepages, 6107 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
5805 alloc_migrate_target, 6108 0, MIGRATE_SYNC, MR_CMA);
5806 0, false, MIGRATE_SYNC,
5807 MR_CMA);
5808 } 6109 }
5809 6110 if (ret < 0) {
5810 putback_movable_pages(&cc->migratepages); 6111 putback_movable_pages(&cc->migratepages);
5811 return ret > 0 ? 0 : ret; 6112 return ret;
6113 }
6114 return 0;
5812} 6115}
5813 6116
5814/** 6117/**
diff --git a/mm/rmap.c b/mm/rmap.c
index 3d38edffda41..807c96bf0dc6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -105,7 +105,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
105 */ 105 */
106 if (rwsem_is_locked(&anon_vma->root->rwsem)) { 106 if (rwsem_is_locked(&anon_vma->root->rwsem)) {
107 anon_vma_lock_write(anon_vma); 107 anon_vma_lock_write(anon_vma);
108 anon_vma_unlock(anon_vma); 108 anon_vma_unlock_write(anon_vma);
109 } 109 }
110 110
111 kmem_cache_free(anon_vma_cachep, anon_vma); 111 kmem_cache_free(anon_vma_cachep, anon_vma);
@@ -191,7 +191,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
191 avc = NULL; 191 avc = NULL;
192 } 192 }
193 spin_unlock(&mm->page_table_lock); 193 spin_unlock(&mm->page_table_lock);
194 anon_vma_unlock(anon_vma); 194 anon_vma_unlock_write(anon_vma);
195 195
196 if (unlikely(allocated)) 196 if (unlikely(allocated))
197 put_anon_vma(allocated); 197 put_anon_vma(allocated);
@@ -308,7 +308,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
308 vma->anon_vma = anon_vma; 308 vma->anon_vma = anon_vma;
309 anon_vma_lock_write(anon_vma); 309 anon_vma_lock_write(anon_vma);
310 anon_vma_chain_link(vma, avc, anon_vma); 310 anon_vma_chain_link(vma, avc, anon_vma);
311 anon_vma_unlock(anon_vma); 311 anon_vma_unlock_write(anon_vma);
312 312
313 return 0; 313 return 0;
314 314
diff --git a/mm/shmem.c b/mm/shmem.c
index 5dd56f6efdbd..1ad79243cb7b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -335,19 +335,19 @@ static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
335 pgoff_t start, unsigned int nr_pages, 335 pgoff_t start, unsigned int nr_pages,
336 struct page **pages, pgoff_t *indices) 336 struct page **pages, pgoff_t *indices)
337{ 337{
338 unsigned int i; 338 void **slot;
339 unsigned int ret; 339 unsigned int ret = 0;
340 unsigned int nr_found; 340 struct radix_tree_iter iter;
341
342 if (!nr_pages)
343 return 0;
341 344
342 rcu_read_lock(); 345 rcu_read_lock();
343restart: 346restart:
344 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, 347 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
345 (void ***)pages, indices, start, nr_pages);
346 ret = 0;
347 for (i = 0; i < nr_found; i++) {
348 struct page *page; 348 struct page *page;
349repeat: 349repeat:
350 page = radix_tree_deref_slot((void **)pages[i]); 350 page = radix_tree_deref_slot(slot);
351 if (unlikely(!page)) 351 if (unlikely(!page))
352 continue; 352 continue;
353 if (radix_tree_exception(page)) { 353 if (radix_tree_exception(page)) {
@@ -364,17 +364,16 @@ repeat:
364 goto repeat; 364 goto repeat;
365 365
366 /* Has the page moved? */ 366 /* Has the page moved? */
367 if (unlikely(page != *((void **)pages[i]))) { 367 if (unlikely(page != *slot)) {
368 page_cache_release(page); 368 page_cache_release(page);
369 goto repeat; 369 goto repeat;
370 } 370 }
371export: 371export:
372 indices[ret] = indices[i]; 372 indices[ret] = iter.index;
373 pages[ret] = page; 373 pages[ret] = page;
374 ret++; 374 if (++ret == nr_pages)
375 break;
375 } 376 }
376 if (unlikely(!ret && nr_found))
377 goto restart;
378 rcu_read_unlock(); 377 rcu_read_unlock();
379 return ret; 378 return ret;
380} 379}
@@ -2386,6 +2385,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2386 bool remount) 2385 bool remount)
2387{ 2386{
2388 char *this_char, *value, *rest; 2387 char *this_char, *value, *rest;
2388 struct mempolicy *mpol = NULL;
2389 uid_t uid; 2389 uid_t uid;
2390 gid_t gid; 2390 gid_t gid;
2391 2391
@@ -2414,7 +2414,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2414 printk(KERN_ERR 2414 printk(KERN_ERR
2415 "tmpfs: No value for mount option '%s'\n", 2415 "tmpfs: No value for mount option '%s'\n",
2416 this_char); 2416 this_char);
2417 return 1; 2417 goto error;
2418 } 2418 }
2419 2419
2420 if (!strcmp(this_char,"size")) { 2420 if (!strcmp(this_char,"size")) {
@@ -2463,19 +2463,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2463 if (!gid_valid(sbinfo->gid)) 2463 if (!gid_valid(sbinfo->gid))
2464 goto bad_val; 2464 goto bad_val;
2465 } else if (!strcmp(this_char,"mpol")) { 2465 } else if (!strcmp(this_char,"mpol")) {
2466 if (mpol_parse_str(value, &sbinfo->mpol)) 2466 mpol_put(mpol);
2467 mpol = NULL;
2468 if (mpol_parse_str(value, &mpol))
2467 goto bad_val; 2469 goto bad_val;
2468 } else { 2470 } else {
2469 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 2471 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
2470 this_char); 2472 this_char);
2471 return 1; 2473 goto error;
2472 } 2474 }
2473 } 2475 }
2476 sbinfo->mpol = mpol;
2474 return 0; 2477 return 0;
2475 2478
2476bad_val: 2479bad_val:
2477 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", 2480 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
2478 value, this_char); 2481 value, this_char);
2482error:
2483 mpol_put(mpol);
2479 return 1; 2484 return 1;
2480 2485
2481} 2486}
@@ -2487,6 +2492,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2487 unsigned long inodes; 2492 unsigned long inodes;
2488 int error = -EINVAL; 2493 int error = -EINVAL;
2489 2494
2495 config.mpol = NULL;
2490 if (shmem_parse_options(data, &config, true)) 2496 if (shmem_parse_options(data, &config, true))
2491 return error; 2497 return error;
2492 2498
@@ -2511,8 +2517,13 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2511 sbinfo->max_inodes = config.max_inodes; 2517 sbinfo->max_inodes = config.max_inodes;
2512 sbinfo->free_inodes = config.max_inodes - inodes; 2518 sbinfo->free_inodes = config.max_inodes - inodes;
2513 2519
2514 mpol_put(sbinfo->mpol); 2520 /*
2515 sbinfo->mpol = config.mpol; /* transfers initial ref */ 2521 * Preserve previous mempolicy unless mpol remount option was specified.
2522 */
2523 if (config.mpol) {
2524 mpol_put(sbinfo->mpol);
2525 sbinfo->mpol = config.mpol; /* transfers initial ref */
2526 }
2516out: 2527out:
2517 spin_unlock(&sbinfo->stat_lock); 2528 spin_unlock(&sbinfo->stat_lock);
2518 return error; 2529 return error;
@@ -2545,6 +2556,7 @@ static void shmem_put_super(struct super_block *sb)
2545 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2556 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2546 2557
2547 percpu_counter_destroy(&sbinfo->used_blocks); 2558 percpu_counter_destroy(&sbinfo->used_blocks);
2559 mpol_put(sbinfo->mpol);
2548 kfree(sbinfo); 2560 kfree(sbinfo);
2549 sb->s_fs_info = NULL; 2561 sb->s_fs_info = NULL;
2550} 2562}
diff --git a/mm/slob.c b/mm/slob.c
index a99fdf7a0907..eeed4a05a2ef 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -360,7 +360,7 @@ static void slob_free(void *block, int size)
360 clear_slob_page_free(sp); 360 clear_slob_page_free(sp);
361 spin_unlock_irqrestore(&slob_lock, flags); 361 spin_unlock_irqrestore(&slob_lock, flags);
362 __ClearPageSlab(sp); 362 __ClearPageSlab(sp);
363 reset_page_mapcount(sp); 363 page_mapcount_reset(sp);
364 slob_free_pages(b, 0); 364 slob_free_pages(b, 0);
365 return; 365 return;
366 } 366 }
diff --git a/mm/slub.c b/mm/slub.c
index ba2ca53f6c3a..ebcc44eb43b9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1408,7 +1408,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1408 __ClearPageSlab(page); 1408 __ClearPageSlab(page);
1409 1409
1410 memcg_release_pages(s, order); 1410 memcg_release_pages(s, order);
1411 reset_page_mapcount(page); 1411 page_mapcount_reset(page);
1412 if (current->reclaim_state) 1412 if (current->reclaim_state)
1413 current->reclaim_state->reclaimed_slab += pages; 1413 current->reclaim_state->reclaimed_slab += pages;
1414 __free_memcg_kmem_pages(page, order); 1414 __free_memcg_kmem_pages(page, order);
diff --git a/mm/sparse.c b/mm/sparse.c
index 6b5fb762e2ca..7ca6dc847947 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -615,10 +615,11 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
615} 615}
616static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) 616static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
617{ 617{
618 return; /* XXX: Not implemented yet */ 618 vmemmap_free(memmap, nr_pages);
619} 619}
620static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) 620static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
621{ 621{
622 vmemmap_free(memmap, nr_pages);
622} 623}
623#else 624#else
624static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 625static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
@@ -697,7 +698,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
697 /* 698 /*
698 * Check to see if allocation came from hot-plug-add 699 * Check to see if allocation came from hot-plug-add
699 */ 700 */
700 if (PageSlab(usemap_page)) { 701 if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
701 kfree(usemap); 702 kfree(usemap);
702 if (memmap) 703 if (memmap)
703 __kfree_section_memmap(memmap, PAGES_PER_SECTION); 704 __kfree_section_memmap(memmap, PAGES_PER_SECTION);
@@ -782,7 +783,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
782 783
783 for (i = 0; i < PAGES_PER_SECTION; i++) { 784 for (i = 0; i < PAGES_PER_SECTION; i++) {
784 if (PageHWPoison(&memmap[i])) { 785 if (PageHWPoison(&memmap[i])) {
785 atomic_long_sub(1, &mce_bad_pages); 786 atomic_long_sub(1, &num_poisoned_pages);
786 ClearPageHWPoison(&memmap[i]); 787 ClearPageHWPoison(&memmap[i]);
787 } 788 }
788 } 789 }
@@ -796,8 +797,10 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
796void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 797void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
797{ 798{
798 struct page *memmap = NULL; 799 struct page *memmap = NULL;
799 unsigned long *usemap = NULL; 800 unsigned long *usemap = NULL, flags;
801 struct pglist_data *pgdat = zone->zone_pgdat;
800 802
803 pgdat_resize_lock(pgdat, &flags);
801 if (ms->section_mem_map) { 804 if (ms->section_mem_map) {
802 usemap = ms->pageblock_flags; 805 usemap = ms->pageblock_flags;
803 memmap = sparse_decode_mem_map(ms->section_mem_map, 806 memmap = sparse_decode_mem_map(ms->section_mem_map,
@@ -805,6 +808,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
805 ms->section_mem_map = 0; 808 ms->section_mem_map = 0;
806 ms->pageblock_flags = NULL; 809 ms->pageblock_flags = NULL;
807 } 810 }
811 pgdat_resize_unlock(pgdat, &flags);
808 812
809 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); 813 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
810 free_section_usemap(memmap, usemap); 814 free_section_usemap(memmap, usemap);
diff --git a/mm/swap.c b/mm/swap.c
index 6310dc2008ff..8a529a01e8fc 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
855void __init swap_setup(void) 855void __init swap_setup(void)
856{ 856{
857 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); 857 unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
858
859#ifdef CONFIG_SWAP 858#ifdef CONFIG_SWAP
860 bdi_init(swapper_space.backing_dev_info); 859 int i;
860
861 bdi_init(swapper_spaces[0].backing_dev_info);
862 for (i = 0; i < MAX_SWAPFILES; i++) {
863 spin_lock_init(&swapper_spaces[i].tree_lock);
864 INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
865 }
861#endif 866#endif
862 867
863 /* Use a smaller cluster for small-memory machines */ 868 /* Use a smaller cluster for small-memory machines */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0cb36fb1f61c..7efcf1525921 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -36,12 +36,12 @@ static struct backing_dev_info swap_backing_dev_info = {
36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
37}; 37};
38 38
39struct address_space swapper_space = { 39struct address_space swapper_spaces[MAX_SWAPFILES] = {
40 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 40 [0 ... MAX_SWAPFILES - 1] = {
41 .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), 41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
42 .a_ops = &swap_aops, 42 .a_ops = &swap_aops,
43 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 43 .backing_dev_info = &swap_backing_dev_info,
44 .backing_dev_info = &swap_backing_dev_info, 44 }
45}; 45};
46 46
47#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) 47#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
@@ -53,13 +53,24 @@ static struct {
53 unsigned long find_total; 53 unsigned long find_total;
54} swap_cache_info; 54} swap_cache_info;
55 55
56unsigned long total_swapcache_pages(void)
57{
58 int i;
59 unsigned long ret = 0;
60
61 for (i = 0; i < MAX_SWAPFILES; i++)
62 ret += swapper_spaces[i].nrpages;
63 return ret;
64}
65
56void show_swap_cache_info(void) 66void show_swap_cache_info(void)
57{ 67{
58 printk("%lu pages in swap cache\n", total_swapcache_pages); 68 printk("%lu pages in swap cache\n", total_swapcache_pages());
59 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", 69 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
60 swap_cache_info.add_total, swap_cache_info.del_total, 70 swap_cache_info.add_total, swap_cache_info.del_total,
61 swap_cache_info.find_success, swap_cache_info.find_total); 71 swap_cache_info.find_success, swap_cache_info.find_total);
62 printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 72 printk("Free swap = %ldkB\n",
73 get_nr_swap_pages() << (PAGE_SHIFT - 10));
63 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 74 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
64} 75}
65 76
@@ -70,6 +81,7 @@ void show_swap_cache_info(void)
70static int __add_to_swap_cache(struct page *page, swp_entry_t entry) 81static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
71{ 82{
72 int error; 83 int error;
84 struct address_space *address_space;
73 85
74 VM_BUG_ON(!PageLocked(page)); 86 VM_BUG_ON(!PageLocked(page));
75 VM_BUG_ON(PageSwapCache(page)); 87 VM_BUG_ON(PageSwapCache(page));
@@ -79,14 +91,16 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
79 SetPageSwapCache(page); 91 SetPageSwapCache(page);
80 set_page_private(page, entry.val); 92 set_page_private(page, entry.val);
81 93
82 spin_lock_irq(&swapper_space.tree_lock); 94 address_space = swap_address_space(entry);
83 error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); 95 spin_lock_irq(&address_space->tree_lock);
96 error = radix_tree_insert(&address_space->page_tree,
97 entry.val, page);
84 if (likely(!error)) { 98 if (likely(!error)) {
85 total_swapcache_pages++; 99 address_space->nrpages++;
86 __inc_zone_page_state(page, NR_FILE_PAGES); 100 __inc_zone_page_state(page, NR_FILE_PAGES);
87 INC_CACHE_INFO(add_total); 101 INC_CACHE_INFO(add_total);
88 } 102 }
89 spin_unlock_irq(&swapper_space.tree_lock); 103 spin_unlock_irq(&address_space->tree_lock);
90 104
91 if (unlikely(error)) { 105 if (unlikely(error)) {
92 /* 106 /*
@@ -122,14 +136,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
122 */ 136 */
123void __delete_from_swap_cache(struct page *page) 137void __delete_from_swap_cache(struct page *page)
124{ 138{
139 swp_entry_t entry;
140 struct address_space *address_space;
141
125 VM_BUG_ON(!PageLocked(page)); 142 VM_BUG_ON(!PageLocked(page));
126 VM_BUG_ON(!PageSwapCache(page)); 143 VM_BUG_ON(!PageSwapCache(page));
127 VM_BUG_ON(PageWriteback(page)); 144 VM_BUG_ON(PageWriteback(page));
128 145
129 radix_tree_delete(&swapper_space.page_tree, page_private(page)); 146 entry.val = page_private(page);
147 address_space = swap_address_space(entry);
148 radix_tree_delete(&address_space->page_tree, page_private(page));
130 set_page_private(page, 0); 149 set_page_private(page, 0);
131 ClearPageSwapCache(page); 150 ClearPageSwapCache(page);
132 total_swapcache_pages--; 151 address_space->nrpages--;
133 __dec_zone_page_state(page, NR_FILE_PAGES); 152 __dec_zone_page_state(page, NR_FILE_PAGES);
134 INC_CACHE_INFO(del_total); 153 INC_CACHE_INFO(del_total);
135} 154}
@@ -195,12 +214,14 @@ int add_to_swap(struct page *page)
195void delete_from_swap_cache(struct page *page) 214void delete_from_swap_cache(struct page *page)
196{ 215{
197 swp_entry_t entry; 216 swp_entry_t entry;
217 struct address_space *address_space;
198 218
199 entry.val = page_private(page); 219 entry.val = page_private(page);
200 220
201 spin_lock_irq(&swapper_space.tree_lock); 221 address_space = swap_address_space(entry);
222 spin_lock_irq(&address_space->tree_lock);
202 __delete_from_swap_cache(page); 223 __delete_from_swap_cache(page);
203 spin_unlock_irq(&swapper_space.tree_lock); 224 spin_unlock_irq(&address_space->tree_lock);
204 225
205 swapcache_free(entry, page); 226 swapcache_free(entry, page);
206 page_cache_release(page); 227 page_cache_release(page);
@@ -263,7 +284,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
263{ 284{
264 struct page *page; 285 struct page *page;
265 286
266 page = find_get_page(&swapper_space, entry.val); 287 page = find_get_page(swap_address_space(entry), entry.val);
267 288
268 if (page) 289 if (page)
269 INC_CACHE_INFO(find_success); 290 INC_CACHE_INFO(find_success);
@@ -290,7 +311,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
290 * called after lookup_swap_cache() failed, re-calling 311 * called after lookup_swap_cache() failed, re-calling
291 * that would confuse statistics. 312 * that would confuse statistics.
292 */ 313 */
293 found_page = find_get_page(&swapper_space, entry.val); 314 found_page = find_get_page(swap_address_space(entry),
315 entry.val);
294 if (found_page) 316 if (found_page)
295 break; 317 break;
296 318
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e97a0e5aea91..c72c648f750c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,9 +47,11 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**);
47 47
48DEFINE_SPINLOCK(swap_lock); 48DEFINE_SPINLOCK(swap_lock);
49static unsigned int nr_swapfiles; 49static unsigned int nr_swapfiles;
50long nr_swap_pages; 50atomic_long_t nr_swap_pages;
51/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
51long total_swap_pages; 52long total_swap_pages;
52static int least_priority; 53static int least_priority;
54static atomic_t highest_priority_index = ATOMIC_INIT(-1);
53 55
54static const char Bad_file[] = "Bad swap file entry "; 56static const char Bad_file[] = "Bad swap file entry ";
55static const char Unused_file[] = "Unused swap file entry "; 57static const char Unused_file[] = "Unused swap file entry ";
@@ -79,7 +81,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
79 struct page *page; 81 struct page *page;
80 int ret = 0; 82 int ret = 0;
81 83
82 page = find_get_page(&swapper_space, entry.val); 84 page = find_get_page(swap_address_space(entry), entry.val);
83 if (!page) 85 if (!page)
84 return 0; 86 return 0;
85 /* 87 /*
@@ -223,7 +225,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
223 si->lowest_alloc = si->max; 225 si->lowest_alloc = si->max;
224 si->highest_alloc = 0; 226 si->highest_alloc = 0;
225 } 227 }
226 spin_unlock(&swap_lock); 228 spin_unlock(&si->lock);
227 229
228 /* 230 /*
229 * If seek is expensive, start searching for new cluster from 231 * If seek is expensive, start searching for new cluster from
@@ -242,7 +244,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
242 if (si->swap_map[offset]) 244 if (si->swap_map[offset])
243 last_in_cluster = offset + SWAPFILE_CLUSTER; 245 last_in_cluster = offset + SWAPFILE_CLUSTER;
244 else if (offset == last_in_cluster) { 246 else if (offset == last_in_cluster) {
245 spin_lock(&swap_lock); 247 spin_lock(&si->lock);
246 offset -= SWAPFILE_CLUSTER - 1; 248 offset -= SWAPFILE_CLUSTER - 1;
247 si->cluster_next = offset; 249 si->cluster_next = offset;
248 si->cluster_nr = SWAPFILE_CLUSTER - 1; 250 si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -263,7 +265,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
263 if (si->swap_map[offset]) 265 if (si->swap_map[offset])
264 last_in_cluster = offset + SWAPFILE_CLUSTER; 266 last_in_cluster = offset + SWAPFILE_CLUSTER;
265 else if (offset == last_in_cluster) { 267 else if (offset == last_in_cluster) {
266 spin_lock(&swap_lock); 268 spin_lock(&si->lock);
267 offset -= SWAPFILE_CLUSTER - 1; 269 offset -= SWAPFILE_CLUSTER - 1;
268 si->cluster_next = offset; 270 si->cluster_next = offset;
269 si->cluster_nr = SWAPFILE_CLUSTER - 1; 271 si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -277,7 +279,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
277 } 279 }
278 280
279 offset = scan_base; 281 offset = scan_base;
280 spin_lock(&swap_lock); 282 spin_lock(&si->lock);
281 si->cluster_nr = SWAPFILE_CLUSTER - 1; 283 si->cluster_nr = SWAPFILE_CLUSTER - 1;
282 si->lowest_alloc = 0; 284 si->lowest_alloc = 0;
283 } 285 }
@@ -293,9 +295,9 @@ checks:
293 /* reuse swap entry of cache-only swap if not busy. */ 295 /* reuse swap entry of cache-only swap if not busy. */
294 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 296 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
295 int swap_was_freed; 297 int swap_was_freed;
296 spin_unlock(&swap_lock); 298 spin_unlock(&si->lock);
297 swap_was_freed = __try_to_reclaim_swap(si, offset); 299 swap_was_freed = __try_to_reclaim_swap(si, offset);
298 spin_lock(&swap_lock); 300 spin_lock(&si->lock);
299 /* entry was freed successfully, try to use this again */ 301 /* entry was freed successfully, try to use this again */
300 if (swap_was_freed) 302 if (swap_was_freed)
301 goto checks; 303 goto checks;
@@ -335,13 +337,13 @@ checks:
335 si->lowest_alloc <= last_in_cluster) 337 si->lowest_alloc <= last_in_cluster)
336 last_in_cluster = si->lowest_alloc - 1; 338 last_in_cluster = si->lowest_alloc - 1;
337 si->flags |= SWP_DISCARDING; 339 si->flags |= SWP_DISCARDING;
338 spin_unlock(&swap_lock); 340 spin_unlock(&si->lock);
339 341
340 if (offset < last_in_cluster) 342 if (offset < last_in_cluster)
341 discard_swap_cluster(si, offset, 343 discard_swap_cluster(si, offset,
342 last_in_cluster - offset + 1); 344 last_in_cluster - offset + 1);
343 345
344 spin_lock(&swap_lock); 346 spin_lock(&si->lock);
345 si->lowest_alloc = 0; 347 si->lowest_alloc = 0;
346 si->flags &= ~SWP_DISCARDING; 348 si->flags &= ~SWP_DISCARDING;
347 349
@@ -355,10 +357,10 @@ checks:
355 * could defer that delay until swap_writepage, 357 * could defer that delay until swap_writepage,
356 * but it's easier to keep this self-contained. 358 * but it's easier to keep this self-contained.
357 */ 359 */
358 spin_unlock(&swap_lock); 360 spin_unlock(&si->lock);
359 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), 361 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
360 wait_for_discard, TASK_UNINTERRUPTIBLE); 362 wait_for_discard, TASK_UNINTERRUPTIBLE);
361 spin_lock(&swap_lock); 363 spin_lock(&si->lock);
362 } else { 364 } else {
363 /* 365 /*
364 * Note pages allocated by racing tasks while 366 * Note pages allocated by racing tasks while
@@ -374,14 +376,14 @@ checks:
374 return offset; 376 return offset;
375 377
376scan: 378scan:
377 spin_unlock(&swap_lock); 379 spin_unlock(&si->lock);
378 while (++offset <= si->highest_bit) { 380 while (++offset <= si->highest_bit) {
379 if (!si->swap_map[offset]) { 381 if (!si->swap_map[offset]) {
380 spin_lock(&swap_lock); 382 spin_lock(&si->lock);
381 goto checks; 383 goto checks;
382 } 384 }
383 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 385 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
384 spin_lock(&swap_lock); 386 spin_lock(&si->lock);
385 goto checks; 387 goto checks;
386 } 388 }
387 if (unlikely(--latency_ration < 0)) { 389 if (unlikely(--latency_ration < 0)) {
@@ -392,11 +394,11 @@ scan:
392 offset = si->lowest_bit; 394 offset = si->lowest_bit;
393 while (++offset < scan_base) { 395 while (++offset < scan_base) {
394 if (!si->swap_map[offset]) { 396 if (!si->swap_map[offset]) {
395 spin_lock(&swap_lock); 397 spin_lock(&si->lock);
396 goto checks; 398 goto checks;
397 } 399 }
398 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 400 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
399 spin_lock(&swap_lock); 401 spin_lock(&si->lock);
400 goto checks; 402 goto checks;
401 } 403 }
402 if (unlikely(--latency_ration < 0)) { 404 if (unlikely(--latency_ration < 0)) {
@@ -404,7 +406,7 @@ scan:
404 latency_ration = LATENCY_LIMIT; 406 latency_ration = LATENCY_LIMIT;
405 } 407 }
406 } 408 }
407 spin_lock(&swap_lock); 409 spin_lock(&si->lock);
408 410
409no_page: 411no_page:
410 si->flags -= SWP_SCANNING; 412 si->flags -= SWP_SCANNING;
@@ -417,13 +419,34 @@ swp_entry_t get_swap_page(void)
417 pgoff_t offset; 419 pgoff_t offset;
418 int type, next; 420 int type, next;
419 int wrapped = 0; 421 int wrapped = 0;
422 int hp_index;
420 423
421 spin_lock(&swap_lock); 424 spin_lock(&swap_lock);
422 if (nr_swap_pages <= 0) 425 if (atomic_long_read(&nr_swap_pages) <= 0)
423 goto noswap; 426 goto noswap;
424 nr_swap_pages--; 427 atomic_long_dec(&nr_swap_pages);
425 428
426 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 429 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
430 hp_index = atomic_xchg(&highest_priority_index, -1);
431 /*
432 * highest_priority_index records current highest priority swap
433 * type which just frees swap entries. If its priority is
434 * higher than that of swap_list.next swap type, we use it. It
435 * isn't protected by swap_lock, so it can be an invalid value
436 * if the corresponding swap type is swapoff. We double check
437 * the flags here. It's even possible the swap type is swapoff
438 * and swapon again and its priority is changed. In such rare
439 * case, low prority swap type might be used, but eventually
440 * high priority swap will be used after several rounds of
441 * swap.
442 */
443 if (hp_index != -1 && hp_index != type &&
444 swap_info[type]->prio < swap_info[hp_index]->prio &&
445 (swap_info[hp_index]->flags & SWP_WRITEOK)) {
446 type = hp_index;
447 swap_list.next = type;
448 }
449
427 si = swap_info[type]; 450 si = swap_info[type];
428 next = si->next; 451 next = si->next;
429 if (next < 0 || 452 if (next < 0 ||
@@ -432,22 +455,29 @@ swp_entry_t get_swap_page(void)
432 wrapped++; 455 wrapped++;
433 } 456 }
434 457
435 if (!si->highest_bit) 458 spin_lock(&si->lock);
459 if (!si->highest_bit) {
460 spin_unlock(&si->lock);
436 continue; 461 continue;
437 if (!(si->flags & SWP_WRITEOK)) 462 }
463 if (!(si->flags & SWP_WRITEOK)) {
464 spin_unlock(&si->lock);
438 continue; 465 continue;
466 }
439 467
440 swap_list.next = next; 468 swap_list.next = next;
469
470 spin_unlock(&swap_lock);
441 /* This is called for allocating swap entry for cache */ 471 /* This is called for allocating swap entry for cache */
442 offset = scan_swap_map(si, SWAP_HAS_CACHE); 472 offset = scan_swap_map(si, SWAP_HAS_CACHE);
443 if (offset) { 473 spin_unlock(&si->lock);
444 spin_unlock(&swap_lock); 474 if (offset)
445 return swp_entry(type, offset); 475 return swp_entry(type, offset);
446 } 476 spin_lock(&swap_lock);
447 next = swap_list.next; 477 next = swap_list.next;
448 } 478 }
449 479
450 nr_swap_pages++; 480 atomic_long_inc(&nr_swap_pages);
451noswap: 481noswap:
452 spin_unlock(&swap_lock); 482 spin_unlock(&swap_lock);
453 return (swp_entry_t) {0}; 483 return (swp_entry_t) {0};
@@ -459,19 +489,19 @@ swp_entry_t get_swap_page_of_type(int type)
459 struct swap_info_struct *si; 489 struct swap_info_struct *si;
460 pgoff_t offset; 490 pgoff_t offset;
461 491
462 spin_lock(&swap_lock);
463 si = swap_info[type]; 492 si = swap_info[type];
493 spin_lock(&si->lock);
464 if (si && (si->flags & SWP_WRITEOK)) { 494 if (si && (si->flags & SWP_WRITEOK)) {
465 nr_swap_pages--; 495 atomic_long_dec(&nr_swap_pages);
466 /* This is called for allocating swap entry, not cache */ 496 /* This is called for allocating swap entry, not cache */
467 offset = scan_swap_map(si, 1); 497 offset = scan_swap_map(si, 1);
468 if (offset) { 498 if (offset) {
469 spin_unlock(&swap_lock); 499 spin_unlock(&si->lock);
470 return swp_entry(type, offset); 500 return swp_entry(type, offset);
471 } 501 }
472 nr_swap_pages++; 502 atomic_long_inc(&nr_swap_pages);
473 } 503 }
474 spin_unlock(&swap_lock); 504 spin_unlock(&si->lock);
475 return (swp_entry_t) {0}; 505 return (swp_entry_t) {0};
476} 506}
477 507
@@ -493,7 +523,7 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
493 goto bad_offset; 523 goto bad_offset;
494 if (!p->swap_map[offset]) 524 if (!p->swap_map[offset])
495 goto bad_free; 525 goto bad_free;
496 spin_lock(&swap_lock); 526 spin_lock(&p->lock);
497 return p; 527 return p;
498 528
499bad_free: 529bad_free:
@@ -511,6 +541,27 @@ out:
511 return NULL; 541 return NULL;
512} 542}
513 543
544/*
545 * This swap type frees swap entry, check if it is the highest priority swap
546 * type which just frees swap entry. get_swap_page() uses
547 * highest_priority_index to search highest priority swap type. The
548 * swap_info_struct.lock can't protect us if there are multiple swap types
549 * active, so we use atomic_cmpxchg.
550 */
551static void set_highest_priority_index(int type)
552{
553 int old_hp_index, new_hp_index;
554
555 do {
556 old_hp_index = atomic_read(&highest_priority_index);
557 if (old_hp_index != -1 &&
558 swap_info[old_hp_index]->prio >= swap_info[type]->prio)
559 break;
560 new_hp_index = type;
561 } while (atomic_cmpxchg(&highest_priority_index,
562 old_hp_index, new_hp_index) != old_hp_index);
563}
564
514static unsigned char swap_entry_free(struct swap_info_struct *p, 565static unsigned char swap_entry_free(struct swap_info_struct *p,
515 swp_entry_t entry, unsigned char usage) 566 swp_entry_t entry, unsigned char usage)
516{ 567{
@@ -553,10 +604,8 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
553 p->lowest_bit = offset; 604 p->lowest_bit = offset;
554 if (offset > p->highest_bit) 605 if (offset > p->highest_bit)
555 p->highest_bit = offset; 606 p->highest_bit = offset;
556 if (swap_list.next >= 0 && 607 set_highest_priority_index(p->type);
557 p->prio > swap_info[swap_list.next]->prio) 608 atomic_long_inc(&nr_swap_pages);
558 swap_list.next = p->type;
559 nr_swap_pages++;
560 p->inuse_pages--; 609 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset); 610 frontswap_invalidate_page(p->type, offset);
562 if (p->flags & SWP_BLKDEV) { 611 if (p->flags & SWP_BLKDEV) {
@@ -581,7 +630,7 @@ void swap_free(swp_entry_t entry)
581 p = swap_info_get(entry); 630 p = swap_info_get(entry);
582 if (p) { 631 if (p) {
583 swap_entry_free(p, entry, 1); 632 swap_entry_free(p, entry, 1);
584 spin_unlock(&swap_lock); 633 spin_unlock(&p->lock);
585 } 634 }
586} 635}
587 636
@@ -598,7 +647,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
598 count = swap_entry_free(p, entry, SWAP_HAS_CACHE); 647 count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
599 if (page) 648 if (page)
600 mem_cgroup_uncharge_swapcache(page, entry, count != 0); 649 mem_cgroup_uncharge_swapcache(page, entry, count != 0);
601 spin_unlock(&swap_lock); 650 spin_unlock(&p->lock);
602 } 651 }
603} 652}
604 653
@@ -617,7 +666,7 @@ int page_swapcount(struct page *page)
617 p = swap_info_get(entry); 666 p = swap_info_get(entry);
618 if (p) { 667 if (p) {
619 count = swap_count(p->swap_map[swp_offset(entry)]); 668 count = swap_count(p->swap_map[swp_offset(entry)]);
620 spin_unlock(&swap_lock); 669 spin_unlock(&p->lock);
621 } 670 }
622 return count; 671 return count;
623} 672}
@@ -699,13 +748,14 @@ int free_swap_and_cache(swp_entry_t entry)
699 p = swap_info_get(entry); 748 p = swap_info_get(entry);
700 if (p) { 749 if (p) {
701 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { 750 if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
702 page = find_get_page(&swapper_space, entry.val); 751 page = find_get_page(swap_address_space(entry),
752 entry.val);
703 if (page && !trylock_page(page)) { 753 if (page && !trylock_page(page)) {
704 page_cache_release(page); 754 page_cache_release(page);
705 page = NULL; 755 page = NULL;
706 } 756 }
707 } 757 }
708 spin_unlock(&swap_lock); 758 spin_unlock(&p->lock);
709 } 759 }
710 if (page) { 760 if (page) {
711 /* 761 /*
@@ -803,11 +853,13 @@ unsigned int count_swap_pages(int type, int free)
803 if ((unsigned int)type < nr_swapfiles) { 853 if ((unsigned int)type < nr_swapfiles) {
804 struct swap_info_struct *sis = swap_info[type]; 854 struct swap_info_struct *sis = swap_info[type];
805 855
856 spin_lock(&sis->lock);
806 if (sis->flags & SWP_WRITEOK) { 857 if (sis->flags & SWP_WRITEOK) {
807 n = sis->pages; 858 n = sis->pages;
808 if (free) 859 if (free)
809 n -= sis->inuse_pages; 860 n -= sis->inuse_pages;
810 } 861 }
862 spin_unlock(&sis->lock);
811 } 863 }
812 spin_unlock(&swap_lock); 864 spin_unlock(&swap_lock);
813 return n; 865 return n;
@@ -822,11 +874,17 @@ unsigned int count_swap_pages(int type, int free)
822static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 874static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
823 unsigned long addr, swp_entry_t entry, struct page *page) 875 unsigned long addr, swp_entry_t entry, struct page *page)
824{ 876{
877 struct page *swapcache;
825 struct mem_cgroup *memcg; 878 struct mem_cgroup *memcg;
826 spinlock_t *ptl; 879 spinlock_t *ptl;
827 pte_t *pte; 880 pte_t *pte;
828 int ret = 1; 881 int ret = 1;
829 882
883 swapcache = page;
884 page = ksm_might_need_to_copy(page, vma, addr);
885 if (unlikely(!page))
886 return -ENOMEM;
887
830 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, 888 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
831 GFP_KERNEL, &memcg)) { 889 GFP_KERNEL, &memcg)) {
832 ret = -ENOMEM; 890 ret = -ENOMEM;
@@ -845,7 +903,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
845 get_page(page); 903 get_page(page);
846 set_pte_at(vma->vm_mm, addr, pte, 904 set_pte_at(vma->vm_mm, addr, pte,
847 pte_mkold(mk_pte(page, vma->vm_page_prot))); 905 pte_mkold(mk_pte(page, vma->vm_page_prot)));
848 page_add_anon_rmap(page, vma, addr); 906 if (page == swapcache)
907 page_add_anon_rmap(page, vma, addr);
908 else /* ksm created a completely new copy */
909 page_add_new_anon_rmap(page, vma, addr);
849 mem_cgroup_commit_charge_swapin(page, memcg); 910 mem_cgroup_commit_charge_swapin(page, memcg);
850 swap_free(entry); 911 swap_free(entry);
851 /* 912 /*
@@ -856,6 +917,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
856out: 917out:
857 pte_unmap_unlock(pte, ptl); 918 pte_unmap_unlock(pte, ptl);
858out_nolock: 919out_nolock:
920 if (page != swapcache) {
921 unlock_page(page);
922 put_page(page);
923 }
859 return ret; 924 return ret;
860} 925}
861 926
@@ -1456,7 +1521,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
1456 p->swap_map = swap_map; 1521 p->swap_map = swap_map;
1457 frontswap_map_set(p, frontswap_map); 1522 frontswap_map_set(p, frontswap_map);
1458 p->flags |= SWP_WRITEOK; 1523 p->flags |= SWP_WRITEOK;
1459 nr_swap_pages += p->pages; 1524 atomic_long_add(p->pages, &nr_swap_pages);
1460 total_swap_pages += p->pages; 1525 total_swap_pages += p->pages;
1461 1526
1462 /* insert swap space into swap_list: */ 1527 /* insert swap space into swap_list: */
@@ -1478,15 +1543,19 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1478 unsigned long *frontswap_map) 1543 unsigned long *frontswap_map)
1479{ 1544{
1480 spin_lock(&swap_lock); 1545 spin_lock(&swap_lock);
1546 spin_lock(&p->lock);
1481 _enable_swap_info(p, prio, swap_map, frontswap_map); 1547 _enable_swap_info(p, prio, swap_map, frontswap_map);
1482 frontswap_init(p->type); 1548 frontswap_init(p->type);
1549 spin_unlock(&p->lock);
1483 spin_unlock(&swap_lock); 1550 spin_unlock(&swap_lock);
1484} 1551}
1485 1552
1486static void reinsert_swap_info(struct swap_info_struct *p) 1553static void reinsert_swap_info(struct swap_info_struct *p)
1487{ 1554{
1488 spin_lock(&swap_lock); 1555 spin_lock(&swap_lock);
1556 spin_lock(&p->lock);
1489 _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); 1557 _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1558 spin_unlock(&p->lock);
1490 spin_unlock(&swap_lock); 1559 spin_unlock(&swap_lock);
1491} 1560}
1492 1561
@@ -1546,14 +1615,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1546 /* just pick something that's safe... */ 1615 /* just pick something that's safe... */
1547 swap_list.next = swap_list.head; 1616 swap_list.next = swap_list.head;
1548 } 1617 }
1618 spin_lock(&p->lock);
1549 if (p->prio < 0) { 1619 if (p->prio < 0) {
1550 for (i = p->next; i >= 0; i = swap_info[i]->next) 1620 for (i = p->next; i >= 0; i = swap_info[i]->next)
1551 swap_info[i]->prio = p->prio--; 1621 swap_info[i]->prio = p->prio--;
1552 least_priority++; 1622 least_priority++;
1553 } 1623 }
1554 nr_swap_pages -= p->pages; 1624 atomic_long_sub(p->pages, &nr_swap_pages);
1555 total_swap_pages -= p->pages; 1625 total_swap_pages -= p->pages;
1556 p->flags &= ~SWP_WRITEOK; 1626 p->flags &= ~SWP_WRITEOK;
1627 spin_unlock(&p->lock);
1557 spin_unlock(&swap_lock); 1628 spin_unlock(&swap_lock);
1558 1629
1559 set_current_oom_origin(); 1630 set_current_oom_origin();
@@ -1572,14 +1643,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1572 1643
1573 mutex_lock(&swapon_mutex); 1644 mutex_lock(&swapon_mutex);
1574 spin_lock(&swap_lock); 1645 spin_lock(&swap_lock);
1646 spin_lock(&p->lock);
1575 drain_mmlist(); 1647 drain_mmlist();
1576 1648
1577 /* wait for anyone still in scan_swap_map */ 1649 /* wait for anyone still in scan_swap_map */
1578 p->highest_bit = 0; /* cuts scans short */ 1650 p->highest_bit = 0; /* cuts scans short */
1579 while (p->flags >= SWP_SCANNING) { 1651 while (p->flags >= SWP_SCANNING) {
1652 spin_unlock(&p->lock);
1580 spin_unlock(&swap_lock); 1653 spin_unlock(&swap_lock);
1581 schedule_timeout_uninterruptible(1); 1654 schedule_timeout_uninterruptible(1);
1582 spin_lock(&swap_lock); 1655 spin_lock(&swap_lock);
1656 spin_lock(&p->lock);
1583 } 1657 }
1584 1658
1585 swap_file = p->swap_file; 1659 swap_file = p->swap_file;
@@ -1589,6 +1663,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1589 p->swap_map = NULL; 1663 p->swap_map = NULL;
1590 p->flags = 0; 1664 p->flags = 0;
1591 frontswap_invalidate_area(type); 1665 frontswap_invalidate_area(type);
1666 spin_unlock(&p->lock);
1592 spin_unlock(&swap_lock); 1667 spin_unlock(&swap_lock);
1593 mutex_unlock(&swapon_mutex); 1668 mutex_unlock(&swapon_mutex);
1594 vfree(swap_map); 1669 vfree(swap_map);
@@ -1794,6 +1869,7 @@ static struct swap_info_struct *alloc_swap_info(void)
1794 p->flags = SWP_USED; 1869 p->flags = SWP_USED;
1795 p->next = -1; 1870 p->next = -1;
1796 spin_unlock(&swap_lock); 1871 spin_unlock(&swap_lock);
1872 spin_lock_init(&p->lock);
1797 1873
1798 return p; 1874 return p;
1799} 1875}
@@ -2116,7 +2192,7 @@ void si_swapinfo(struct sysinfo *val)
2116 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) 2192 if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
2117 nr_to_be_unused += si->inuse_pages; 2193 nr_to_be_unused += si->inuse_pages;
2118 } 2194 }
2119 val->freeswap = nr_swap_pages + nr_to_be_unused; 2195 val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
2120 val->totalswap = total_swap_pages + nr_to_be_unused; 2196 val->totalswap = total_swap_pages + nr_to_be_unused;
2121 spin_unlock(&swap_lock); 2197 spin_unlock(&swap_lock);
2122} 2198}
@@ -2149,7 +2225,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2149 p = swap_info[type]; 2225 p = swap_info[type];
2150 offset = swp_offset(entry); 2226 offset = swp_offset(entry);
2151 2227
2152 spin_lock(&swap_lock); 2228 spin_lock(&p->lock);
2153 if (unlikely(offset >= p->max)) 2229 if (unlikely(offset >= p->max))
2154 goto unlock_out; 2230 goto unlock_out;
2155 2231
@@ -2184,7 +2260,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2184 p->swap_map[offset] = count | has_cache; 2260 p->swap_map[offset] = count | has_cache;
2185 2261
2186unlock_out: 2262unlock_out:
2187 spin_unlock(&swap_lock); 2263 spin_unlock(&p->lock);
2188out: 2264out:
2189 return err; 2265 return err;
2190 2266
@@ -2309,7 +2385,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2309 } 2385 }
2310 2386
2311 if (!page) { 2387 if (!page) {
2312 spin_unlock(&swap_lock); 2388 spin_unlock(&si->lock);
2313 return -ENOMEM; 2389 return -ENOMEM;
2314 } 2390 }
2315 2391
@@ -2357,7 +2433,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
2357 list_add_tail(&page->lru, &head->lru); 2433 list_add_tail(&page->lru, &head->lru);
2358 page = NULL; /* now it's attached, don't free it */ 2434 page = NULL; /* now it's attached, don't free it */
2359out: 2435out:
2360 spin_unlock(&swap_lock); 2436 spin_unlock(&si->lock);
2361outer: 2437outer:
2362 if (page) 2438 if (page)
2363 __free_page(page); 2439 __free_page(page);
diff --git a/mm/util.c b/mm/util.c
index c55e26b17d93..ab1424dbe2e6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -5,6 +5,8 @@
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/swap.h>
9#include <linux/swapops.h>
8#include <asm/uaccess.h> 10#include <asm/uaccess.h>
9 11
10#include "internal.h" 12#include "internal.h"
@@ -355,12 +357,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
355{ 357{
356 unsigned long ret; 358 unsigned long ret;
357 struct mm_struct *mm = current->mm; 359 struct mm_struct *mm = current->mm;
360 unsigned long populate;
358 361
359 ret = security_mmap_file(file, prot, flag); 362 ret = security_mmap_file(file, prot, flag);
360 if (!ret) { 363 if (!ret) {
361 down_write(&mm->mmap_sem); 364 down_write(&mm->mmap_sem);
362 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff); 365 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
366 &populate);
363 up_write(&mm->mmap_sem); 367 up_write(&mm->mmap_sem);
368 if (populate)
369 mm_populate(ret, populate);
364 } 370 }
365 return ret; 371 return ret;
366} 372}
@@ -378,6 +384,24 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
378} 384}
379EXPORT_SYMBOL(vm_mmap); 385EXPORT_SYMBOL(vm_mmap);
380 386
387struct address_space *page_mapping(struct page *page)
388{
389 struct address_space *mapping = page->mapping;
390
391 VM_BUG_ON(PageSlab(page));
392#ifdef CONFIG_SWAP
393 if (unlikely(PageSwapCache(page))) {
394 swp_entry_t entry;
395
396 entry.val = page_private(page);
397 mapping = swap_address_space(entry);
398 } else
399#endif
400 if ((unsigned long)mapping & PAGE_MAPPING_ANON)
401 mapping = NULL;
402 return mapping;
403}
404
381/* Tracepoints definitions. */ 405/* Tracepoints definitions. */
382EXPORT_TRACEPOINT_SYMBOL(kmalloc); 406EXPORT_TRACEPOINT_SYMBOL(kmalloc);
383EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 407EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5123a169ab7b..0f751f2068c3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1376,8 +1376,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1376struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1376struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1377 unsigned long start, unsigned long end) 1377 unsigned long start, unsigned long end)
1378{ 1378{
1379 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1379 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
1380 __builtin_return_address(0)); 1380 GFP_KERNEL, __builtin_return_address(0));
1381} 1381}
1382EXPORT_SYMBOL_GPL(__get_vm_area); 1382EXPORT_SYMBOL_GPL(__get_vm_area);
1383 1383
@@ -1385,8 +1385,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1385 unsigned long start, unsigned long end, 1385 unsigned long start, unsigned long end,
1386 const void *caller) 1386 const void *caller)
1387{ 1387{
1388 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1388 return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
1389 caller); 1389 GFP_KERNEL, caller);
1390} 1390}
1391 1391
1392/** 1392/**
@@ -1401,14 +1401,15 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1401struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 1401struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1402{ 1402{
1403 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1403 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1404 -1, GFP_KERNEL, __builtin_return_address(0)); 1404 NUMA_NO_NODE, GFP_KERNEL,
1405 __builtin_return_address(0));
1405} 1406}
1406 1407
1407struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1408struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1408 const void *caller) 1409 const void *caller)
1409{ 1410{
1410 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1411 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1411 -1, GFP_KERNEL, caller); 1412 NUMA_NO_NODE, GFP_KERNEL, caller);
1412} 1413}
1413 1414
1414/** 1415/**
@@ -1650,7 +1651,7 @@ fail:
1650 * @end: vm area range end 1651 * @end: vm area range end
1651 * @gfp_mask: flags for the page level allocator 1652 * @gfp_mask: flags for the page level allocator
1652 * @prot: protection mask for the allocated pages 1653 * @prot: protection mask for the allocated pages
1653 * @node: node to use for allocation or -1 1654 * @node: node to use for allocation or NUMA_NO_NODE
1654 * @caller: caller's return address 1655 * @caller: caller's return address
1655 * 1656 *
1656 * Allocate enough pages to cover @size from the page level 1657 * Allocate enough pages to cover @size from the page level
@@ -1706,7 +1707,7 @@ fail:
1706 * @align: desired alignment 1707 * @align: desired alignment
1707 * @gfp_mask: flags for the page level allocator 1708 * @gfp_mask: flags for the page level allocator
1708 * @prot: protection mask for the allocated pages 1709 * @prot: protection mask for the allocated pages
1709 * @node: node to use for allocation or -1 1710 * @node: node to use for allocation or NUMA_NO_NODE
1710 * @caller: caller's return address 1711 * @caller: caller's return address
1711 * 1712 *
1712 * Allocate enough pages to cover @size from the page level 1713 * Allocate enough pages to cover @size from the page level
@@ -1723,7 +1724,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1723 1724
1724void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1725void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1725{ 1726{
1726 return __vmalloc_node(size, 1, gfp_mask, prot, -1, 1727 return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
1727 __builtin_return_address(0)); 1728 __builtin_return_address(0));
1728} 1729}
1729EXPORT_SYMBOL(__vmalloc); 1730EXPORT_SYMBOL(__vmalloc);
@@ -1746,7 +1747,8 @@ static inline void *__vmalloc_node_flags(unsigned long size,
1746 */ 1747 */
1747void *vmalloc(unsigned long size) 1748void *vmalloc(unsigned long size)
1748{ 1749{
1749 return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); 1750 return __vmalloc_node_flags(size, NUMA_NO_NODE,
1751 GFP_KERNEL | __GFP_HIGHMEM);
1750} 1752}
1751EXPORT_SYMBOL(vmalloc); 1753EXPORT_SYMBOL(vmalloc);
1752 1754
@@ -1762,7 +1764,7 @@ EXPORT_SYMBOL(vmalloc);
1762 */ 1764 */
1763void *vzalloc(unsigned long size) 1765void *vzalloc(unsigned long size)
1764{ 1766{
1765 return __vmalloc_node_flags(size, -1, 1767 return __vmalloc_node_flags(size, NUMA_NO_NODE,
1766 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); 1768 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1767} 1769}
1768EXPORT_SYMBOL(vzalloc); 1770EXPORT_SYMBOL(vzalloc);
@@ -1781,7 +1783,8 @@ void *vmalloc_user(unsigned long size)
1781 1783
1782 ret = __vmalloc_node(size, SHMLBA, 1784 ret = __vmalloc_node(size, SHMLBA,
1783 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 1785 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1784 PAGE_KERNEL, -1, __builtin_return_address(0)); 1786 PAGE_KERNEL, NUMA_NO_NODE,
1787 __builtin_return_address(0));
1785 if (ret) { 1788 if (ret) {
1786 area = find_vm_area(ret); 1789 area = find_vm_area(ret);
1787 area->flags |= VM_USERMAP; 1790 area->flags |= VM_USERMAP;
@@ -1846,7 +1849,7 @@ EXPORT_SYMBOL(vzalloc_node);
1846void *vmalloc_exec(unsigned long size) 1849void *vmalloc_exec(unsigned long size)
1847{ 1850{
1848 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, 1851 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
1849 -1, __builtin_return_address(0)); 1852 NUMA_NO_NODE, __builtin_return_address(0));
1850} 1853}
1851 1854
1852#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 1855#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
@@ -1867,7 +1870,7 @@ void *vmalloc_exec(unsigned long size)
1867void *vmalloc_32(unsigned long size) 1870void *vmalloc_32(unsigned long size)
1868{ 1871{
1869 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, 1872 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
1870 -1, __builtin_return_address(0)); 1873 NUMA_NO_NODE, __builtin_return_address(0));
1871} 1874}
1872EXPORT_SYMBOL(vmalloc_32); 1875EXPORT_SYMBOL(vmalloc_32);
1873 1876
@@ -1884,7 +1887,7 @@ void *vmalloc_32_user(unsigned long size)
1884 void *ret; 1887 void *ret;
1885 1888
1886 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 1889 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1887 -1, __builtin_return_address(0)); 1890 NUMA_NO_NODE, __builtin_return_address(0));
1888 if (ret) { 1891 if (ret) {
1889 area = find_vm_area(ret); 1892 area = find_vm_area(ret);
1890 area->flags |= VM_USERMAP; 1893 area->flags |= VM_USERMAP;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 196709f5ee58..88c5fed8b9a4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -128,7 +128,7 @@ struct scan_control {
128 * From 0 .. 100. Higher means more swappy. 128 * From 0 .. 100. Higher means more swappy.
129 */ 129 */
130int vm_swappiness = 60; 130int vm_swappiness = 60;
131long vm_total_pages; /* The total number of pages which the VM controls */ 131unsigned long vm_total_pages; /* The total number of pages which the VM controls */
132 132
133static LIST_HEAD(shrinker_list); 133static LIST_HEAD(shrinker_list);
134static DECLARE_RWSEM(shrinker_rwsem); 134static DECLARE_RWSEM(shrinker_rwsem);
@@ -1579,16 +1579,6 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec)
1579} 1579}
1580#endif 1580#endif
1581 1581
1582static int inactive_file_is_low_global(struct zone *zone)
1583{
1584 unsigned long active, inactive;
1585
1586 active = zone_page_state(zone, NR_ACTIVE_FILE);
1587 inactive = zone_page_state(zone, NR_INACTIVE_FILE);
1588
1589 return (active > inactive);
1590}
1591
1592/** 1582/**
1593 * inactive_file_is_low - check if file pages need to be deactivated 1583 * inactive_file_is_low - check if file pages need to be deactivated
1594 * @lruvec: LRU vector to check 1584 * @lruvec: LRU vector to check
@@ -1605,10 +1595,13 @@ static int inactive_file_is_low_global(struct zone *zone)
1605 */ 1595 */
1606static int inactive_file_is_low(struct lruvec *lruvec) 1596static int inactive_file_is_low(struct lruvec *lruvec)
1607{ 1597{
1608 if (!mem_cgroup_disabled()) 1598 unsigned long inactive;
1609 return mem_cgroup_inactive_file_is_low(lruvec); 1599 unsigned long active;
1600
1601 inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
1602 active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
1610 1603
1611 return inactive_file_is_low_global(lruvec_zone(lruvec)); 1604 return active > inactive;
1612} 1605}
1613 1606
1614static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) 1607static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
@@ -1638,6 +1631,13 @@ static int vmscan_swappiness(struct scan_control *sc)
1638 return mem_cgroup_swappiness(sc->target_mem_cgroup); 1631 return mem_cgroup_swappiness(sc->target_mem_cgroup);
1639} 1632}
1640 1633
1634enum scan_balance {
1635 SCAN_EQUAL,
1636 SCAN_FRACT,
1637 SCAN_ANON,
1638 SCAN_FILE,
1639};
1640
1641/* 1641/*
1642 * Determine how aggressively the anon and file LRU lists should be 1642 * Determine how aggressively the anon and file LRU lists should be
1643 * scanned. The relative value of each set of LRU lists is determined 1643 * scanned. The relative value of each set of LRU lists is determined
@@ -1650,15 +1650,16 @@ static int vmscan_swappiness(struct scan_control *sc)
1650static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, 1650static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1651 unsigned long *nr) 1651 unsigned long *nr)
1652{ 1652{
1653 unsigned long anon, file, free; 1653 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1654 u64 fraction[2];
1655 u64 denominator = 0; /* gcc */
1656 struct zone *zone = lruvec_zone(lruvec);
1654 unsigned long anon_prio, file_prio; 1657 unsigned long anon_prio, file_prio;
1658 enum scan_balance scan_balance;
1659 unsigned long anon, file, free;
1660 bool force_scan = false;
1655 unsigned long ap, fp; 1661 unsigned long ap, fp;
1656 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1657 u64 fraction[2], denominator;
1658 enum lru_list lru; 1662 enum lru_list lru;
1659 int noswap = 0;
1660 bool force_scan = false;
1661 struct zone *zone = lruvec_zone(lruvec);
1662 1663
1663 /* 1664 /*
1664 * If the zone or memcg is small, nr[l] can be 0. This 1665 * If the zone or memcg is small, nr[l] can be 0. This
@@ -1676,11 +1677,30 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1676 force_scan = true; 1677 force_scan = true;
1677 1678
1678 /* If we have no swap space, do not bother scanning anon pages. */ 1679 /* If we have no swap space, do not bother scanning anon pages. */
1679 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1680 if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
1680 noswap = 1; 1681 scan_balance = SCAN_FILE;
1681 fraction[0] = 0; 1682 goto out;
1682 fraction[1] = 1; 1683 }
1683 denominator = 1; 1684
1685 /*
1686 * Global reclaim will swap to prevent OOM even with no
1687 * swappiness, but memcg users want to use this knob to
1688 * disable swapping for individual groups completely when
1689 * using the memory controller's swap limit feature would be
1690 * too expensive.
1691 */
1692 if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
1693 scan_balance = SCAN_FILE;
1694 goto out;
1695 }
1696
1697 /*
1698 * Do not apply any pressure balancing cleverness when the
1699 * system is close to OOM, scan both anon and file equally
1700 * (unless the swappiness setting disagrees with swapping).
1701 */
1702 if (!sc->priority && vmscan_swappiness(sc)) {
1703 scan_balance = SCAN_EQUAL;
1684 goto out; 1704 goto out;
1685 } 1705 }
1686 1706
@@ -1689,30 +1709,32 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1689 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + 1709 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1690 get_lru_size(lruvec, LRU_INACTIVE_FILE); 1710 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1691 1711
1712 /*
1713 * If it's foreseeable that reclaiming the file cache won't be
1714 * enough to get the zone back into a desirable shape, we have
1715 * to swap. Better start now and leave the - probably heavily
1716 * thrashing - remaining file pages alone.
1717 */
1692 if (global_reclaim(sc)) { 1718 if (global_reclaim(sc)) {
1693 free = zone_page_state(zone, NR_FREE_PAGES); 1719 free = zone_page_state(zone, NR_FREE_PAGES);
1694 if (unlikely(file + free <= high_wmark_pages(zone))) { 1720 if (unlikely(file + free <= high_wmark_pages(zone))) {
1695 /* 1721 scan_balance = SCAN_ANON;
1696 * If we have very few page cache pages, force-scan
1697 * anon pages.
1698 */
1699 fraction[0] = 1;
1700 fraction[1] = 0;
1701 denominator = 1;
1702 goto out;
1703 } else if (!inactive_file_is_low_global(zone)) {
1704 /*
1705 * There is enough inactive page cache, do not
1706 * reclaim anything from the working set right now.
1707 */
1708 fraction[0] = 0;
1709 fraction[1] = 1;
1710 denominator = 1;
1711 goto out; 1722 goto out;
1712 } 1723 }
1713 } 1724 }
1714 1725
1715 /* 1726 /*
1727 * There is enough inactive page cache, do not reclaim
1728 * anything from the anonymous working set right now.
1729 */
1730 if (!inactive_file_is_low(lruvec)) {
1731 scan_balance = SCAN_FILE;
1732 goto out;
1733 }
1734
1735 scan_balance = SCAN_FRACT;
1736
1737 /*
1716 * With swappiness at 100, anonymous and file have the same priority. 1738 * With swappiness at 100, anonymous and file have the same priority.
1717 * This scanning priority is essentially the inverse of IO cost. 1739 * This scanning priority is essentially the inverse of IO cost.
1718 */ 1740 */
@@ -1759,19 +1781,92 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1759out: 1781out:
1760 for_each_evictable_lru(lru) { 1782 for_each_evictable_lru(lru) {
1761 int file = is_file_lru(lru); 1783 int file = is_file_lru(lru);
1784 unsigned long size;
1762 unsigned long scan; 1785 unsigned long scan;
1763 1786
1764 scan = get_lru_size(lruvec, lru); 1787 size = get_lru_size(lruvec, lru);
1765 if (sc->priority || noswap || !vmscan_swappiness(sc)) { 1788 scan = size >> sc->priority;
1766 scan >>= sc->priority; 1789
1767 if (!scan && force_scan) 1790 if (!scan && force_scan)
1768 scan = SWAP_CLUSTER_MAX; 1791 scan = min(size, SWAP_CLUSTER_MAX);
1792
1793 switch (scan_balance) {
1794 case SCAN_EQUAL:
1795 /* Scan lists relative to size */
1796 break;
1797 case SCAN_FRACT:
1798 /*
1799 * Scan types proportional to swappiness and
1800 * their relative recent reclaim efficiency.
1801 */
1769 scan = div64_u64(scan * fraction[file], denominator); 1802 scan = div64_u64(scan * fraction[file], denominator);
1803 break;
1804 case SCAN_FILE:
1805 case SCAN_ANON:
1806 /* Scan one type exclusively */
1807 if ((scan_balance == SCAN_FILE) != file)
1808 scan = 0;
1809 break;
1810 default:
1811 /* Look ma, no brain */
1812 BUG();
1770 } 1813 }
1771 nr[lru] = scan; 1814 nr[lru] = scan;
1772 } 1815 }
1773} 1816}
1774 1817
1818/*
1819 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1820 */
1821static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1822{
1823 unsigned long nr[NR_LRU_LISTS];
1824 unsigned long nr_to_scan;
1825 enum lru_list lru;
1826 unsigned long nr_reclaimed = 0;
1827 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1828 struct blk_plug plug;
1829
1830 get_scan_count(lruvec, sc, nr);
1831
1832 blk_start_plug(&plug);
1833 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1834 nr[LRU_INACTIVE_FILE]) {
1835 for_each_evictable_lru(lru) {
1836 if (nr[lru]) {
1837 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
1838 nr[lru] -= nr_to_scan;
1839
1840 nr_reclaimed += shrink_list(lru, nr_to_scan,
1841 lruvec, sc);
1842 }
1843 }
1844 /*
1845 * On large memory systems, scan >> priority can become
1846 * really large. This is fine for the starting priority;
1847 * we want to put equal scanning pressure on each zone.
1848 * However, if the VM has a harder time of freeing pages,
1849 * with multiple processes reclaiming pages, the total
1850 * freeing target can get unreasonably large.
1851 */
1852 if (nr_reclaimed >= nr_to_reclaim &&
1853 sc->priority < DEF_PRIORITY)
1854 break;
1855 }
1856 blk_finish_plug(&plug);
1857 sc->nr_reclaimed += nr_reclaimed;
1858
1859 /*
1860 * Even if we did not try to evict anon pages at all, we want to
1861 * rebalance the anon lru active/inactive ratio.
1862 */
1863 if (inactive_anon_is_low(lruvec))
1864 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
1865 sc, LRU_ACTIVE_ANON);
1866
1867 throttle_vm_writeout(sc->gfp_mask);
1868}
1869
1775/* Use reclaim/compaction for costly allocs or under memory pressure */ 1870/* Use reclaim/compaction for costly allocs or under memory pressure */
1776static bool in_reclaim_compaction(struct scan_control *sc) 1871static bool in_reclaim_compaction(struct scan_control *sc)
1777{ 1872{
@@ -1790,7 +1885,7 @@ static bool in_reclaim_compaction(struct scan_control *sc)
1790 * calls try_to_compact_zone() that it will have enough free pages to succeed. 1885 * calls try_to_compact_zone() that it will have enough free pages to succeed.
1791 * It will give up earlier than that if there is difficulty reclaiming pages. 1886 * It will give up earlier than that if there is difficulty reclaiming pages.
1792 */ 1887 */
1793static inline bool should_continue_reclaim(struct lruvec *lruvec, 1888static inline bool should_continue_reclaim(struct zone *zone,
1794 unsigned long nr_reclaimed, 1889 unsigned long nr_reclaimed,
1795 unsigned long nr_scanned, 1890 unsigned long nr_scanned,
1796 struct scan_control *sc) 1891 struct scan_control *sc)
@@ -1830,15 +1925,15 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
1830 * inactive lists are large enough, continue reclaiming 1925 * inactive lists are large enough, continue reclaiming
1831 */ 1926 */
1832 pages_for_compaction = (2UL << sc->order); 1927 pages_for_compaction = (2UL << sc->order);
1833 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); 1928 inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
1834 if (nr_swap_pages > 0) 1929 if (get_nr_swap_pages() > 0)
1835 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); 1930 inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
1836 if (sc->nr_reclaimed < pages_for_compaction && 1931 if (sc->nr_reclaimed < pages_for_compaction &&
1837 inactive_lru_pages > pages_for_compaction) 1932 inactive_lru_pages > pages_for_compaction)
1838 return true; 1933 return true;
1839 1934
1840 /* If compaction would go ahead or the allocation would succeed, stop */ 1935 /* If compaction would go ahead or the allocation would succeed, stop */
1841 switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) { 1936 switch (compaction_suitable(zone, sc->order)) {
1842 case COMPACT_PARTIAL: 1937 case COMPACT_PARTIAL:
1843 case COMPACT_CONTINUE: 1938 case COMPACT_CONTINUE:
1844 return false; 1939 return false;
@@ -1847,98 +1942,48 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
1847 } 1942 }
1848} 1943}
1849 1944
1850/* 1945static void shrink_zone(struct zone *zone, struct scan_control *sc)
1851 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1852 */
1853static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1854{ 1946{
1855 unsigned long nr[NR_LRU_LISTS];
1856 unsigned long nr_to_scan;
1857 enum lru_list lru;
1858 unsigned long nr_reclaimed, nr_scanned; 1947 unsigned long nr_reclaimed, nr_scanned;
1859 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1860 struct blk_plug plug;
1861
1862restart:
1863 nr_reclaimed = 0;
1864 nr_scanned = sc->nr_scanned;
1865 get_scan_count(lruvec, sc, nr);
1866
1867 blk_start_plug(&plug);
1868 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1869 nr[LRU_INACTIVE_FILE]) {
1870 for_each_evictable_lru(lru) {
1871 if (nr[lru]) {
1872 nr_to_scan = min_t(unsigned long,
1873 nr[lru], SWAP_CLUSTER_MAX);
1874 nr[lru] -= nr_to_scan;
1875
1876 nr_reclaimed += shrink_list(lru, nr_to_scan,
1877 lruvec, sc);
1878 }
1879 }
1880 /*
1881 * On large memory systems, scan >> priority can become
1882 * really large. This is fine for the starting priority;
1883 * we want to put equal scanning pressure on each zone.
1884 * However, if the VM has a harder time of freeing pages,
1885 * with multiple processes reclaiming pages, the total
1886 * freeing target can get unreasonably large.
1887 */
1888 if (nr_reclaimed >= nr_to_reclaim &&
1889 sc->priority < DEF_PRIORITY)
1890 break;
1891 }
1892 blk_finish_plug(&plug);
1893 sc->nr_reclaimed += nr_reclaimed;
1894 1948
1895 /* 1949 do {
1896 * Even if we did not try to evict anon pages at all, we want to 1950 struct mem_cgroup *root = sc->target_mem_cgroup;
1897 * rebalance the anon lru active/inactive ratio. 1951 struct mem_cgroup_reclaim_cookie reclaim = {
1898 */ 1952 .zone = zone,
1899 if (inactive_anon_is_low(lruvec)) 1953 .priority = sc->priority,
1900 shrink_active_list(SWAP_CLUSTER_MAX, lruvec, 1954 };
1901 sc, LRU_ACTIVE_ANON); 1955 struct mem_cgroup *memcg;
1902
1903 /* reclaim/compaction might need reclaim to continue */
1904 if (should_continue_reclaim(lruvec, nr_reclaimed,
1905 sc->nr_scanned - nr_scanned, sc))
1906 goto restart;
1907 1956
1908 throttle_vm_writeout(sc->gfp_mask); 1957 nr_reclaimed = sc->nr_reclaimed;
1909} 1958 nr_scanned = sc->nr_scanned;
1910 1959
1911static void shrink_zone(struct zone *zone, struct scan_control *sc) 1960 memcg = mem_cgroup_iter(root, NULL, &reclaim);
1912{ 1961 do {
1913 struct mem_cgroup *root = sc->target_mem_cgroup; 1962 struct lruvec *lruvec;
1914 struct mem_cgroup_reclaim_cookie reclaim = {
1915 .zone = zone,
1916 .priority = sc->priority,
1917 };
1918 struct mem_cgroup *memcg;
1919 1963
1920 memcg = mem_cgroup_iter(root, NULL, &reclaim); 1964 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
1921 do {
1922 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
1923 1965
1924 shrink_lruvec(lruvec, sc); 1966 shrink_lruvec(lruvec, sc);
1925 1967
1926 /* 1968 /*
1927 * Limit reclaim has historically picked one memcg and 1969 * Direct reclaim and kswapd have to scan all memory
1928 * scanned it with decreasing priority levels until 1970 * cgroups to fulfill the overall scan target for the
1929 * nr_to_reclaim had been reclaimed. This priority 1971 * zone.
1930 * cycle is thus over after a single memcg. 1972 *
1931 * 1973 * Limit reclaim, on the other hand, only cares about
1932 * Direct reclaim and kswapd, on the other hand, have 1974 * nr_to_reclaim pages to be reclaimed and it will
1933 * to scan all memory cgroups to fulfill the overall 1975 * retry with decreasing priority if one round over the
1934 * scan target for the zone. 1976 * whole hierarchy is not sufficient.
1935 */ 1977 */
1936 if (!global_reclaim(sc)) { 1978 if (!global_reclaim(sc) &&
1937 mem_cgroup_iter_break(root, memcg); 1979 sc->nr_reclaimed >= sc->nr_to_reclaim) {
1938 break; 1980 mem_cgroup_iter_break(root, memcg);
1939 } 1981 break;
1940 memcg = mem_cgroup_iter(root, memcg, &reclaim); 1982 }
1941 } while (memcg); 1983 memcg = mem_cgroup_iter(root, memcg, &reclaim);
1984 } while (memcg);
1985 } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
1986 sc->nr_scanned - nr_scanned, sc));
1942} 1987}
1943 1988
1944/* Returns true if compaction should go ahead for a high-order request */ 1989/* Returns true if compaction should go ahead for a high-order request */
@@ -1958,7 +2003,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
1958 * a reasonable chance of completing and allocating the page 2003 * a reasonable chance of completing and allocating the page
1959 */ 2004 */
1960 balance_gap = min(low_wmark_pages(zone), 2005 balance_gap = min(low_wmark_pages(zone),
1961 (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2006 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
1962 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2007 KSWAPD_ZONE_BALANCE_GAP_RATIO);
1963 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); 2008 watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
1964 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); 2009 watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
@@ -2150,6 +2195,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2150 goto out; 2195 goto out;
2151 2196
2152 /* 2197 /*
2198 * If we're getting trouble reclaiming, start doing
2199 * writepage even in laptop mode.
2200 */
2201 if (sc->priority < DEF_PRIORITY - 2)
2202 sc->may_writepage = 1;
2203
2204 /*
2153 * Try to write back as many pages as we just scanned. This 2205 * Try to write back as many pages as we just scanned. This
2154 * tends to cause slow streaming writers to write data to the 2206 * tends to cause slow streaming writers to write data to the
2155 * disk smoothly, at the dirtying rate, which is nice. But 2207 * disk smoothly, at the dirtying rate, which is nice. But
@@ -2300,7 +2352,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2300{ 2352{
2301 unsigned long nr_reclaimed; 2353 unsigned long nr_reclaimed;
2302 struct scan_control sc = { 2354 struct scan_control sc = {
2303 .gfp_mask = gfp_mask, 2355 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
2304 .may_writepage = !laptop_mode, 2356 .may_writepage = !laptop_mode,
2305 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2357 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2306 .may_unmap = 1, 2358 .may_unmap = 1,
@@ -2473,7 +2525,7 @@ static bool zone_balanced(struct zone *zone, int order,
2473 */ 2525 */
2474static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) 2526static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2475{ 2527{
2476 unsigned long present_pages = 0; 2528 unsigned long managed_pages = 0;
2477 unsigned long balanced_pages = 0; 2529 unsigned long balanced_pages = 0;
2478 int i; 2530 int i;
2479 2531
@@ -2484,7 +2536,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2484 if (!populated_zone(zone)) 2536 if (!populated_zone(zone))
2485 continue; 2537 continue;
2486 2538
2487 present_pages += zone->present_pages; 2539 managed_pages += zone->managed_pages;
2488 2540
2489 /* 2541 /*
2490 * A special case here: 2542 * A special case here:
@@ -2494,18 +2546,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2494 * they must be considered balanced here as well! 2546 * they must be considered balanced here as well!
2495 */ 2547 */
2496 if (zone->all_unreclaimable) { 2548 if (zone->all_unreclaimable) {
2497 balanced_pages += zone->present_pages; 2549 balanced_pages += zone->managed_pages;
2498 continue; 2550 continue;
2499 } 2551 }
2500 2552
2501 if (zone_balanced(zone, order, 0, i)) 2553 if (zone_balanced(zone, order, 0, i))
2502 balanced_pages += zone->present_pages; 2554 balanced_pages += zone->managed_pages;
2503 else if (!order) 2555 else if (!order)
2504 return false; 2556 return false;
2505 } 2557 }
2506 2558
2507 if (order) 2559 if (order)
2508 return balanced_pages >= (present_pages >> 2); 2560 return balanced_pages >= (managed_pages >> 2);
2509 else 2561 else
2510 return true; 2562 return true;
2511} 2563}
@@ -2564,7 +2616,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2564static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2616static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2565 int *classzone_idx) 2617 int *classzone_idx)
2566{ 2618{
2567 struct zone *unbalanced_zone; 2619 bool pgdat_is_balanced = false;
2568 int i; 2620 int i;
2569 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2621 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2570 unsigned long total_scanned; 2622 unsigned long total_scanned;
@@ -2595,9 +2647,6 @@ loop_again:
2595 2647
2596 do { 2648 do {
2597 unsigned long lru_pages = 0; 2649 unsigned long lru_pages = 0;
2598 int has_under_min_watermark_zone = 0;
2599
2600 unbalanced_zone = NULL;
2601 2650
2602 /* 2651 /*
2603 * Scan in the highmem->dma direction for the highest 2652 * Scan in the highmem->dma direction for the highest
@@ -2638,8 +2687,11 @@ loop_again:
2638 zone_clear_flag(zone, ZONE_CONGESTED); 2687 zone_clear_flag(zone, ZONE_CONGESTED);
2639 } 2688 }
2640 } 2689 }
2641 if (i < 0) 2690
2691 if (i < 0) {
2692 pgdat_is_balanced = true;
2642 goto out; 2693 goto out;
2694 }
2643 2695
2644 for (i = 0; i <= end_zone; i++) { 2696 for (i = 0; i <= end_zone; i++) {
2645 struct zone *zone = pgdat->node_zones + i; 2697 struct zone *zone = pgdat->node_zones + i;
@@ -2689,7 +2741,7 @@ loop_again:
2689 * of the zone, whichever is smaller. 2741 * of the zone, whichever is smaller.
2690 */ 2742 */
2691 balance_gap = min(low_wmark_pages(zone), 2743 balance_gap = min(low_wmark_pages(zone),
2692 (zone->present_pages + 2744 (zone->managed_pages +
2693 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2745 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2694 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2746 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2695 /* 2747 /*
@@ -2720,12 +2772,10 @@ loop_again:
2720 } 2772 }
2721 2773
2722 /* 2774 /*
2723 * If we've done a decent amount of scanning and 2775 * If we're getting trouble reclaiming, start doing
2724 * the reclaim ratio is low, start doing writepage 2776 * writepage even in laptop mode.
2725 * even in laptop mode
2726 */ 2777 */
2727 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 2778 if (sc.priority < DEF_PRIORITY - 2)
2728 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2729 sc.may_writepage = 1; 2779 sc.may_writepage = 1;
2730 2780
2731 if (zone->all_unreclaimable) { 2781 if (zone->all_unreclaimable) {
@@ -2734,17 +2784,7 @@ loop_again:
2734 continue; 2784 continue;
2735 } 2785 }
2736 2786
2737 if (!zone_balanced(zone, testorder, 0, end_zone)) { 2787 if (zone_balanced(zone, testorder, 0, end_zone))
2738 unbalanced_zone = zone;
2739 /*
2740 * We are still under min water mark. This
2741 * means that we have a GFP_ATOMIC allocation
2742 * failure risk. Hurry up!
2743 */
2744 if (!zone_watermark_ok_safe(zone, order,
2745 min_wmark_pages(zone), end_zone, 0))
2746 has_under_min_watermark_zone = 1;
2747 } else {
2748 /* 2788 /*
2749 * If a zone reaches its high watermark, 2789 * If a zone reaches its high watermark,
2750 * consider it to be no longer congested. It's 2790 * consider it to be no longer congested. It's
@@ -2753,8 +2793,6 @@ loop_again:
2753 * speculatively avoid congestion waits 2793 * speculatively avoid congestion waits
2754 */ 2794 */
2755 zone_clear_flag(zone, ZONE_CONGESTED); 2795 zone_clear_flag(zone, ZONE_CONGESTED);
2756 }
2757
2758 } 2796 }
2759 2797
2760 /* 2798 /*
@@ -2766,17 +2804,9 @@ loop_again:
2766 pfmemalloc_watermark_ok(pgdat)) 2804 pfmemalloc_watermark_ok(pgdat))
2767 wake_up(&pgdat->pfmemalloc_wait); 2805 wake_up(&pgdat->pfmemalloc_wait);
2768 2806
2769 if (pgdat_balanced(pgdat, order, *classzone_idx)) 2807 if (pgdat_balanced(pgdat, order, *classzone_idx)) {
2808 pgdat_is_balanced = true;
2770 break; /* kswapd: all done */ 2809 break; /* kswapd: all done */
2771 /*
2772 * OK, kswapd is getting into trouble. Take a nap, then take
2773 * another pass across the zones.
2774 */
2775 if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
2776 if (has_under_min_watermark_zone)
2777 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2778 else if (unbalanced_zone)
2779 wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
2780 } 2810 }
2781 2811
2782 /* 2812 /*
@@ -2788,9 +2818,9 @@ loop_again:
2788 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) 2818 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
2789 break; 2819 break;
2790 } while (--sc.priority >= 0); 2820 } while (--sc.priority >= 0);
2791out:
2792 2821
2793 if (!pgdat_balanced(pgdat, order, *classzone_idx)) { 2822out:
2823 if (!pgdat_is_balanced) {
2794 cond_resched(); 2824 cond_resched();
2795 2825
2796 try_to_freeze(); 2826 try_to_freeze();
@@ -3053,7 +3083,7 @@ unsigned long global_reclaimable_pages(void)
3053 nr = global_page_state(NR_ACTIVE_FILE) + 3083 nr = global_page_state(NR_ACTIVE_FILE) +
3054 global_page_state(NR_INACTIVE_FILE); 3084 global_page_state(NR_INACTIVE_FILE);
3055 3085
3056 if (nr_swap_pages > 0) 3086 if (get_nr_swap_pages() > 0)
3057 nr += global_page_state(NR_ACTIVE_ANON) + 3087 nr += global_page_state(NR_ACTIVE_ANON) +
3058 global_page_state(NR_INACTIVE_ANON); 3088 global_page_state(NR_INACTIVE_ANON);
3059 3089
@@ -3067,7 +3097,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
3067 nr = zone_page_state(zone, NR_ACTIVE_FILE) + 3097 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
3068 zone_page_state(zone, NR_INACTIVE_FILE); 3098 zone_page_state(zone, NR_INACTIVE_FILE);
3069 3099
3070 if (nr_swap_pages > 0) 3100 if (get_nr_swap_pages() > 0)
3071 nr += zone_page_state(zone, NR_ACTIVE_ANON) + 3101 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
3072 zone_page_state(zone, NR_INACTIVE_ANON); 3102 zone_page_state(zone, NR_INACTIVE_ANON);
3073 3103
@@ -3280,9 +3310,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3280 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3310 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3281 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3311 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
3282 .may_swap = 1, 3312 .may_swap = 1,
3283 .nr_to_reclaim = max_t(unsigned long, nr_pages, 3313 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3284 SWAP_CLUSTER_MAX), 3314 .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
3285 .gfp_mask = gfp_mask,
3286 .order = order, 3315 .order = order,
3287 .priority = ZONE_RECLAIM_PRIORITY, 3316 .priority = ZONE_RECLAIM_PRIORITY,
3288 }; 3317 };
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9800306c8195..e1d8ed172c42 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -142,7 +142,7 @@ int calculate_normal_threshold(struct zone *zone)
142 * 125 1024 10 16-32 GB 9 142 * 125 1024 10 16-32 GB 9
143 */ 143 */
144 144
145 mem = zone->present_pages >> (27 - PAGE_SHIFT); 145 mem = zone->managed_pages >> (27 - PAGE_SHIFT);
146 146
147 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); 147 threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
148 148
@@ -628,7 +628,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
628#ifdef CONFIG_CMA 628#ifdef CONFIG_CMA
629 "CMA", 629 "CMA",
630#endif 630#endif
631#ifdef CONFIG_MEMORY_ISOLATION
631 "Isolate", 632 "Isolate",
633#endif
632}; 634};
633 635
634static void *frag_start(struct seq_file *m, loff_t *pos) 636static void *frag_start(struct seq_file *m, loff_t *pos)
@@ -768,7 +770,6 @@ const char * const vmstat_text[] = {
768 "kswapd_inodesteal", 770 "kswapd_inodesteal",
769 "kswapd_low_wmark_hit_quickly", 771 "kswapd_low_wmark_hit_quickly",
770 "kswapd_high_wmark_hit_quickly", 772 "kswapd_high_wmark_hit_quickly",
771 "kswapd_skip_congestion_wait",
772 "pageoutrun", 773 "pageoutrun",
773 "allocstall", 774 "allocstall",
774 775
@@ -890,7 +891,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
890 int mtype; 891 int mtype;
891 unsigned long pfn; 892 unsigned long pfn;
892 unsigned long start_pfn = zone->zone_start_pfn; 893 unsigned long start_pfn = zone->zone_start_pfn;
893 unsigned long end_pfn = start_pfn + zone->spanned_pages; 894 unsigned long end_pfn = zone_end_pfn(zone);
894 unsigned long count[MIGRATE_TYPES] = { 0, }; 895 unsigned long count[MIGRATE_TYPES] = { 0, };
895 896
896 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 897 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index fd05c81cb348..de2e950a0a7a 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -87,7 +87,7 @@ struct virtio_chan {
87 /* This is global limit. Since we don't have a global structure, 87 /* This is global limit. Since we don't have a global structure,
88 * will be placing it in each channel. 88 * will be placing it in each channel.
89 */ 89 */
90 int p9_max_pages; 90 unsigned long p9_max_pages;
91 /* Scatterlist: can be too big for stack. */ 91 /* Scatterlist: can be too big for stack. */
92 struct scatterlist sg[VIRTQUEUE_NUM]; 92 struct scatterlist sg[VIRTQUEUE_NUM];
93 93
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index a5b89a6fec6d..7427ab5e27d8 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -21,6 +21,7 @@
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include <linux/export.h> 22#include <linux/export.h>
23#include <linux/jiffies.h> 23#include <linux/jiffies.h>
24#include <linux/pm_runtime.h>
24 25
25#include "net-sysfs.h" 26#include "net-sysfs.h"
26 27
@@ -1257,6 +1258,8 @@ void netdev_unregister_kobject(struct net_device * net)
1257 1258
1258 remove_queue_kobjects(net); 1259 remove_queue_kobjects(net);
1259 1260
1261 pm_runtime_set_memalloc_noio(dev, false);
1262
1260 device_del(dev); 1263 device_del(dev);
1261} 1264}
1262 1265
@@ -1301,6 +1304,8 @@ int netdev_register_kobject(struct net_device *net)
1301 return error; 1304 return error;
1302 } 1305 }
1303 1306
1307 pm_runtime_set_memalloc_noio(dev, true);
1308
1304 return error; 1309 return error;
1305} 1310}
1306 1311