diff options
113 files changed, 4408 insertions, 1632 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm new file mode 100644 index 000000000000..73e653ee2481 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm | |||
@@ -0,0 +1,52 @@ | |||
1 | What: /sys/kernel/mm/ksm | ||
2 | Date: September 2009 | ||
3 | KernelVersion: 2.6.32 | ||
4 | Contact: Linux memory management mailing list <linux-mm@kvack.org> | ||
5 | Description: Interface for Kernel Samepage Merging (KSM) | ||
6 | |||
7 | What: /sys/kernel/mm/ksm/full_scans | ||
8 | What: /sys/kernel/mm/ksm/pages_shared | ||
9 | What: /sys/kernel/mm/ksm/pages_sharing | ||
10 | What: /sys/kernel/mm/ksm/pages_to_scan | ||
11 | What: /sys/kernel/mm/ksm/pages_unshared | ||
12 | What: /sys/kernel/mm/ksm/pages_volatile | ||
13 | What: /sys/kernel/mm/ksm/run | ||
14 | What: /sys/kernel/mm/ksm/sleep_millisecs | ||
15 | Date: September 2009 | ||
16 | Contact: Linux memory management mailing list <linux-mm@kvack.org> | ||
17 | Description: Kernel Samepage Merging daemon sysfs interface | ||
18 | |||
19 | full_scans: how many times all mergeable areas have been | ||
20 | scanned. | ||
21 | |||
22 | pages_shared: how many shared pages are being used. | ||
23 | |||
24 | pages_sharing: how many more sites are sharing them i.e. how | ||
25 | much is saved. | ||
26 | |||
27 | pages_to_scan: how many present pages to scan before ksmd goes | ||
28 | to sleep. | ||
29 | |||
30 | pages_unshared: how many pages unique but repeatedly checked | ||
31 | for merging. | ||
32 | |||
33 | pages_volatile: how many pages changing too fast to be placed | ||
34 | in a tree. | ||
35 | |||
36 | run: write 0 to disable ksm, read 0 while ksm is disabled. | ||
37 | write 1 to run ksm, read 1 while ksm is running. | ||
38 | write 2 to disable ksm and unmerge all its pages. | ||
39 | |||
40 | sleep_millisecs: how many milliseconds ksm should sleep between | ||
41 | scans. | ||
42 | |||
43 | See Documentation/vm/ksm.txt for more information. | ||
44 | |||
45 | What: /sys/kernel/mm/ksm/merge_across_nodes | ||
46 | Date: January 2013 | ||
47 | KernelVersion: 3.9 | ||
48 | Contact: Linux memory management mailing list <linux-mm@kvack.org> | ||
49 | Description: Control merging pages across different NUMA nodes. | ||
50 | |||
51 | When it is set to 0 only pages from the same node are merged, | ||
52 | otherwise pages from all nodes can be merged together (default). | ||
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9aa8ff3e54dc..766087781ecd 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -1640,6 +1640,42 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
1640 | that the amount of memory usable for all allocations | 1640 | that the amount of memory usable for all allocations |
1641 | is not too small. | 1641 | is not too small. |
1642 | 1642 | ||
1643 | movablemem_map=acpi | ||
1644 | [KNL,X86,IA-64,PPC] This parameter is similar to | ||
1645 | memmap except it specifies the memory map of | ||
1646 | ZONE_MOVABLE. | ||
1647 | This option informs the kernel to use the Hot Pluggable bit | ||
1648 | in flags from SRAT from ACPI BIOS to determine which | ||
1649 | memory devices could be hotplugged. The corresponding | ||
1650 | memory ranges will be set as ZONE_MOVABLE. | ||
1651 | NOTE: Whatever node the kernel resides in will always | ||
1652 | be un-hotpluggable. | ||
1653 | |||
1654 | movablemem_map=nn[KMG]@ss[KMG] | ||
1655 | [KNL,X86,IA-64,PPC] This parameter is similar to | ||
1656 | memmap except it specifies the memory map of | ||
1657 | ZONE_MOVABLE. | ||
1658 | If the user specifies memory ranges, the info in SRAT will | ||
1659 | be ignored, and it works as follows: | ||
1660 | - If several ranges are all within one node, then from | ||
1661 | lowest ss to the end of the node will be ZONE_MOVABLE. | ||
1662 | - If a range is within a node, then from ss to the end | ||
1663 | of the node will be ZONE_MOVABLE. | ||
1664 | - If a range covers two or more nodes, then from ss to | ||
1665 | the end of the 1st node will be ZONE_MOVABLE, and all | ||
1666 | the remaining nodes will only have ZONE_MOVABLE. | ||
1667 | If memmap is specified at the same time, the | ||
1668 | movablemem_map will be limited within the memmap | ||
1669 | areas. If kernelcore or movablecore is also specified, | ||
1670 | movablemem_map will have higher priority to be | ||
1671 | satisfied. So the administrator should be careful that | ||
1672 | the amount of movablemem_map areas is not too large. | ||
1673 | Otherwise the kernel won't have enough memory to start. | ||
1674 | NOTE: We don't stop users specifying the node the | ||
1675 | kernel resides in as hotpluggable so that this | ||
1676 | option can be used as a workaround of firmware | ||
1677 | bugs. | ||
1678 | |||
1643 | MTD_Partition= [MTD] | 1679 | MTD_Partition= [MTD] |
1644 | Format: <name>,<region-number>,<size>,<offset> | 1680 | Format: <name>,<region-number>,<size>,<offset> |
1645 | 1681 | ||
diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt index b392e496f816..f34a8ee6f860 100644 --- a/Documentation/vm/ksm.txt +++ b/Documentation/vm/ksm.txt | |||
@@ -58,6 +58,21 @@ sleep_millisecs - how many milliseconds ksmd should sleep before next scan | |||
58 | e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs" | 58 | e.g. "echo 20 > /sys/kernel/mm/ksm/sleep_millisecs" |
59 | Default: 20 (chosen for demonstration purposes) | 59 | Default: 20 (chosen for demonstration purposes) |
60 | 60 | ||
61 | merge_across_nodes - specifies if pages from different numa nodes can be merged. | ||
62 | When set to 0, ksm merges only pages which physically | ||
63 | reside in the memory area of the same NUMA node. That brings | ||
64 | lower latency to access of shared pages. Systems with more | ||
65 | nodes, at significant NUMA distances, are likely to benefit | ||
66 | from the lower latency of setting 0. Smaller systems, which | ||
67 | need to minimize memory usage, are likely to benefit from | ||
68 | the greater sharing of setting 1 (default). You may wish to | ||
69 | compare how your system performs under each setting, before | ||
70 | deciding on which to use. merge_across_nodes setting can be | ||
71 | changed only when there are no ksm shared pages in system: | ||
72 | set run to 2 to unmerge pages first, then to 1 after changing | ||
73 | merge_across_nodes, to remerge according to the new setting. | ||
74 | Default: 1 (merging across nodes as in earlier releases) | ||
75 | |||
61 | run - set 0 to stop ksmd from running but keep merged pages, | 76 | run - set 0 to stop ksmd from running but keep merged pages, |
62 | set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run", | 77 | set 1 to run ksmd e.g. "echo 1 > /sys/kernel/mm/ksm/run", |
63 | set 2 to stop ksmd and unmerge all pages currently merged, | 78 | set 2 to stop ksmd and unmerge all pages currently merged, |
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index f4dd585898c5..224b44ab534e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c | |||
@@ -434,4 +434,7 @@ int __meminit vmemmap_populate(struct page *start_page, | |||
434 | return 0; | 434 | return 0; |
435 | } | 435 | } |
436 | #endif /* CONFIG_ARM64_64K_PAGES */ | 436 | #endif /* CONFIG_ARM64_64K_PAGES */ |
/* Empty stub (arm64): nothing is done with @memmap or @nr_pages. */
437 | void vmemmap_free(struct page *memmap, unsigned long nr_pages) | ||
438 | { | ||
439 | } | ||
437 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | 440 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ |
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 1516d1dc11fd..80dab509dfb0 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c | |||
@@ -93,7 +93,7 @@ void show_mem(unsigned int filter) | |||
93 | printk(KERN_INFO "%d pages swap cached\n", total_cached); | 93 | printk(KERN_INFO "%d pages swap cached\n", total_cached); |
94 | printk(KERN_INFO "Total of %ld pages in page table cache\n", | 94 | printk(KERN_INFO "Total of %ld pages in page table cache\n", |
95 | quicklist_total_size()); | 95 | quicklist_total_size()); |
96 | printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages()); | 96 | printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); |
97 | } | 97 | } |
98 | 98 | ||
99 | 99 | ||
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index c641333cd997..c2e955ee79a8 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c | |||
@@ -666,7 +666,7 @@ void show_mem(unsigned int filter) | |||
666 | printk(KERN_INFO "%d pages swap cached\n", total_cached); | 666 | printk(KERN_INFO "%d pages swap cached\n", total_cached); |
667 | printk(KERN_INFO "Total of %ld pages in page table cache\n", | 667 | printk(KERN_INFO "Total of %ld pages in page table cache\n", |
668 | quicklist_total_size()); | 668 | quicklist_total_size()); |
669 | printk(KERN_INFO "%d free buffer pages\n", nr_free_buffer_pages()); | 669 | printk(KERN_INFO "%ld free buffer pages\n", nr_free_buffer_pages()); |
670 | } | 670 | } |
671 | 671 | ||
672 | /** | 672 | /** |
@@ -822,4 +822,8 @@ int __meminit vmemmap_populate(struct page *start_page, | |||
822 | { | 822 | { |
823 | return vmemmap_populate_basepages(start_page, size, node); | 823 | return vmemmap_populate_basepages(start_page, size, node); |
824 | } | 824 | } |
825 | |||
/* Empty stub (ia64): nothing is done with @memmap or @nr_pages. */
826 | void vmemmap_free(struct page *memmap, unsigned long nr_pages) | ||
827 | { | ||
828 | } | ||
825 | #endif | 829 | #endif |
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index b755ea92aea7..20bc967c7209 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c | |||
@@ -688,6 +688,24 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
688 | 688 | ||
689 | return ret; | 689 | return ret; |
690 | } | 690 | } |
691 | |||
692 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
/*
 * ia64: remove the memory range [start, start + size).
 *
 * The byte range is converted to a starting PFN and a page count, the zone
 * is taken from the first page, and the range is handed to __remove_pages().
 * A warning is logged when __remove_pages() returns non-zero.
 *
 * Returns the value returned by __remove_pages().
 */
693 | int arch_remove_memory(u64 start, u64 size) | ||
694 | { | ||
695 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
696 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
697 | struct zone *zone; | ||
698 | int ret; | ||
699 | |||
700 | zone = page_zone(pfn_to_page(start_pfn)); | ||
701 | ret = __remove_pages(zone, start_pfn, nr_pages); | ||
702 | if (ret) | ||
703 | pr_warn("%s: Problem encountered in __remove_pages() as" | ||
704 | " ret=%d\n", __func__, ret); | ||
705 | |||
706 | return ret; | ||
707 | } | ||
708 | #endif | ||
691 | #endif | 709 | #endif |
692 | 710 | ||
693 | /* | 711 | /* |
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 95a45293e5ac..7e2246fb2f31 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c | |||
@@ -297,5 +297,10 @@ int __meminit vmemmap_populate(struct page *start_page, | |||
297 | 297 | ||
298 | return 0; | 298 | return 0; |
299 | } | 299 | } |
300 | |||
/* Empty stub (powerpc): nothing is done with @memmap or @nr_pages. */
301 | void vmemmap_free(struct page *memmap, unsigned long nr_pages) | ||
302 | { | ||
303 | } | ||
304 | |||
300 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | 305 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ |
301 | 306 | ||
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 40df7c8f2096..f1f7409a4183 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c | |||
@@ -133,6 +133,18 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
133 | 133 | ||
134 | return __add_pages(nid, zone, start_pfn, nr_pages); | 134 | return __add_pages(nid, zone, start_pfn, nr_pages); |
135 | } | 135 | } |
136 | |||
137 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
/*
 * powerpc: remove the memory range [start, start + size).
 *
 * Converts the byte range to a starting PFN and a page count, takes the zone
 * from the first page and returns the result of __remove_pages() unchanged.
 */
138 | int arch_remove_memory(u64 start, u64 size) | ||
139 | { | ||
140 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
141 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
142 | struct zone *zone; | ||
143 | |||
144 | zone = page_zone(pfn_to_page(start_pfn)); | ||
145 | return __remove_pages(zone, start_pfn, nr_pages); | ||
146 | } | ||
147 | #endif | ||
136 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 148 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
137 | 149 | ||
138 | /* | 150 | /* |
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index ae672f41c464..49ce6bb2c641 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c | |||
@@ -228,4 +228,16 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
228 | vmem_remove_mapping(start, size); | 228 | vmem_remove_mapping(start, size); |
229 | return rc; | 229 | return rc; |
230 | } | 230 | } |
231 | |||
232 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
/* s390: memory removal is not implemented; always returns -EBUSY. */
233 | int arch_remove_memory(u64 start, u64 size) | ||
234 | { | ||
235 | /* | ||
236 | * There is no hardware or firmware interface which could trigger a | ||
237 | * hot memory remove on s390. So there is nothing that needs to be | ||
238 | * implemented. | ||
239 | */ | ||
240 | return -EBUSY; | ||
241 | } | ||
242 | #endif | ||
231 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 243 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 79699f46a443..e21aaf4f5cb6 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c | |||
@@ -268,6 +268,10 @@ out: | |||
268 | return ret; | 268 | return ret; |
269 | } | 269 | } |
270 | 270 | ||
/* Empty stub (s390): nothing is done with @memmap or @nr_pages. */
271 | void vmemmap_free(struct page *memmap, unsigned long nr_pages) | ||
272 | { | ||
273 | } | ||
274 | |||
271 | /* | 275 | /* |
272 | * Add memory segment to the segment list if it doesn't overlap with | 276 | * Add memory segment to the segment list if it doesn't overlap with |
273 | * an already present segment. | 277 | * an already present segment. |
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 82cc576fab15..105794037143 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c | |||
@@ -558,4 +558,21 @@ int memory_add_physaddr_to_nid(u64 addr) | |||
558 | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | 558 | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); |
559 | #endif | 559 | #endif |
560 | 560 | ||
561 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
/*
 * sh: remove the memory range [start, start + size).
 *
 * Converts the byte range to a starting PFN and a page count, takes the zone
 * from the first page and calls __remove_pages(). A warning is logged when
 * that call returns non-zero.
 *
 * Returns the value returned by __remove_pages().
 */
562 | int arch_remove_memory(u64 start, u64 size) | ||
563 | { | ||
564 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
565 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
566 | struct zone *zone; | ||
567 | int ret; | ||
568 | |||
569 | zone = page_zone(pfn_to_page(start_pfn)); | ||
570 | ret = __remove_pages(zone, start_pfn, nr_pages); | ||
571 | if (unlikely(ret)) | ||
572 | pr_warn("%s: Failed, __remove_pages() == %d\n", __func__, | ||
573 | ret); | ||
574 | |||
575 | return ret; | ||
576 | } | ||
577 | #endif | ||
561 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 578 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index dde85ef1c56d..48e0c030e8f5 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c | |||
@@ -57,7 +57,7 @@ void show_mem(unsigned int filter) | |||
57 | printk("Mem-info:\n"); | 57 | printk("Mem-info:\n"); |
58 | show_free_areas(filter); | 58 | show_free_areas(filter); |
59 | printk("Free swap: %6ldkB\n", | 59 | printk("Free swap: %6ldkB\n", |
60 | nr_swap_pages << (PAGE_SHIFT-10)); | 60 | get_nr_swap_pages() << (PAGE_SHIFT-10)); |
61 | printk("%ld pages of RAM\n", totalram_pages); | 61 | printk("%ld pages of RAM\n", totalram_pages); |
62 | printk("%ld free pages\n", nr_free_pages()); | 62 | printk("%ld free pages\n", nr_free_pages()); |
63 | } | 63 | } |
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 5c2c6e61facb..1588d33d5492 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c | |||
@@ -2235,6 +2235,11 @@ void __meminit vmemmap_populate_print_last(void) | |||
2235 | node_start = 0; | 2235 | node_start = 0; |
2236 | } | 2236 | } |
2237 | } | 2237 | } |
2238 | |||
/* Empty stub (sparc64): nothing is done with @memmap or @nr_pages. */
2239 | void vmemmap_free(struct page *memmap, unsigned long nr_pages) | ||
2240 | { | ||
2241 | } | ||
2242 | |||
2238 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | 2243 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ |
2239 | 2244 | ||
2240 | static void prot_init_common(unsigned long page_none, | 2245 | static void prot_init_common(unsigned long page_none, |
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c index 3cfa98bf9125..743c951c61b0 100644 --- a/arch/tile/mm/elf.c +++ b/arch/tile/mm/elf.c | |||
@@ -130,7 +130,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, | |||
130 | if (!retval) { | 130 | if (!retval) { |
131 | unsigned long addr = MEM_USER_INTRPT; | 131 | unsigned long addr = MEM_USER_INTRPT; |
132 | addr = mmap_region(NULL, addr, INTRPT_SIZE, | 132 | addr = mmap_region(NULL, addr, INTRPT_SIZE, |
133 | MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE, | ||
134 | VM_READ|VM_EXEC| | 133 | VM_READ|VM_EXEC| |
135 | VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0); | 134 | VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, 0); |
136 | if (addr > (unsigned long) -PAGE_SIZE) | 135 | if (addr > (unsigned long) -PAGE_SIZE) |
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index ef29d6c5e10e..2749515a0547 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c | |||
@@ -935,6 +935,14 @@ int remove_memory(u64 start, u64 size) | |||
935 | { | 935 | { |
936 | return -EINVAL; | 936 | return -EINVAL; |
937 | } | 937 | } |
938 | |||
939 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
/* tile: memory removal is not implemented yet; always returns -EBUSY. */
940 | int arch_remove_memory(u64 start, u64 size) | ||
941 | { | ||
942 | /* TODO */ | ||
943 | return -EBUSY; | ||
944 | } | ||
945 | #endif | ||
938 | #endif | 946 | #endif |
939 | 947 | ||
940 | struct kmem_cache *pgd_cache; | 948 | struct kmem_cache *pgd_cache; |
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index de0de0c0e8a1..b3b4972c2451 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c | |||
@@ -61,7 +61,7 @@ void show_mem(unsigned int filter) | |||
61 | global_page_state(NR_PAGETABLE), | 61 | global_page_state(NR_PAGETABLE), |
62 | global_page_state(NR_BOUNCE), | 62 | global_page_state(NR_BOUNCE), |
63 | global_page_state(NR_FILE_PAGES), | 63 | global_page_state(NR_FILE_PAGES), |
64 | nr_swap_pages); | 64 | get_nr_swap_pages()); |
65 | 65 | ||
66 | for_each_zone(zone) { | 66 | for_each_zone(zone) { |
67 | unsigned long flags, order, total = 0, largest_order = -1; | 67 | unsigned long flags, order, total = 0, largest_order = -1; |
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 52560a2038e1..1b99ee5c9f00 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h | |||
@@ -57,8 +57,8 @@ static inline int numa_cpu_node(int cpu) | |||
57 | #endif | 57 | #endif |
58 | 58 | ||
59 | #ifdef CONFIG_NUMA | 59 | #ifdef CONFIG_NUMA |
60 | extern void __cpuinit numa_set_node(int cpu, int node); | 60 | extern void numa_set_node(int cpu, int node); |
61 | extern void __cpuinit numa_clear_node(int cpu); | 61 | extern void numa_clear_node(int cpu); |
62 | extern void __init init_cpu_to_node(void); | 62 | extern void __init init_cpu_to_node(void); |
63 | extern void __cpuinit numa_add_cpu(int cpu); | 63 | extern void __cpuinit numa_add_cpu(int cpu); |
64 | extern void __cpuinit numa_remove_cpu(int cpu); | 64 | extern void __cpuinit numa_remove_cpu(int cpu); |
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index e6423002c10b..567b5d0632b2 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h | |||
@@ -351,6 +351,7 @@ static inline void update_page_count(int level, unsigned long pages) { } | |||
351 | * as a pte too. | 351 | * as a pte too. |
352 | */ | 352 | */ |
353 | extern pte_t *lookup_address(unsigned long address, unsigned int *level); | 353 | extern pte_t *lookup_address(unsigned long address, unsigned int *level); |
354 | extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase); | ||
354 | extern phys_addr_t slow_virt_to_phys(void *__address); | 355 | extern phys_addr_t slow_virt_to_phys(void *__address); |
355 | 356 | ||
356 | #endif /* !__ASSEMBLY__ */ | 357 | #endif /* !__ASSEMBLY__ */ |
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index cfc755dc1607..230c8ea878e5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c | |||
@@ -696,6 +696,10 @@ EXPORT_SYMBOL(acpi_map_lsapic); | |||
696 | 696 | ||
697 | int acpi_unmap_lsapic(int cpu) | 697 | int acpi_unmap_lsapic(int cpu) |
698 | { | 698 | { |
699 | #ifdef CONFIG_ACPI_NUMA | ||
700 | set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE); | ||
701 | #endif | ||
702 | |||
699 | per_cpu(x86_cpu_to_apicid, cpu) = -1; | 703 | per_cpu(x86_cpu_to_apicid, cpu) = -1; |
700 | set_cpu_present(cpu, false); | 704 | set_cpu_present(cpu, false); |
701 | num_processors--; | 705 | num_processors--; |
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 915f5efefcf5..9c857f05cef0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -1056,6 +1056,15 @@ void __init setup_arch(char **cmdline_p) | |||
1056 | setup_bios_corruption_check(); | 1056 | setup_bios_corruption_check(); |
1057 | #endif | 1057 | #endif |
1058 | 1058 | ||
1059 | /* | ||
1060 | * In the memory hotplug case, the kernel needs info from SRAT to | ||
1061 | * determine which memory is hotpluggable before allocating memory | ||
1062 | * using memblock. | ||
1063 | */ | ||
1064 | acpi_boot_table_init(); | ||
1065 | early_acpi_boot_init(); | ||
1066 | early_parse_srat(); | ||
1067 | |||
1059 | #ifdef CONFIG_X86_32 | 1068 | #ifdef CONFIG_X86_32 |
1060 | printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", | 1069 | printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", |
1061 | (max_pfn_mapped<<PAGE_SHIFT) - 1); | 1070 | (max_pfn_mapped<<PAGE_SHIFT) - 1); |
@@ -1101,10 +1110,6 @@ void __init setup_arch(char **cmdline_p) | |||
1101 | /* | 1110 | /* |
1102 | * Parse the ACPI tables for possible boot-time SMP configuration. | 1111 | * Parse the ACPI tables for possible boot-time SMP configuration. |
1103 | */ | 1112 | */ |
1104 | acpi_boot_table_init(); | ||
1105 | |||
1106 | early_acpi_boot_init(); | ||
1107 | |||
1108 | initmem_init(); | 1113 | initmem_init(); |
1109 | memblock_find_dma_reserve(); | 1114 | memblock_find_dma_reserve(); |
1110 | 1115 | ||
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index b299724f6e34..2d19001151d5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c | |||
@@ -862,6 +862,18 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
862 | 862 | ||
863 | return __add_pages(nid, zone, start_pfn, nr_pages); | 863 | return __add_pages(nid, zone, start_pfn, nr_pages); |
864 | } | 864 | } |
865 | |||
866 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
/*
 * x86-32: remove the memory range [start, start + size).
 *
 * Converts the byte range to a starting PFN and a page count, takes the zone
 * from the first page and returns the result of __remove_pages() unchanged.
 */
867 | int arch_remove_memory(u64 start, u64 size) | ||
868 | { | ||
869 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
870 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
871 | struct zone *zone; | ||
872 | |||
873 | zone = page_zone(pfn_to_page(start_pfn)); | ||
874 | return __remove_pages(zone, start_pfn, nr_pages); | ||
875 | } | ||
876 | #endif | ||
865 | #endif | 877 | #endif |
866 | 878 | ||
867 | /* | 879 | /* |
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3eba7f429880..474e28f10815 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -707,6 +707,343 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
707 | } | 707 | } |
708 | EXPORT_SYMBOL_GPL(arch_add_memory); | 708 | EXPORT_SYMBOL_GPL(arch_add_memory); |
709 | 709 | ||
710 | #define PAGE_INUSE 0xFD | ||
711 | |||
/*
 * Release the (1 << order) pages starting at @page.
 *
 * Pages carrying the reserved flag are treated as bootmem pages: the flag is
 * cleared on the first page, then the pages are dropped one by one with
 * put_page_bootmem() when tagged SECTION_INFO or MIX_SECTION_INFO, or all at
 * once with __free_pages_bootmem() otherwise. Bootmem pages are afterwards
 * added to the zone's present_pages and to totalram_pages. Non-reserved
 * pages are returned through free_pages().
 *
 * The put_page_bootmem() loop indexes from @page rather than advancing
 * @page and counting nr_pages down, so that the accounting at the end still
 * sees the first page of the range and the real page count.
 */
712 | static void __meminit free_pagetable(struct page *page, int order) | ||
713 | { | ||
714 | struct zone *zone; | ||
715 | bool bootmem = false; | ||
716 | unsigned long magic; | ||
717 | unsigned int i, nr_pages = 1 << order; | ||
718 | |||
719 | /* bootmem page has reserved flag */ | ||
720 | if (PageReserved(page)) { | ||
721 | __ClearPageReserved(page); | ||
722 | bootmem = true; | ||
723 | |||
724 | magic = (unsigned long)page->lru.next; | ||
725 | if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { | ||
726 | for (i = 0; i < nr_pages; i++) | ||
727 | put_page_bootmem(page + i); | ||
728 | } else | ||
729 | __free_pages_bootmem(page, order); | ||
730 | } else | ||
731 | free_pages((unsigned long)page_address(page), order); | ||
732 | |||
733 | /* | ||
734 | * SECTION_INFO pages and MIX_SECTION_INFO pages | ||
735 | * are all allocated by bootmem. | ||
736 | */ | ||
737 | if (bootmem) { | ||
738 | zone = page_zone(page); | ||
739 | zone_span_writelock(zone); | ||
740 | zone->present_pages += nr_pages; | ||
741 | zone_span_writeunlock(zone); | ||
742 | totalram_pages += nr_pages; | ||
743 | } | ||
744 | } | ||
745 | |||
/*
 * Free the PTE page referenced by @pmd and clear @pmd, but only when all
 * PTRS_PER_PTE entries starting at @pte_start are zero. Returns without
 * doing anything as soon as a non-zero entry is found.
 */
746 | static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) | ||
747 | { | ||
748 | pte_t *pte; | ||
749 | int i; | ||
750 | |||
751 | for (i = 0; i < PTRS_PER_PTE; i++) { | ||
752 | pte = pte_start + i; | ||
753 | if (pte_val(*pte)) | ||
754 | return; | ||
755 | } | ||
756 | |||
757 | /* free a pte table */ | ||
758 | free_pagetable(pmd_page(*pmd), 0); | ||
759 | spin_lock(&init_mm.page_table_lock); | ||
760 | pmd_clear(pmd); | ||
761 | spin_unlock(&init_mm.page_table_lock); | ||
762 | } | ||
763 | |||
/*
 * Free the PMD page referenced by @pud and clear @pud, but only when all
 * PTRS_PER_PMD entries starting at @pmd_start are zero. Returns without
 * doing anything as soon as a non-zero entry is found.
 */
764 | static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) | ||
765 | { | ||
766 | pmd_t *pmd; | ||
767 | int i; | ||
768 | |||
769 | for (i = 0; i < PTRS_PER_PMD; i++) { | ||
770 | pmd = pmd_start + i; | ||
771 | if (pmd_val(*pmd)) | ||
772 | return; | ||
773 | } | ||
774 | |||
775 | /* free a pmd table */ | ||
776 | free_pagetable(pud_page(*pud), 0); | ||
777 | spin_lock(&init_mm.page_table_lock); | ||
778 | pud_clear(pud); | ||
779 | spin_unlock(&init_mm.page_table_lock); | ||
780 | } | ||
781 | |||
/*
 * Free the PUD page referenced by @pgd and clear @pgd, but only when all
 * PTRS_PER_PUD entries starting at @pud_start are zero.
 */
782 | /* Return true if pgd is changed, otherwise return false. */ | ||
783 | static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd) | ||
784 | { | ||
785 | pud_t *pud; | ||
786 | int i; | ||
787 | |||
788 | for (i = 0; i < PTRS_PER_PUD; i++) { | ||
789 | pud = pud_start + i; | ||
790 | if (pud_val(*pud)) | ||
791 | return false; | ||
792 | } | ||
793 | |||
794 | /* free a pud table */ | ||
795 | free_pagetable(pgd_page(*pgd), 0); | ||
796 | spin_lock(&init_mm.page_table_lock); | ||
797 | pgd_clear(pgd); | ||
798 | spin_unlock(&init_mm.page_table_lock); | ||
799 | |||
800 | return true; | ||
801 | } | ||
802 | |||
/*
 * Walk the PTEs covering the virtual range [addr, end) in the table at
 * @pte_start and tear down the present ones.
 *
 * A PTE whose covered range is page-aligned at both ends is cleared; when
 * @direct is false the page it maps is first released via free_pagetable().
 * For a partially covered page, the covered bytes are filled with PAGE_INUSE
 * and the page is released (and its PTE cleared) only once the whole page
 * consists of PAGE_INUSE bytes.
 *
 * Finishes with flush_tlb_all(); when @direct is true the number of cleared
 * aligned PTEs is subtracted through update_page_count(PG_LEVEL_4K, ...).
 */
803 | static void __meminit | ||
804 | remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, | ||
805 | bool direct) | ||
806 | { | ||
807 | unsigned long next, pages = 0; | ||
808 | pte_t *pte; | ||
809 | void *page_addr; | ||
810 | phys_addr_t phys_addr; | ||
811 | |||
812 | pte = pte_start + pte_index(addr); | ||
813 | for (; addr < end; addr = next, pte++) { | ||
814 | next = (addr + PAGE_SIZE) & PAGE_MASK; | ||
815 | if (next > end) | ||
816 | next = end; | ||
817 | |||
818 | if (!pte_present(*pte)) | ||
819 | continue; | ||
820 | |||
821 | /* | ||
822 | * We mapped [0,1G) memory as identity mapping when | ||
823 | * initializing, in arch/x86/kernel/head_64.S. These | ||
824 | * pagetables cannot be removed. | ||
825 | */ | ||
/*
 * NOTE(review): pte_val() is used unmasked (flag bits included) and is added
 * to the page-aligned virtual address; confirm this yields the intended
 * physical address. Also note that the early return below skips the
 * flush_tlb_all() and update_page_count() calls at the end of the function.
 */
826 | phys_addr = pte_val(*pte) + (addr & PAGE_MASK); | ||
827 | if (phys_addr < (phys_addr_t)0x40000000) | ||
828 | return; | ||
829 | |||
830 | if (IS_ALIGNED(addr, PAGE_SIZE) && | ||
831 | IS_ALIGNED(next, PAGE_SIZE)) { | ||
832 | /* | ||
833 | * Do not free direct mapping pages since they were | ||
834 | * freed when offlining, or simply not in use. | ||
835 | */ | ||
836 | if (!direct) | ||
837 | free_pagetable(pte_page(*pte), 0); | ||
838 | |||
839 | spin_lock(&init_mm.page_table_lock); | ||
840 | pte_clear(&init_mm, addr, pte); | ||
841 | spin_unlock(&init_mm.page_table_lock); | ||
842 | |||
843 | /* For non-direct mapping, pages means nothing. */ | ||
844 | pages++; | ||
845 | } else { | ||
846 | /* | ||
847 | * If we are here, we are freeing vmemmap pages since | ||
848 | * direct mapped memory ranges to be freed are aligned. | ||
849 | * | ||
850 | * If we are not removing the whole page, it means | ||
851 | * other page structs in this page are being used and | ||
852 | * we cannot remove them. So fill the unused page_structs | ||
853 | * with 0xFD, and remove the page when it is wholly | ||
854 | * filled with 0xFD. | ||
855 | */ | ||
856 | memset((void *)addr, PAGE_INUSE, next - addr); | ||
857 | |||
858 | page_addr = page_address(pte_page(*pte)); | ||
859 | if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { | ||
860 | free_pagetable(pte_page(*pte), 0); | ||
861 | |||
862 | spin_lock(&init_mm.page_table_lock); | ||
863 | pte_clear(&init_mm, addr, pte); | ||
864 | spin_unlock(&init_mm.page_table_lock); | ||
865 | } | ||
866 | } | ||
867 | } | ||
868 | |||
869 | /* Call free_pte_table() in remove_pmd_table(). */ | ||
870 | flush_tlb_all(); | ||
871 | if (direct) | ||
872 | update_page_count(PG_LEVEL_4K, -pages); | ||
873 | } | ||
874 | |||
/*
 * Walk the PMD entries covering the virtual range [addr, end) in the table
 * at @pmd_start and tear down the present ones.
 *
 * Large (pmd_large) entries are handled here: a fully covered, PMD_SIZE
 * aligned entry is cleared, and when @direct is false its backing pages are
 * first released via free_pagetable(). A partially covered large entry has
 * the covered bytes filled with PAGE_INUSE and is released only once all
 * PMD_SIZE bytes are PAGE_INUSE.
 *
 * Other entries are descended into with remove_pte_table(), after which
 * free_pte_table() releases the PTE page if it has become empty.
 *
 * When @direct is true the number of cleared aligned large entries is
 * subtracted through update_page_count(PG_LEVEL_2M, ...).
 */
875 | static void __meminit | ||
876 | remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, | ||
877 | bool direct) | ||
878 | { | ||
879 | unsigned long next, pages = 0; | ||
880 | pte_t *pte_base; | ||
881 | pmd_t *pmd; | ||
882 | void *page_addr; | ||
883 | |||
884 | pmd = pmd_start + pmd_index(addr); | ||
885 | for (; addr < end; addr = next, pmd++) { | ||
886 | next = pmd_addr_end(addr, end); | ||
887 | |||
888 | if (!pmd_present(*pmd)) | ||
889 | continue; | ||
890 | |||
891 | if (pmd_large(*pmd)) { | ||
892 | if (IS_ALIGNED(addr, PMD_SIZE) && | ||
893 | IS_ALIGNED(next, PMD_SIZE)) { | ||
894 | if (!direct) | ||
895 | free_pagetable(pmd_page(*pmd), | ||
896 | get_order(PMD_SIZE)); | ||
897 | |||
898 | spin_lock(&init_mm.page_table_lock); | ||
899 | pmd_clear(pmd); | ||
900 | spin_unlock(&init_mm.page_table_lock); | ||
901 | pages++; | ||
902 | } else { | ||
903 | /* If here, we are freeing vmemmap pages. */ | ||
904 | memset((void *)addr, PAGE_INUSE, next - addr); | ||
905 | |||
906 | page_addr = page_address(pmd_page(*pmd)); | ||
907 | if (!memchr_inv(page_addr, PAGE_INUSE, | ||
908 | PMD_SIZE)) { | ||
909 | free_pagetable(pmd_page(*pmd), | ||
910 | get_order(PMD_SIZE)); | ||
911 | |||
912 | spin_lock(&init_mm.page_table_lock); | ||
913 | pmd_clear(pmd); | ||
914 | spin_unlock(&init_mm.page_table_lock); | ||
915 | } | ||
916 | } | ||
917 | |||
918 | continue; | ||
919 | } | ||
920 | |||
921 | pte_base = (pte_t *)pmd_page_vaddr(*pmd); | ||
922 | remove_pte_table(pte_base, addr, next, direct); | ||
923 | free_pte_table(pte_base, pmd); | ||
924 | } | ||
925 | |||
926 | /* Call free_pmd_table() in remove_pud_table(). */ | ||
927 | if (direct) | ||
928 | update_page_count(PG_LEVEL_2M, -pages); | ||
929 | } | ||
930 | |||
/*
 * Walk the PUD entries covering the virtual range [addr, end) in the table
 * at @pud_start and tear down the present ones.
 *
 * Large (pud_large) entries are handled here: a fully covered, PUD_SIZE
 * aligned entry is cleared, and when @direct is false its backing pages are
 * first released via free_pagetable(). A partially covered large entry has
 * the covered bytes filled with PAGE_INUSE and is released only once all
 * PUD_SIZE bytes are PAGE_INUSE.
 *
 * Other entries are descended into with remove_pmd_table(), after which
 * free_pmd_table() releases the PMD page if it has become empty.
 *
 * When @direct is true the number of cleared aligned large entries is
 * subtracted through update_page_count(PG_LEVEL_1G, ...).
 */
931 | static void __meminit | ||
932 | remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, | ||
933 | bool direct) | ||
934 | { | ||
935 | unsigned long next, pages = 0; | ||
936 | pmd_t *pmd_base; | ||
937 | pud_t *pud; | ||
938 | void *page_addr; | ||
939 | |||
940 | pud = pud_start + pud_index(addr); | ||
941 | for (; addr < end; addr = next, pud++) { | ||
942 | next = pud_addr_end(addr, end); | ||
943 | |||
944 | if (!pud_present(*pud)) | ||
945 | continue; | ||
946 | |||
947 | if (pud_large(*pud)) { | ||
948 | if (IS_ALIGNED(addr, PUD_SIZE) && | ||
949 | IS_ALIGNED(next, PUD_SIZE)) { | ||
950 | if (!direct) | ||
951 | free_pagetable(pud_page(*pud), | ||
952 | get_order(PUD_SIZE)); | ||
953 | |||
954 | spin_lock(&init_mm.page_table_lock); | ||
955 | pud_clear(pud); | ||
956 | spin_unlock(&init_mm.page_table_lock); | ||
957 | pages++; | ||
958 | } else { | ||
959 | /* If here, we are freeing vmemmap pages. */ | ||
960 | memset((void *)addr, PAGE_INUSE, next - addr); | ||
961 | |||
962 | page_addr = page_address(pud_page(*pud)); | ||
963 | if (!memchr_inv(page_addr, PAGE_INUSE, | ||
964 | PUD_SIZE)) { | ||
965 | free_pagetable(pud_page(*pud), | ||
966 | get_order(PUD_SIZE)); | ||
967 | |||
968 | spin_lock(&init_mm.page_table_lock); | ||
969 | pud_clear(pud); | ||
970 | spin_unlock(&init_mm.page_table_lock); | ||
971 | } | ||
972 | } | ||
973 | |||
974 | continue; | ||
975 | } | ||
976 | |||
977 | pmd_base = (pmd_t *)pud_page_vaddr(*pud); | ||
978 | remove_pmd_table(pmd_base, addr, next, direct); | ||
979 | free_pmd_table(pmd_base, pud); | ||
980 | } | ||
981 | |||
982 | if (direct) | ||
983 | update_page_count(PG_LEVEL_1G, -pages); | ||
984 | } | ||
985 | |||
/*
 * Remove the kernel page-table entries covering the virtual address range
 * [start, end).
 *
 * Each present PGD entry in the range is descended into with
 * remove_pud_table(), after which free_pud_table() releases the PUD page if
 * it has become empty. When any PGD entry was cleared, the change is
 * propagated with sync_global_pgds() over the whole original range, and the
 * TLB is flushed at the end.
 *
 * @direct is passed through to the lower levels; the callers in this file
 * use true for the direct mapping and false for the vmemmap.
 *
 * The walk uses its own cursor (addr) so that @start still holds the
 * beginning of the range when it is passed to sync_global_pgds().
 */
986 | /* start and end are both virtual addresses. */ | ||
987 | static void __meminit | ||
988 | remove_pagetable(unsigned long start, unsigned long end, bool direct) | ||
989 | { | ||
990 | unsigned long next, addr; | ||
991 | pgd_t *pgd; | ||
992 | pud_t *pud; | ||
993 | bool pgd_changed = false; | ||
994 | |||
995 | for (addr = start; addr < end; addr = next) { | ||
996 | next = pgd_addr_end(addr, end); | ||
997 | |||
998 | pgd = pgd_offset_k(addr); | ||
999 | if (!pgd_present(*pgd)) | ||
1000 | continue; | ||
1001 | |||
1002 | pud = (pud_t *)pgd_page_vaddr(*pgd); | ||
1003 | remove_pud_table(pud, addr, next, direct); | ||
1004 | if (free_pud_table(pud, pgd)) | ||
1005 | pgd_changed = true; | ||
1006 | } | ||
1007 | |||
1008 | if (pgd_changed) | ||
1009 | sync_global_pgds(start, end - 1); | ||
1010 | |||
1011 | flush_tlb_all(); | ||
1012 | } | ||
1013 | |||
/*
 * x86-64: tear down the page tables that map the @nr_pages struct page
 * entries starting at @memmap. The range is passed to remove_pagetable()
 * with direct == false.
 */
1014 | void __ref vmemmap_free(struct page *memmap, unsigned long nr_pages) | ||
1015 | { | ||
1016 | unsigned long start = (unsigned long)memmap; | ||
1017 | unsigned long end = (unsigned long)(memmap + nr_pages); | ||
1018 | |||
1019 | remove_pagetable(start, end, false); | ||
1020 | } | ||
1021 | |||
/*
 * Remove the page-table entries for the physical range [start, end): both
 * bounds are converted with __va() and the resulting virtual range is passed
 * to remove_pagetable() with direct == true.
 */
1022 | static void __meminit | ||
1023 | kernel_physical_mapping_remove(unsigned long start, unsigned long end) | ||
1024 | { | ||
1025 | start = (unsigned long)__va(start); | ||
1026 | end = (unsigned long)__va(end); | ||
1027 | |||
1028 | remove_pagetable(start, end, true); | ||
1029 | } | ||
1030 | |||
1031 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
/*
 * x86-64: remove the memory range [start, start + size).
 *
 * The zone is looked up from the first page before the mapping is torn
 * down. kernel_physical_mapping_remove() then removes the mapping of the
 * range, and __remove_pages() is called on the corresponding PFN range.
 * A non-zero result triggers WARN_ON_ONCE().
 *
 * Returns the value returned by __remove_pages().
 */
1032 | int __ref arch_remove_memory(u64 start, u64 size) | ||
1033 | { | ||
1034 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
1035 | unsigned long nr_pages = size >> PAGE_SHIFT; | ||
1036 | struct zone *zone; | ||
1037 | int ret; | ||
1038 | |||
1039 | zone = page_zone(pfn_to_page(start_pfn)); | ||
1040 | kernel_physical_mapping_remove(start, start + size); | ||
1041 | ret = __remove_pages(zone, start_pfn, nr_pages); | ||
1042 | WARN_ON_ONCE(ret); | ||
1043 | |||
1044 | return ret; | ||
1045 | } | ||
1046 | #endif | ||
710 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 1047 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
711 | 1048 | ||
712 | static struct kcore_list kcore_vsyscall; | 1049 | static struct kcore_list kcore_vsyscall; |
@@ -1019,6 +1356,66 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) | |||
1019 | return 0; | 1356 | return 0; |
1020 | } | 1357 | } |
1021 | 1358 | ||
1359 | #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE) | ||
1360 | void register_page_bootmem_memmap(unsigned long section_nr, | ||
1361 | struct page *start_page, unsigned long size) | ||
1362 | { | ||
1363 | unsigned long addr = (unsigned long)start_page; | ||
1364 | unsigned long end = (unsigned long)(start_page + size); | ||
1365 | unsigned long next; | ||
1366 | pgd_t *pgd; | ||
1367 | pud_t *pud; | ||
1368 | pmd_t *pmd; | ||
1369 | unsigned int nr_pages; | ||
1370 | struct page *page; | ||
1371 | |||
1372 | for (; addr < end; addr = next) { | ||
1373 | pte_t *pte = NULL; | ||
1374 | |||
1375 | pgd = pgd_offset_k(addr); | ||
1376 | if (pgd_none(*pgd)) { | ||
1377 | next = (addr + PAGE_SIZE) & PAGE_MASK; | ||
1378 | continue; | ||
1379 | } | ||
1380 | get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); | ||
1381 | |||
1382 | pud = pud_offset(pgd, addr); | ||
1383 | if (pud_none(*pud)) { | ||
1384 | next = (addr + PAGE_SIZE) & PAGE_MASK; | ||
1385 | continue; | ||
1386 | } | ||
1387 | get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO); | ||
1388 | |||
1389 | if (!cpu_has_pse) { | ||
1390 | next = (addr + PAGE_SIZE) & PAGE_MASK; | ||
1391 | pmd = pmd_offset(pud, addr); | ||
1392 | if (pmd_none(*pmd)) | ||
1393 | continue; | ||
1394 | get_page_bootmem(section_nr, pmd_page(*pmd), | ||
1395 | MIX_SECTION_INFO); | ||
1396 | |||
1397 | pte = pte_offset_kernel(pmd, addr); | ||
1398 | if (pte_none(*pte)) | ||
1399 | continue; | ||
1400 | get_page_bootmem(section_nr, pte_page(*pte), | ||
1401 | SECTION_INFO); | ||
1402 | } else { | ||
1403 | next = pmd_addr_end(addr, end); | ||
1404 | |||
1405 | pmd = pmd_offset(pud, addr); | ||
1406 | if (pmd_none(*pmd)) | ||
1407 | continue; | ||
1408 | |||
1409 | nr_pages = 1 << (get_order(PMD_SIZE)); | ||
1410 | page = pmd_page(*pmd); | ||
1411 | while (nr_pages--) | ||
1412 | get_page_bootmem(section_nr, page++, | ||
1413 | SECTION_INFO); | ||
1414 | } | ||
1415 | } | ||
1416 | } | ||
1417 | #endif | ||
1418 | |||
1022 | void __meminit vmemmap_populate_print_last(void) | 1419 | void __meminit vmemmap_populate_print_last(void) |
1023 | { | 1420 | { |
1024 | if (p_start) { | 1421 | if (p_start) { |
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 8504f3698753..dfd30259eb89 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c | |||
@@ -56,7 +56,7 @@ early_param("numa", numa_setup); | |||
56 | /* | 56 | /* |
57 | * apicid, cpu, node mappings | 57 | * apicid, cpu, node mappings |
58 | */ | 58 | */ |
59 | s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | 59 | s16 __apicid_to_node[MAX_LOCAL_APIC] = { |
60 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | 60 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE |
61 | }; | 61 | }; |
62 | 62 | ||
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(node_to_cpumask_map); | |||
78 | DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); | 78 | DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); |
79 | EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); | 79 | EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); |
80 | 80 | ||
81 | void __cpuinit numa_set_node(int cpu, int node) | 81 | void numa_set_node(int cpu, int node) |
82 | { | 82 | { |
83 | int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); | 83 | int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); |
84 | 84 | ||
@@ -101,7 +101,7 @@ void __cpuinit numa_set_node(int cpu, int node) | |||
101 | set_cpu_numa_node(cpu, node); | 101 | set_cpu_numa_node(cpu, node); |
102 | } | 102 | } |
103 | 103 | ||
104 | void __cpuinit numa_clear_node(int cpu) | 104 | void numa_clear_node(int cpu) |
105 | { | 105 | { |
106 | numa_set_node(cpu, NUMA_NO_NODE); | 106 | numa_set_node(cpu, NUMA_NO_NODE); |
107 | } | 107 | } |
@@ -213,10 +213,9 @@ static void __init setup_node_data(int nid, u64 start, u64 end) | |||
213 | * Allocate node data. Try node-local memory and then any node. | 213 | * Allocate node data. Try node-local memory and then any node. |
214 | * Never allocate in DMA zone. | 214 | * Never allocate in DMA zone. |
215 | */ | 215 | */ |
216 | nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); | 216 | nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); |
217 | if (!nd_pa) { | 217 | if (!nd_pa) { |
218 | pr_err("Cannot find %zu bytes in node %d\n", | 218 | pr_err("Cannot find %zu bytes in any node\n", nd_size); |
219 | nd_size, nid); | ||
220 | return; | 219 | return; |
221 | } | 220 | } |
222 | nd = __va(nd_pa); | 221 | nd = __va(nd_pa); |
@@ -561,10 +560,12 @@ static int __init numa_init(int (*init_func)(void)) | |||
561 | for (i = 0; i < MAX_LOCAL_APIC; i++) | 560 | for (i = 0; i < MAX_LOCAL_APIC; i++) |
562 | set_apicid_to_node(i, NUMA_NO_NODE); | 561 | set_apicid_to_node(i, NUMA_NO_NODE); |
563 | 562 | ||
564 | nodes_clear(numa_nodes_parsed); | 563 | /* |
564 | * Do not clear numa_nodes_parsed or zero numa_meminfo here, because | ||
565 | * SRAT was parsed earlier in early_parse_srat(). | ||
566 | */ | ||
565 | nodes_clear(node_possible_map); | 567 | nodes_clear(node_possible_map); |
566 | nodes_clear(node_online_map); | 568 | nodes_clear(node_online_map); |
567 | memset(&numa_meminfo, 0, sizeof(numa_meminfo)); | ||
568 | WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); | 569 | WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); |
569 | numa_reset_distance(); | 570 | numa_reset_distance(); |
570 | 571 | ||
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a1b1c88f9caf..ca1f1c2bb7be 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
@@ -529,21 +529,13 @@ out_unlock: | |||
529 | return do_split; | 529 | return do_split; |
530 | } | 530 | } |
531 | 531 | ||
532 | static int split_large_page(pte_t *kpte, unsigned long address) | 532 | int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase) |
533 | { | 533 | { |
534 | unsigned long pfn, pfninc = 1; | 534 | unsigned long pfn, pfninc = 1; |
535 | unsigned int i, level; | 535 | unsigned int i, level; |
536 | pte_t *pbase, *tmp; | 536 | pte_t *tmp; |
537 | pgprot_t ref_prot; | 537 | pgprot_t ref_prot; |
538 | struct page *base; | 538 | struct page *base = virt_to_page(pbase); |
539 | |||
540 | if (!debug_pagealloc) | ||
541 | spin_unlock(&cpa_lock); | ||
542 | base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); | ||
543 | if (!debug_pagealloc) | ||
544 | spin_lock(&cpa_lock); | ||
545 | if (!base) | ||
546 | return -ENOMEM; | ||
547 | 539 | ||
548 | spin_lock(&pgd_lock); | 540 | spin_lock(&pgd_lock); |
549 | /* | 541 | /* |
@@ -551,10 +543,11 @@ static int split_large_page(pte_t *kpte, unsigned long address) | |||
551 | * up for us already: | 543 | * up for us already: |
552 | */ | 544 | */ |
553 | tmp = lookup_address(address, &level); | 545 | tmp = lookup_address(address, &level); |
554 | if (tmp != kpte) | 546 | if (tmp != kpte) { |
555 | goto out_unlock; | 547 | spin_unlock(&pgd_lock); |
548 | return 1; | ||
549 | } | ||
556 | 550 | ||
557 | pbase = (pte_t *)page_address(base); | ||
558 | paravirt_alloc_pte(&init_mm, page_to_pfn(base)); | 551 | paravirt_alloc_pte(&init_mm, page_to_pfn(base)); |
559 | ref_prot = pte_pgprot(pte_clrhuge(*kpte)); | 552 | ref_prot = pte_pgprot(pte_clrhuge(*kpte)); |
560 | /* | 553 | /* |
@@ -601,17 +594,27 @@ static int split_large_page(pte_t *kpte, unsigned long address) | |||
601 | * going on. | 594 | * going on. |
602 | */ | 595 | */ |
603 | __flush_tlb_all(); | 596 | __flush_tlb_all(); |
597 | spin_unlock(&pgd_lock); | ||
604 | 598 | ||
605 | base = NULL; | 599 | return 0; |
600 | } | ||
606 | 601 | ||
607 | out_unlock: | 602 | static int split_large_page(pte_t *kpte, unsigned long address) |
608 | /* | 603 | { |
609 | * If we dropped out via the lookup_address check under | 604 | pte_t *pbase; |
610 | * pgd_lock then stick the page back into the pool: | 605 | struct page *base; |
611 | */ | 606 | |
612 | if (base) | 607 | if (!debug_pagealloc) |
608 | spin_unlock(&cpa_lock); | ||
609 | base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); | ||
610 | if (!debug_pagealloc) | ||
611 | spin_lock(&cpa_lock); | ||
612 | if (!base) | ||
613 | return -ENOMEM; | ||
614 | |||
615 | pbase = (pte_t *)page_address(base); | ||
616 | if (__split_large_page(kpte, address, pbase)) | ||
613 | __free_page(base); | 617 | __free_page(base); |
614 | spin_unlock(&pgd_lock); | ||
615 | 618 | ||
616 | return 0; | 619 | return 0; |
617 | } | 620 | } |
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index cdd0da9dd530..79836d01f789 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c | |||
@@ -141,11 +141,126 @@ static inline int save_add_info(void) {return 1;} | |||
141 | static inline int save_add_info(void) {return 0;} | 141 | static inline int save_add_info(void) {return 0;} |
142 | #endif | 142 | #endif |
143 | 143 | ||
144 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
145 | static void __init | ||
146 | handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable) | ||
147 | { | ||
148 | int overlap, i; | ||
149 | unsigned long start_pfn, end_pfn; | ||
150 | |||
151 | start_pfn = PFN_DOWN(start); | ||
152 | end_pfn = PFN_UP(end); | ||
153 | |||
154 | /* | ||
155 | * For movablemem_map=acpi: | ||
156 | * | ||
157 | * SRAT: |_____| |_____| |_________| |_________| ...... | ||
158 | * node id: 0 1 1 2 | ||
159 | * hotpluggable: n y y n | ||
160 | * movablemem_map: |_____| |_________| | ||
161 | * | ||
162 | * Using movablemem_map, we can prevent memblock from allocating memory | ||
163 | * on ZONE_MOVABLE at boot time. | ||
164 | * | ||
165 | * Before parsing SRAT, memblock has already reserved some memory ranges | ||
166 | * for other purposes, such as for the kernel image. We cannot prevent | ||
167 | * the kernel from using this memory, so we need to exclude it | ||
168 | * even if it is hotpluggable. | ||
169 | * Furthermore, to ensure the kernel has enough memory to boot, we make | ||
170 | * all the memory on the node which the kernel resides in | ||
171 | * un-hotpluggable. | ||
172 | */ | ||
173 | if (hotpluggable && movablemem_map.acpi) { | ||
174 | /* Exclude ranges reserved by memblock. */ | ||
175 | struct memblock_type *rgn = &memblock.reserved; | ||
176 | |||
177 | for (i = 0; i < rgn->cnt; i++) { | ||
178 | if (end <= rgn->regions[i].base || | ||
179 | start >= rgn->regions[i].base + | ||
180 | rgn->regions[i].size) | ||
181 | continue; | ||
182 | |||
183 | /* | ||
184 | * If the memory range overlaps the memory reserved by | ||
185 | * memblock, then the kernel resides in this node. | ||
186 | */ | ||
187 | node_set(node, movablemem_map.numa_nodes_kernel); | ||
188 | |||
189 | goto out; | ||
190 | } | ||
191 | |||
192 | /* | ||
193 | * If the kernel resides in this node, then the whole node | ||
194 | * should not be hotpluggable. | ||
195 | */ | ||
196 | if (node_isset(node, movablemem_map.numa_nodes_kernel)) | ||
197 | goto out; | ||
198 | |||
199 | insert_movablemem_map(start_pfn, end_pfn); | ||
200 | |||
201 | /* | ||
202 | * numa_nodes_hotplug nodemask represents which nodes are put | ||
203 | * into movablemem_map.map[]. | ||
204 | */ | ||
205 | node_set(node, movablemem_map.numa_nodes_hotplug); | ||
206 | goto out; | ||
207 | } | ||
208 | |||
209 | /* | ||
210 | * For movablemem_map=nn[KMG]@ss[KMG]: | ||
211 | * | ||
212 | * SRAT: |_____| |_____| |_________| |_________| ...... | ||
213 | * node id: 0 1 1 2 | ||
214 | * user specified: |__| |___| | ||
215 | * movablemem_map: |___| |_________| |______| ...... | ||
216 | * | ||
217 | * Using movablemem_map, we can prevent memblock from allocating memory | ||
218 | * on ZONE_MOVABLE at boot time. | ||
219 | * | ||
220 | * NOTE: In this case, SRAT info will be ignored. | ||
221 | */ | ||
222 | overlap = movablemem_map_overlap(start_pfn, end_pfn); | ||
223 | if (overlap >= 0) { | ||
224 | /* | ||
225 | * If part of this range is in movablemem_map, we need to | ||
226 | * add the range after it to extend the range to the end | ||
227 | * of the node, because from the min address specified to | ||
228 | * the end of the node will be ZONE_MOVABLE. | ||
229 | */ | ||
230 | start_pfn = max(start_pfn, | ||
231 | movablemem_map.map[overlap].start_pfn); | ||
232 | insert_movablemem_map(start_pfn, end_pfn); | ||
233 | |||
234 | /* | ||
235 | * Set the nodemask, so that if the address range on one node | ||
236 | * is not contiguous, we can add the subsequent ranges on the | ||
237 | * same node into movablemem_map. | ||
238 | */ | ||
239 | node_set(node, movablemem_map.numa_nodes_hotplug); | ||
240 | } else { | ||
241 | if (node_isset(node, movablemem_map.numa_nodes_hotplug)) | ||
242 | /* | ||
243 | * Insert the range if we already have movable ranges | ||
244 | * on the same node. | ||
245 | */ | ||
246 | insert_movablemem_map(start_pfn, end_pfn); | ||
247 | } | ||
248 | out: | ||
249 | return; | ||
250 | } | ||
251 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | ||
252 | static inline void | ||
253 | handle_movablemem(int node, u64 start, u64 end, u32 hotpluggable) | ||
254 | { | ||
255 | } | ||
256 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | ||
257 | |||
144 | /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ | 258 | /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ |
145 | int __init | 259 | int __init |
146 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | 260 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) |
147 | { | 261 | { |
148 | u64 start, end; | 262 | u64 start, end; |
263 | u32 hotpluggable; | ||
149 | int node, pxm; | 264 | int node, pxm; |
150 | 265 | ||
151 | if (srat_disabled()) | 266 | if (srat_disabled()) |
@@ -154,7 +269,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | |||
154 | goto out_err_bad_srat; | 269 | goto out_err_bad_srat; |
155 | if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) | 270 | if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) |
156 | goto out_err; | 271 | goto out_err; |
157 | if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) | 272 | hotpluggable = ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE; |
273 | if (hotpluggable && !save_add_info()) | ||
158 | goto out_err; | 274 | goto out_err; |
159 | 275 | ||
160 | start = ma->base_address; | 276 | start = ma->base_address; |
@@ -174,9 +290,12 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | |||
174 | 290 | ||
175 | node_set(node, numa_nodes_parsed); | 291 | node_set(node, numa_nodes_parsed); |
176 | 292 | ||
177 | printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", | 293 | printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx] %s\n", |
178 | node, pxm, | 294 | node, pxm, |
179 | (unsigned long long) start, (unsigned long long) end - 1); | 295 | (unsigned long long) start, (unsigned long long) end - 1, |
296 | hotpluggable ? "Hot Pluggable": ""); | ||
297 | |||
298 | handle_movablemem(node, start, end, hotpluggable); | ||
180 | 299 | ||
181 | return 0; | 300 | return 0; |
182 | out_err_bad_srat: | 301 | out_err_bad_srat: |
diff --git a/block/genhd.c b/block/genhd.c index 3993ebf4135f..5f73c2435fde 100644 --- a/block/genhd.c +++ b/block/genhd.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/mutex.h> | 18 | #include <linux/mutex.h> |
19 | #include <linux/idr.h> | 19 | #include <linux/idr.h> |
20 | #include <linux/log2.h> | 20 | #include <linux/log2.h> |
21 | #include <linux/pm_runtime.h> | ||
21 | 22 | ||
22 | #include "blk.h" | 23 | #include "blk.h" |
23 | 24 | ||
@@ -534,6 +535,14 @@ static void register_disk(struct gendisk *disk) | |||
534 | return; | 535 | return; |
535 | } | 536 | } |
536 | } | 537 | } |
538 | |||
539 | /* | ||
540 | * avoid probable deadlock caused by allocating memory with | ||
541 | * GFP_KERNEL in the runtime_resume callback of any of its | ||
542 | * ancestor devices | ||
543 | */ | ||
544 | pm_runtime_set_memalloc_noio(ddev, true); | ||
545 | |||
537 | disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); | 546 | disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); |
538 | disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); | 547 | disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); |
539 | 548 | ||
@@ -663,6 +672,7 @@ void del_gendisk(struct gendisk *disk) | |||
663 | disk->driverfs_dev = NULL; | 672 | disk->driverfs_dev = NULL; |
664 | if (!sysfs_deprecated) | 673 | if (!sysfs_deprecated) |
665 | sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); | 674 | sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); |
675 | pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); | ||
666 | device_del(disk_to_dev(disk)); | 676 | device_del(disk_to_dev(disk)); |
667 | } | 677 | } |
668 | EXPORT_SYMBOL(del_gendisk); | 678 | EXPORT_SYMBOL(del_gendisk); |
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 034d3e72aa92..da1f82b445e0 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c | |||
@@ -280,9 +280,11 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) | |||
280 | 280 | ||
281 | static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device) | 281 | static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device) |
282 | { | 282 | { |
283 | int result = 0; | 283 | int result = 0, nid; |
284 | struct acpi_memory_info *info, *n; | 284 | struct acpi_memory_info *info, *n; |
285 | 285 | ||
286 | nid = acpi_get_node(mem_device->device->handle); | ||
287 | |||
286 | list_for_each_entry_safe(info, n, &mem_device->res_list, list) { | 288 | list_for_each_entry_safe(info, n, &mem_device->res_list, list) { |
287 | if (info->failed) | 289 | if (info->failed) |
288 | /* The kernel does not use this memory block */ | 290 | /* The kernel does not use this memory block */ |
@@ -295,7 +297,9 @@ static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device) | |||
295 | */ | 297 | */ |
296 | return -EBUSY; | 298 | return -EBUSY; |
297 | 299 | ||
298 | result = remove_memory(info->start_addr, info->length); | 300 | if (nid < 0) |
301 | nid = memory_add_physaddr_to_nid(info->start_addr); | ||
302 | result = remove_memory(nid, info->start_addr, info->length); | ||
299 | if (result) | 303 | if (result) |
300 | return result; | 304 | return result; |
301 | 305 | ||
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 33e609f63585..59844ee149be 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c | |||
@@ -282,10 +282,10 @@ acpi_table_parse_srat(enum acpi_srat_type id, | |||
282 | handler, max_entries); | 282 | handler, max_entries); |
283 | } | 283 | } |
284 | 284 | ||
285 | int __init acpi_numa_init(void) | 285 | static int srat_mem_cnt; |
286 | { | ||
287 | int cnt = 0; | ||
288 | 286 | ||
287 | void __init early_parse_srat(void) | ||
288 | { | ||
289 | /* | 289 | /* |
290 | * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= | 290 | * Should not limit number with cpu num that is from NR_CPUS or nr_cpus= |
291 | * SRAT cpu entries could have different order with that in MADT. | 291 | * SRAT cpu entries could have different order with that in MADT. |
@@ -295,21 +295,24 @@ int __init acpi_numa_init(void) | |||
295 | /* SRAT: Static Resource Affinity Table */ | 295 | /* SRAT: Static Resource Affinity Table */ |
296 | if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { | 296 | if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { |
297 | acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, | 297 | acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, |
298 | acpi_parse_x2apic_affinity, 0); | 298 | acpi_parse_x2apic_affinity, 0); |
299 | acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, | 299 | acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, |
300 | acpi_parse_processor_affinity, 0); | 300 | acpi_parse_processor_affinity, 0); |
301 | cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, | 301 | srat_mem_cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, |
302 | acpi_parse_memory_affinity, | 302 | acpi_parse_memory_affinity, |
303 | NR_NODE_MEMBLKS); | 303 | NR_NODE_MEMBLKS); |
304 | } | 304 | } |
305 | } | ||
305 | 306 | ||
307 | int __init acpi_numa_init(void) | ||
308 | { | ||
306 | /* SLIT: System Locality Information Table */ | 309 | /* SLIT: System Locality Information Table */ |
307 | acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); | 310 | acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit); |
308 | 311 | ||
309 | acpi_numa_arch_fixup(); | 312 | acpi_numa_arch_fixup(); |
310 | 313 | ||
311 | if (cnt < 0) | 314 | if (srat_mem_cnt < 0) |
312 | return cnt; | 315 | return srat_mem_cnt; |
313 | else if (!parsed_numa_memblks) | 316 | else if (!parsed_numa_memblks) |
314 | return -ENOENT; | 317 | return -ENOENT; |
315 | return 0; | 318 | return 0; |
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index cbf1f122666b..df34bd04ae62 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c | |||
@@ -45,6 +45,7 @@ | |||
45 | #include <linux/cpuidle.h> | 45 | #include <linux/cpuidle.h> |
46 | #include <linux/slab.h> | 46 | #include <linux/slab.h> |
47 | #include <linux/acpi.h> | 47 | #include <linux/acpi.h> |
48 | #include <linux/memory_hotplug.h> | ||
48 | 49 | ||
49 | #include <asm/io.h> | 50 | #include <asm/io.h> |
50 | #include <asm/cpu.h> | 51 | #include <asm/cpu.h> |
@@ -641,6 +642,7 @@ static int acpi_processor_remove(struct acpi_device *device) | |||
641 | 642 | ||
642 | per_cpu(processors, pr->id) = NULL; | 643 | per_cpu(processors, pr->id) = NULL; |
643 | per_cpu(processor_device_array, pr->id) = NULL; | 644 | per_cpu(processor_device_array, pr->id) = NULL; |
645 | try_offline_node(cpu_to_node(pr->id)); | ||
644 | 646 | ||
645 | free: | 647 | free: |
646 | free_cpumask_var(pr->throttling.shared_cpu_map); | 648 | free_cpumask_var(pr->throttling.shared_cpu_map); |
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 83d0b17ba1c2..a51007b79032 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c | |||
@@ -693,6 +693,12 @@ int offline_memory_block(struct memory_block *mem) | |||
693 | return ret; | 693 | return ret; |
694 | } | 694 | } |
695 | 695 | ||
696 | /* return true if the memory block is offlined, otherwise, return false */ | ||
697 | bool is_memblock_offlined(struct memory_block *mem) | ||
698 | { | ||
699 | return mem->state == MEM_OFFLINE; | ||
700 | } | ||
701 | |||
696 | /* | 702 | /* |
697 | * Initialize the sysfs support for memory devices... | 703 | * Initialize the sysfs support for memory devices... |
698 | */ | 704 | */ |
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 3148b10dc2e5..1244930e3d7a 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c | |||
@@ -124,6 +124,76 @@ unsigned long pm_runtime_autosuspend_expiration(struct device *dev) | |||
124 | } | 124 | } |
125 | EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration); | 125 | EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration); |
126 | 126 | ||
127 | static int dev_memalloc_noio(struct device *dev, void *data) | ||
128 | { | ||
129 | return dev->power.memalloc_noio; | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag. | ||
134 | * @dev: Device to handle. | ||
135 | * @enable: True for setting the flag and False for clearing the flag. | ||
136 | * | ||
137 | * Set the flag for all devices in the path from the device to the | ||
138 | * root device in the device tree if @enable is true, otherwise clear | ||
139 | * the flag for devices in the path whose siblings don't set the flag. | ||
140 | * | ||
141 | * The function should only be called by block device, or network | ||
142 | * device driver for solving the deadlock problem during runtime | ||
143 | * resume/suspend: | ||
144 | * | ||
145 | * If memory allocation with GFP_KERNEL is called inside runtime | ||
146 | * resume/suspend callback of any one of its ancestors(or the | ||
147 | * block device itself), the deadlock may be triggered inside the | ||
148 | * memory allocation since it might not complete until the block | ||
149 | * device becomes active and the involved page I/O finishes. The | ||
150 | * situation was first pointed out by Alan Stern. Network devices | ||
151 | * are involved in iSCSI-like situations. | ||
152 | * | ||
153 | * The lock of dev_hotplug_mutex is held in the function for handling | ||
154 | * hotplug race because pm_runtime_set_memalloc_noio() may be called | ||
155 | * in async probe(). | ||
156 | * | ||
157 | * The function should be called between device_add() and device_del() | ||
158 | * on the affected device(block/network device). | ||
159 | */ | ||
160 | void pm_runtime_set_memalloc_noio(struct device *dev, bool enable) | ||
161 | { | ||
162 | static DEFINE_MUTEX(dev_hotplug_mutex); | ||
163 | |||
164 | mutex_lock(&dev_hotplug_mutex); | ||
165 | for (;;) { | ||
166 | bool enabled; | ||
167 | |||
168 | /* hold power lock since bitfield is not SMP-safe. */ | ||
169 | spin_lock_irq(&dev->power.lock); | ||
170 | enabled = dev->power.memalloc_noio; | ||
171 | dev->power.memalloc_noio = enable; | ||
172 | spin_unlock_irq(&dev->power.lock); | ||
173 | |||
174 | /* | ||
175 | * no need to enable ancestors any more if the device | ||
176 | * has been enabled. | ||
177 | */ | ||
178 | if (enabled && enable) | ||
179 | break; | ||
180 | |||
181 | dev = dev->parent; | ||
182 | |||
183 | /* | ||
184 | * clear flag of the parent device only if all the | ||
185 | * children don't set the flag because ancestor's | ||
186 | * flag was set by any one of the descendants. | ||
187 | */ | ||
188 | if (!dev || (!enable && | ||
189 | device_for_each_child(dev, NULL, | ||
190 | dev_memalloc_noio))) | ||
191 | break; | ||
192 | } | ||
193 | mutex_unlock(&dev_hotplug_mutex); | ||
194 | } | ||
195 | EXPORT_SYMBOL_GPL(pm_runtime_set_memalloc_noio); | ||
196 | |||
127 | /** | 197 | /** |
128 | * rpm_check_suspend_allowed - Test whether a device may be suspended. | 198 | * rpm_check_suspend_allowed - Test whether a device may be suspended. |
129 | * @dev: Device to test. | 199 | * @dev: Device to test. |
@@ -278,7 +348,24 @@ static int rpm_callback(int (*cb)(struct device *), struct device *dev) | |||
278 | if (!cb) | 348 | if (!cb) |
279 | return -ENOSYS; | 349 | return -ENOSYS; |
280 | 350 | ||
281 | retval = __rpm_callback(cb, dev); | 351 | if (dev->power.memalloc_noio) { |
352 | unsigned int noio_flag; | ||
353 | |||
354 | /* | ||
355 | * Deadlock might be caused if memory allocation with | ||
356 | * GFP_KERNEL happens inside runtime_suspend and | ||
357 | * runtime_resume callbacks of one block device's | ||
358 | * ancestor or the block device itself. Network | ||
359 | * device might be thought as part of iSCSI block | ||
360 | * device, so network device and its ancestor should | ||
361 | * be marked as memalloc_noio too. | ||
362 | */ | ||
363 | noio_flag = memalloc_noio_save(); | ||
364 | retval = __rpm_callback(cb, dev); | ||
365 | memalloc_noio_restore(noio_flag); | ||
366 | } else { | ||
367 | retval = __rpm_callback(cb, dev); | ||
368 | } | ||
282 | 369 | ||
283 | dev->power.runtime_error = retval; | 370 | dev->power.runtime_error = retval; |
284 | return retval != -EACCES ? retval : -EIO; | 371 | return retval != -EACCES ? retval : -EIO; |
diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 90723e65b081..0b5b5f619c75 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/types.h> | 21 | #include <linux/types.h> |
22 | #include <linux/bootmem.h> | 22 | #include <linux/bootmem.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/mm.h> | ||
24 | 25 | ||
25 | /* | 26 | /* |
26 | * Data types ------------------------------------------------------------------ | 27 | * Data types ------------------------------------------------------------------ |
@@ -52,6 +53,9 @@ static ssize_t start_show(struct firmware_map_entry *entry, char *buf); | |||
52 | static ssize_t end_show(struct firmware_map_entry *entry, char *buf); | 53 | static ssize_t end_show(struct firmware_map_entry *entry, char *buf); |
53 | static ssize_t type_show(struct firmware_map_entry *entry, char *buf); | 54 | static ssize_t type_show(struct firmware_map_entry *entry, char *buf); |
54 | 55 | ||
56 | static struct firmware_map_entry * __meminit | ||
57 | firmware_map_find_entry(u64 start, u64 end, const char *type); | ||
58 | |||
55 | /* | 59 | /* |
56 | * Static data ----------------------------------------------------------------- | 60 | * Static data ----------------------------------------------------------------- |
57 | */ | 61 | */ |
@@ -79,7 +83,52 @@ static const struct sysfs_ops memmap_attr_ops = { | |||
79 | .show = memmap_attr_show, | 83 | .show = memmap_attr_show, |
80 | }; | 84 | }; |
81 | 85 | ||
82 | static struct kobj_type memmap_ktype = { | 86 | /* Firmware memory map entries. */ |
87 | static LIST_HEAD(map_entries); | ||
88 | static DEFINE_SPINLOCK(map_entries_lock); | ||
89 | |||
90 | /* | ||
91 | * For memory hotplug, there is no way to free memory map entries allocated | ||
92 | * by boot mem after the system is up. So when we hot-remove memory whose | ||
93 | * map entry is allocated by bootmem, we need to remember the storage and | ||
94 | * reuse it when the memory is hot-added again. | ||
95 | */ | ||
96 | static LIST_HEAD(map_entries_bootmem); | ||
97 | static DEFINE_SPINLOCK(map_entries_bootmem_lock); | ||
98 | |||
99 | |||
100 | static inline struct firmware_map_entry * | ||
101 | to_memmap_entry(struct kobject *kobj) | ||
102 | { | ||
103 | return container_of(kobj, struct firmware_map_entry, kobj); | ||
104 | } | ||
105 | |||
106 | static void __meminit release_firmware_map_entry(struct kobject *kobj) | ||
107 | { | ||
108 | struct firmware_map_entry *entry = to_memmap_entry(kobj); | ||
109 | |||
110 | if (PageReserved(virt_to_page(entry))) { | ||
111 | /* | ||
112 | * Remember the storage allocated by bootmem, and reuse it when | ||
113 | * the memory is hot-added again. The entry will be added to | ||
114 | * map_entries_bootmem here, and deleted from &map_entries in | ||
115 | * firmware_map_remove_entry(). | ||
116 | */ | ||
117 | if (firmware_map_find_entry(entry->start, entry->end, | ||
118 | entry->type)) { | ||
119 | spin_lock(&map_entries_bootmem_lock); | ||
120 | list_add(&entry->list, &map_entries_bootmem); | ||
121 | spin_unlock(&map_entries_bootmem_lock); | ||
122 | } | ||
123 | |||
124 | return; | ||
125 | } | ||
126 | |||
127 | kfree(entry); | ||
128 | } | ||
129 | |||
130 | static struct kobj_type __refdata memmap_ktype = { | ||
131 | .release = release_firmware_map_entry, | ||
83 | .sysfs_ops = &memmap_attr_ops, | 132 | .sysfs_ops = &memmap_attr_ops, |
84 | .default_attrs = def_attrs, | 133 | .default_attrs = def_attrs, |
85 | }; | 134 | }; |
@@ -88,13 +137,6 @@ static struct kobj_type memmap_ktype = { | |||
88 | * Registration functions ------------------------------------------------------ | 137 | * Registration functions ------------------------------------------------------ |
89 | */ | 138 | */ |
90 | 139 | ||
91 | /* | ||
92 | * Firmware memory map entries. No locking is needed because the | ||
93 | * firmware_map_add() and firmware_map_add_early() functions are called | ||
94 | * in firmware initialisation code in one single thread of execution. | ||
95 | */ | ||
96 | static LIST_HEAD(map_entries); | ||
97 | |||
98 | /** | 140 | /** |
99 | * firmware_map_add_entry() - Does the real work to add a firmware memmap entry. | 141 | * firmware_map_add_entry() - Does the real work to add a firmware memmap entry. |
100 | * @start: Start of the memory range. | 142 | * @start: Start of the memory range. |
@@ -118,11 +160,25 @@ static int firmware_map_add_entry(u64 start, u64 end, | |||
118 | INIT_LIST_HEAD(&entry->list); | 160 | INIT_LIST_HEAD(&entry->list); |
119 | kobject_init(&entry->kobj, &memmap_ktype); | 161 | kobject_init(&entry->kobj, &memmap_ktype); |
120 | 162 | ||
163 | spin_lock(&map_entries_lock); | ||
121 | list_add_tail(&entry->list, &map_entries); | 164 | list_add_tail(&entry->list, &map_entries); |
165 | spin_unlock(&map_entries_lock); | ||
122 | 166 | ||
123 | return 0; | 167 | return 0; |
124 | } | 168 | } |
125 | 169 | ||
170 | /** | ||
171 | * firmware_map_remove_entry() - Does the real work to remove a firmware | ||
172 | * memmap entry. | ||
173 | * @entry: removed entry. | ||
174 | * | ||
175 | * The caller must hold map_entries_lock, and release it properly. | ||
176 | **/ | ||
177 | static inline void firmware_map_remove_entry(struct firmware_map_entry *entry) | ||
178 | { | ||
179 | list_del(&entry->list); | ||
180 | } | ||
181 | |||
126 | /* | 182 | /* |
127 | * Add memmap entry on sysfs | 183 | * Add memmap entry on sysfs |
128 | */ | 184 | */ |
@@ -144,6 +200,78 @@ static int add_sysfs_fw_map_entry(struct firmware_map_entry *entry) | |||
144 | return 0; | 200 | return 0; |
145 | } | 201 | } |
146 | 202 | ||
203 | /* | ||
204 | * Remove memmap entry on sysfs | ||
205 | */ | ||
206 | static inline void remove_sysfs_fw_map_entry(struct firmware_map_entry *entry) | ||
207 | { | ||
208 | kobject_put(&entry->kobj); | ||
209 | } | ||
210 | |||
211 | /* | ||
212 | * firmware_map_find_entry_in_list() - Search memmap entry in a given list. | ||
213 | * @start: Start of the memory range. | ||
214 | * @end: End of the memory range (exclusive). | ||
215 | * @type: Type of the memory range. | ||
216 | * @list: In which to find the entry. | ||
217 | * | ||
218 | * This function is to find the memmap entry of a given memory range in a | ||
219 | * given list. The caller must hold map_entries_lock, and must not release | ||
220 | * the lock until the processing of the returned entry has completed. | ||
221 | * | ||
222 | * Return: Pointer to the entry to be found on success, or NULL on failure. | ||
223 | */ | ||
224 | static struct firmware_map_entry * __meminit | ||
225 | firmware_map_find_entry_in_list(u64 start, u64 end, const char *type, | ||
226 | struct list_head *list) | ||
227 | { | ||
228 | struct firmware_map_entry *entry; | ||
229 | |||
230 | list_for_each_entry(entry, list, list) | ||
231 | if ((entry->start == start) && (entry->end == end) && | ||
232 | (!strcmp(entry->type, type))) { | ||
233 | return entry; | ||
234 | } | ||
235 | |||
236 | return NULL; | ||
237 | } | ||
238 | |||
239 | /* | ||
240 | * firmware_map_find_entry() - Search memmap entry in map_entries. | ||
241 | * @start: Start of the memory range. | ||
242 | * @end: End of the memory range (exclusive). | ||
243 | * @type: Type of the memory range. | ||
244 | * | ||
245 | * This function is to find the memmap entry of a given memory range. | ||
246 | * The caller must hold map_entries_lock, and must not release the lock | ||
247 | * until the processing of the returned entry has completed. | ||
248 | * | ||
249 | * Return: Pointer to the entry to be found on success, or NULL on failure. | ||
250 | */ | ||
251 | static struct firmware_map_entry * __meminit | ||
252 | firmware_map_find_entry(u64 start, u64 end, const char *type) | ||
253 | { | ||
254 | return firmware_map_find_entry_in_list(start, end, type, &map_entries); | ||
255 | } | ||
256 | |||
257 | /* | ||
258 | * firmware_map_find_entry_bootmem() - Search memmap entry in map_entries_bootmem. | ||
259 | * @start: Start of the memory range. | ||
260 | * @end: End of the memory range (exclusive). | ||
261 | * @type: Type of the memory range. | ||
262 | * | ||
263 | * This function is similar to firmware_map_find_entry except that it finds the | ||
264 | * given entry in map_entries_bootmem. | ||
265 | * | ||
266 | * Return: Pointer to the entry to be found on success, or NULL on failure. | ||
267 | */ | ||
268 | static struct firmware_map_entry * __meminit | ||
269 | firmware_map_find_entry_bootmem(u64 start, u64 end, const char *type) | ||
270 | { | ||
271 | return firmware_map_find_entry_in_list(start, end, type, | ||
272 | &map_entries_bootmem); | ||
273 | } | ||
274 | |||
147 | /** | 275 | /** |
148 | * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do | 276 | * firmware_map_add_hotplug() - Adds a firmware mapping entry when we do |
149 | * memory hotplug. | 277 | * memory hotplug. |
@@ -161,9 +289,19 @@ int __meminit firmware_map_add_hotplug(u64 start, u64 end, const char *type) | |||
161 | { | 289 | { |
162 | struct firmware_map_entry *entry; | 290 | struct firmware_map_entry *entry; |
163 | 291 | ||
164 | entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC); | 292 | entry = firmware_map_find_entry_bootmem(start, end, type); |
165 | if (!entry) | 293 | if (!entry) { |
166 | return -ENOMEM; | 294 | entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC); |
295 | if (!entry) | ||
296 | return -ENOMEM; | ||
297 | } else { | ||
298 | /* Reuse storage allocated by bootmem. */ | ||
299 | spin_lock(&map_entries_bootmem_lock); | ||
300 | list_del(&entry->list); | ||
301 | spin_unlock(&map_entries_bootmem_lock); | ||
302 | |||
303 | memset(entry, 0, sizeof(*entry)); | ||
304 | } | ||
167 | 305 | ||
168 | firmware_map_add_entry(start, end, type, entry); | 306 | firmware_map_add_entry(start, end, type, entry); |
169 | /* create the memmap entry */ | 307 | /* create the memmap entry */ |
@@ -196,6 +334,36 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type) | |||
196 | return firmware_map_add_entry(start, end, type, entry); | 334 | return firmware_map_add_entry(start, end, type, entry); |
197 | } | 335 | } |
198 | 336 | ||
337 | /** | ||
338 | * firmware_map_remove() - remove a firmware mapping entry | ||
339 | * @start: Start of the memory range. | ||
340 | * @end: End of the memory range. | ||
341 | * @type: Type of the memory range. | ||
342 | * | ||
343 | * removes a firmware mapping entry. | ||
344 | * | ||
345 | * Returns 0 on success, or -EINVAL if no entry. | ||
346 | **/ | ||
347 | int __meminit firmware_map_remove(u64 start, u64 end, const char *type) | ||
348 | { | ||
349 | struct firmware_map_entry *entry; | ||
350 | |||
351 | spin_lock(&map_entries_lock); | ||
352 | entry = firmware_map_find_entry(start, end - 1, type); | ||
353 | if (!entry) { | ||
354 | spin_unlock(&map_entries_lock); | ||
355 | return -EINVAL; | ||
356 | } | ||
357 | |||
358 | firmware_map_remove_entry(entry); | ||
359 | spin_unlock(&map_entries_lock); | ||
360 | |||
361 | /* remove the memmap entry */ | ||
362 | remove_sysfs_fw_map_entry(entry); | ||
363 | |||
364 | return 0; | ||
365 | } | ||
366 | |||
199 | /* | 367 | /* |
200 | * Sysfs functions ------------------------------------------------------------- | 368 | * Sysfs functions ------------------------------------------------------------- |
201 | */ | 369 | */ |
@@ -217,8 +385,10 @@ static ssize_t type_show(struct firmware_map_entry *entry, char *buf) | |||
217 | return snprintf(buf, PAGE_SIZE, "%s\n", entry->type); | 385 | return snprintf(buf, PAGE_SIZE, "%s\n", entry->type); |
218 | } | 386 | } |
219 | 387 | ||
220 | #define to_memmap_attr(_attr) container_of(_attr, struct memmap_attribute, attr) | 388 | static inline struct memmap_attribute *to_memmap_attr(struct attribute *attr) |
221 | #define to_memmap_entry(obj) container_of(obj, struct firmware_map_entry, kobj) | 389 | { |
390 | return container_of(attr, struct memmap_attribute, attr); | ||
391 | } | ||
222 | 392 | ||
223 | static ssize_t memmap_attr_show(struct kobject *kobj, | 393 | static ssize_t memmap_attr_show(struct kobject *kobj, |
224 | struct attribute *attr, char *buf) | 394 | struct attribute *attr, char *buf) |
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index d247a35da3c6..7b17a1fdeaf9 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c | |||
@@ -25,8 +25,8 @@ struct shadow_info { | |||
25 | /* | 25 | /* |
26 | * It would be nice if we scaled with the size of transaction. | 26 | * It would be nice if we scaled with the size of transaction. |
27 | */ | 27 | */ |
28 | #define HASH_SIZE 256 | 28 | #define DM_HASH_SIZE 256 |
29 | #define HASH_MASK (HASH_SIZE - 1) | 29 | #define DM_HASH_MASK (DM_HASH_SIZE - 1) |
30 | 30 | ||
31 | struct dm_transaction_manager { | 31 | struct dm_transaction_manager { |
32 | int is_clone; | 32 | int is_clone; |
@@ -36,7 +36,7 @@ struct dm_transaction_manager { | |||
36 | struct dm_space_map *sm; | 36 | struct dm_space_map *sm; |
37 | 37 | ||
38 | spinlock_t lock; | 38 | spinlock_t lock; |
39 | struct hlist_head buckets[HASH_SIZE]; | 39 | struct hlist_head buckets[DM_HASH_SIZE]; |
40 | }; | 40 | }; |
41 | 41 | ||
42 | /*----------------------------------------------------------------*/ | 42 | /*----------------------------------------------------------------*/ |
@@ -44,7 +44,7 @@ struct dm_transaction_manager { | |||
44 | static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b) | 44 | static int is_shadow(struct dm_transaction_manager *tm, dm_block_t b) |
45 | { | 45 | { |
46 | int r = 0; | 46 | int r = 0; |
47 | unsigned bucket = dm_hash_block(b, HASH_MASK); | 47 | unsigned bucket = dm_hash_block(b, DM_HASH_MASK); |
48 | struct shadow_info *si; | 48 | struct shadow_info *si; |
49 | struct hlist_node *n; | 49 | struct hlist_node *n; |
50 | 50 | ||
@@ -71,7 +71,7 @@ static void insert_shadow(struct dm_transaction_manager *tm, dm_block_t b) | |||
71 | si = kmalloc(sizeof(*si), GFP_NOIO); | 71 | si = kmalloc(sizeof(*si), GFP_NOIO); |
72 | if (si) { | 72 | if (si) { |
73 | si->where = b; | 73 | si->where = b; |
74 | bucket = dm_hash_block(b, HASH_MASK); | 74 | bucket = dm_hash_block(b, DM_HASH_MASK); |
75 | spin_lock(&tm->lock); | 75 | spin_lock(&tm->lock); |
76 | hlist_add_head(&si->hlist, tm->buckets + bucket); | 76 | hlist_add_head(&si->hlist, tm->buckets + bucket); |
77 | spin_unlock(&tm->lock); | 77 | spin_unlock(&tm->lock); |
@@ -86,7 +86,7 @@ static void wipe_shadow_table(struct dm_transaction_manager *tm) | |||
86 | int i; | 86 | int i; |
87 | 87 | ||
88 | spin_lock(&tm->lock); | 88 | spin_lock(&tm->lock); |
89 | for (i = 0; i < HASH_SIZE; i++) { | 89 | for (i = 0; i < DM_HASH_SIZE; i++) { |
90 | bucket = tm->buckets + i; | 90 | bucket = tm->buckets + i; |
91 | hlist_for_each_entry_safe(si, n, tmp, bucket, hlist) | 91 | hlist_for_each_entry_safe(si, n, tmp, bucket, hlist) |
92 | kfree(si); | 92 | kfree(si); |
@@ -115,7 +115,7 @@ static struct dm_transaction_manager *dm_tm_create(struct dm_block_manager *bm, | |||
115 | tm->sm = sm; | 115 | tm->sm = sm; |
116 | 116 | ||
117 | spin_lock_init(&tm->lock); | 117 | spin_lock_init(&tm->lock); |
118 | for (i = 0; i < HASH_SIZE; i++) | 118 | for (i = 0; i < DM_HASH_SIZE; i++) |
119 | INIT_HLIST_HEAD(tm->buckets + i); | 119 | INIT_HLIST_HEAD(tm->buckets + i); |
120 | 120 | ||
121 | return tm; | 121 | return tm; |
diff --git a/drivers/staging/zcache/zbud.c b/drivers/staging/zcache/zbud.c index 328c397ea5dc..fdff5c6a0239 100644 --- a/drivers/staging/zcache/zbud.c +++ b/drivers/staging/zcache/zbud.c | |||
@@ -404,7 +404,7 @@ static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage, | |||
404 | else | 404 | else |
405 | zbud_pers_pageframes--; | 405 | zbud_pers_pageframes--; |
406 | zbudpage_spin_unlock(zbudpage); | 406 | zbudpage_spin_unlock(zbudpage); |
407 | reset_page_mapcount(page); | 407 | page_mapcount_reset(page); |
408 | init_page_count(page); | 408 | init_page_count(page); |
409 | page->index = 0; | 409 | page->index = 0; |
410 | return page; | 410 | return page; |
diff --git a/drivers/staging/zsmalloc/zsmalloc-main.c b/drivers/staging/zsmalloc/zsmalloc-main.c index 06f73a93a44d..e78d262c5249 100644 --- a/drivers/staging/zsmalloc/zsmalloc-main.c +++ b/drivers/staging/zsmalloc/zsmalloc-main.c | |||
@@ -472,7 +472,7 @@ static void reset_page(struct page *page) | |||
472 | set_page_private(page, 0); | 472 | set_page_private(page, 0); |
473 | page->mapping = NULL; | 473 | page->mapping = NULL; |
474 | page->freelist = NULL; | 474 | page->freelist = NULL; |
475 | reset_page_mapcount(page); | 475 | page_mapcount_reset(page); |
476 | } | 476 | } |
477 | 477 | ||
478 | static void free_zspage(struct page *first_page) | 478 | static void free_zspage(struct page *first_page) |
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 1775ad471edd..5480352f984d 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c | |||
@@ -5177,6 +5177,7 @@ int usb_reset_device(struct usb_device *udev) | |||
5177 | { | 5177 | { |
5178 | int ret; | 5178 | int ret; |
5179 | int i; | 5179 | int i; |
5180 | unsigned int noio_flag; | ||
5180 | struct usb_host_config *config = udev->actconfig; | 5181 | struct usb_host_config *config = udev->actconfig; |
5181 | 5182 | ||
5182 | if (udev->state == USB_STATE_NOTATTACHED || | 5183 | if (udev->state == USB_STATE_NOTATTACHED || |
@@ -5186,6 +5187,17 @@ int usb_reset_device(struct usb_device *udev) | |||
5186 | return -EINVAL; | 5187 | return -EINVAL; |
5187 | } | 5188 | } |
5188 | 5189 | ||
5190 | /* | ||
5191 | * Don't allocate memory with GFP_KERNEL in current | ||
5192 | * context to avoid possible deadlock if usb mass | ||
5193 | * storage interface or usbnet interface(iSCSI case) | ||
5194 | * is included in current configuration. The easiest | ||
5195 | * approach is to do it for every device reset, | ||
5196 | * because the device 'memalloc_noio' flag may not | ||
5197 | * have been set before resetting the usb device. | ||
5198 | */ | ||
5199 | noio_flag = memalloc_noio_save(); | ||
5200 | |||
5189 | /* Prevent autosuspend during the reset */ | 5201 | /* Prevent autosuspend during the reset */ |
5190 | usb_autoresume_device(udev); | 5202 | usb_autoresume_device(udev); |
5191 | 5203 | ||
@@ -5230,6 +5242,7 @@ int usb_reset_device(struct usb_device *udev) | |||
5230 | } | 5242 | } |
5231 | 5243 | ||
5232 | usb_autosuspend_device(udev); | 5244 | usb_autosuspend_device(udev); |
5245 | memalloc_noio_restore(noio_flag); | ||
5233 | return ret; | 5246 | return ret; |
5234 | } | 5247 | } |
5235 | EXPORT_SYMBOL_GPL(usb_reset_device); | 5248 | EXPORT_SYMBOL_GPL(usb_reset_device); |
@@ -101,7 +101,7 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
101 | struct aio_ring *ring; | 101 | struct aio_ring *ring; |
102 | struct aio_ring_info *info = &ctx->ring_info; | 102 | struct aio_ring_info *info = &ctx->ring_info; |
103 | unsigned nr_events = ctx->max_reqs; | 103 | unsigned nr_events = ctx->max_reqs; |
104 | unsigned long size; | 104 | unsigned long size, populate; |
105 | int nr_pages; | 105 | int nr_pages; |
106 | 106 | ||
107 | /* Compensate for the ring buffer's head/tail overlap entry */ | 107 | /* Compensate for the ring buffer's head/tail overlap entry */ |
@@ -129,7 +129,8 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
129 | down_write(&ctx->mm->mmap_sem); | 129 | down_write(&ctx->mm->mmap_sem); |
130 | info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, | 130 | info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, |
131 | PROT_READ|PROT_WRITE, | 131 | PROT_READ|PROT_WRITE, |
132 | MAP_ANONYMOUS|MAP_PRIVATE, 0); | 132 | MAP_ANONYMOUS|MAP_PRIVATE, 0, |
133 | &populate); | ||
133 | if (IS_ERR((void *)info->mmap_base)) { | 134 | if (IS_ERR((void *)info->mmap_base)) { |
134 | up_write(&ctx->mm->mmap_sem); | 135 | up_write(&ctx->mm->mmap_sem); |
135 | info->mmap_size = 0; | 136 | info->mmap_size = 0; |
@@ -147,6 +148,8 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
147 | aio_free_ring(ctx); | 148 | aio_free_ring(ctx); |
148 | return -EAGAIN; | 149 | return -EAGAIN; |
149 | } | 150 | } |
151 | if (populate) | ||
152 | mm_populate(info->mmap_base, populate); | ||
150 | 153 | ||
151 | ctx->user_id = info->mmap_base; | 154 | ctx->user_id = info->mmap_base; |
152 | 155 | ||
diff --git a/fs/buffer.c b/fs/buffer.c index 2ea9cd44aeae..62169c192c21 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -3227,7 +3227,7 @@ static struct kmem_cache *bh_cachep __read_mostly; | |||
3227 | * Once the number of bh's in the machine exceeds this level, we start | 3227 | * Once the number of bh's in the machine exceeds this level, we start |
3228 | * stripping them in writeback. | 3228 | * stripping them in writeback. |
3229 | */ | 3229 | */ |
3230 | static int max_buffer_heads; | 3230 | static unsigned long max_buffer_heads; |
3231 | 3231 | ||
3232 | int buffer_heads_over_limit; | 3232 | int buffer_heads_over_limit; |
3233 | 3233 | ||
@@ -3343,7 +3343,7 @@ EXPORT_SYMBOL(bh_submit_read); | |||
3343 | 3343 | ||
3344 | void __init buffer_init(void) | 3344 | void __init buffer_init(void) |
3345 | { | 3345 | { |
3346 | int nrpages; | 3346 | unsigned long nrpages; |
3347 | 3347 | ||
3348 | bh_cachep = kmem_cache_create("buffer_head", | 3348 | bh_cachep = kmem_cache_create("buffer_head", |
3349 | sizeof(struct buffer_head), 0, | 3349 | sizeof(struct buffer_head), 0, |
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ac8ed96c4199..499e957510e7 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c | |||
@@ -151,7 +151,7 @@ get_nfs4_file(struct nfs4_file *fi) | |||
151 | } | 151 | } |
152 | 152 | ||
153 | static int num_delegations; | 153 | static int num_delegations; |
154 | unsigned int max_delegations; | 154 | unsigned long max_delegations; |
155 | 155 | ||
156 | /* | 156 | /* |
157 | * Open owner state (share locks) | 157 | * Open owner state (share locks) |
@@ -700,8 +700,8 @@ static int nfsd4_get_drc_mem(int slotsize, u32 num) | |||
700 | num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION); | 700 | num = min_t(u32, num, NFSD_MAX_SLOTS_PER_SESSION); |
701 | 701 | ||
702 | spin_lock(&nfsd_drc_lock); | 702 | spin_lock(&nfsd_drc_lock); |
703 | avail = min_t(int, NFSD_MAX_MEM_PER_SESSION, | 703 | avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, |
704 | nfsd_drc_max_mem - nfsd_drc_mem_used); | 704 | nfsd_drc_max_mem - nfsd_drc_mem_used); |
705 | num = min_t(int, num, avail / slotsize); | 705 | num = min_t(int, num, avail / slotsize); |
706 | nfsd_drc_mem_used += num * slotsize; | 706 | nfsd_drc_mem_used += num * slotsize; |
707 | spin_unlock(&nfsd_drc_lock); | 707 | spin_unlock(&nfsd_drc_lock); |
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index de23db255c69..07a473fd49bc 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h | |||
@@ -56,8 +56,8 @@ extern struct svc_version nfsd_version2, nfsd_version3, | |||
56 | extern u32 nfsd_supported_minorversion; | 56 | extern u32 nfsd_supported_minorversion; |
57 | extern struct mutex nfsd_mutex; | 57 | extern struct mutex nfsd_mutex; |
58 | extern spinlock_t nfsd_drc_lock; | 58 | extern spinlock_t nfsd_drc_lock; |
59 | extern unsigned int nfsd_drc_max_mem; | 59 | extern unsigned long nfsd_drc_max_mem; |
60 | extern unsigned int nfsd_drc_mem_used; | 60 | extern unsigned long nfsd_drc_mem_used; |
61 | 61 | ||
62 | extern const struct seq_operations nfs_exports_op; | 62 | extern const struct seq_operations nfs_exports_op; |
63 | 63 | ||
@@ -106,7 +106,7 @@ static inline int nfsd_v4client(struct svc_rqst *rq) | |||
106 | * NFSv4 State | 106 | * NFSv4 State |
107 | */ | 107 | */ |
108 | #ifdef CONFIG_NFSD_V4 | 108 | #ifdef CONFIG_NFSD_V4 |
109 | extern unsigned int max_delegations; | 109 | extern unsigned long max_delegations; |
110 | void nfs4_state_init(void); | 110 | void nfs4_state_init(void); |
111 | int nfsd4_init_slabs(void); | 111 | int nfsd4_init_slabs(void); |
112 | void nfsd4_free_slabs(void); | 112 | void nfsd4_free_slabs(void); |
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index cee62ab9d4a3..be7af509930c 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c | |||
@@ -59,8 +59,8 @@ DEFINE_MUTEX(nfsd_mutex); | |||
59 | * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. | 59 | * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage. |
60 | */ | 60 | */ |
61 | spinlock_t nfsd_drc_lock; | 61 | spinlock_t nfsd_drc_lock; |
62 | unsigned int nfsd_drc_max_mem; | 62 | unsigned long nfsd_drc_max_mem; |
63 | unsigned int nfsd_drc_mem_used; | 63 | unsigned long nfsd_drc_mem_used; |
64 | 64 | ||
65 | #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) | 65 | #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) |
66 | static struct svc_stat nfsd_acl_svcstats; | 66 | static struct svc_stat nfsd_acl_svcstats; |
@@ -342,7 +342,7 @@ static void set_max_drc(void) | |||
342 | >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; | 342 | >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; |
343 | nfsd_drc_mem_used = 0; | 343 | nfsd_drc_mem_used = 0; |
344 | spin_lock_init(&nfsd_drc_lock); | 344 | spin_lock_init(&nfsd_drc_lock); |
345 | dprintk("%s nfsd_drc_max_mem %u \n", __func__, nfsd_drc_max_mem); | 345 | dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem); |
346 | } | 346 | } |
347 | 347 | ||
348 | static int nfsd_get_default_max_blksize(void) | 348 | static int nfsd_get_default_max_blksize(void) |
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 80e4645f7990..1efaaa19c4f3 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c | |||
@@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
40 | * sysctl_overcommit_ratio / 100) + total_swap_pages; | 40 | * sysctl_overcommit_ratio / 100) + total_swap_pages; |
41 | 41 | ||
42 | cached = global_page_state(NR_FILE_PAGES) - | 42 | cached = global_page_state(NR_FILE_PAGES) - |
43 | total_swapcache_pages - i.bufferram; | 43 | total_swapcache_pages() - i.bufferram; |
44 | if (cached < 0) | 44 | if (cached < 0) |
45 | cached = 0; | 45 | cached = 0; |
46 | 46 | ||
@@ -109,7 +109,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
109 | K(i.freeram), | 109 | K(i.freeram), |
110 | K(i.bufferram), | 110 | K(i.bufferram), |
111 | K(cached), | 111 | K(cached), |
112 | K(total_swapcache_pages), | 112 | K(total_swapcache_pages()), |
113 | K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), | 113 | K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), |
114 | K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), | 114 | K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), |
115 | K(pages[LRU_ACTIVE_ANON]), | 115 | K(pages[LRU_ACTIVE_ANON]), |
@@ -158,7 +158,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
158 | vmi.used >> 10, | 158 | vmi.used >> 10, |
159 | vmi.largest_chunk >> 10 | 159 | vmi.largest_chunk >> 10 |
160 | #ifdef CONFIG_MEMORY_FAILURE | 160 | #ifdef CONFIG_MEMORY_FAILURE |
161 | ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) | 161 | ,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) |
162 | #endif | 162 | #endif |
163 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 163 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
164 | ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * | 164 | ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * |
diff --git a/include/linux/acpi.h b/include/linux/acpi.h index bcbdd7484e58..f46cfd73a553 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h | |||
@@ -485,6 +485,14 @@ static inline bool acpi_driver_match_device(struct device *dev, | |||
485 | 485 | ||
486 | #endif /* !CONFIG_ACPI */ | 486 | #endif /* !CONFIG_ACPI */ |
487 | 487 | ||
488 | #ifdef CONFIG_ACPI_NUMA | ||
489 | void __init early_parse_srat(void); | ||
490 | #else | ||
491 | static inline void early_parse_srat(void) | ||
492 | { | ||
493 | } | ||
494 | #endif | ||
495 | |||
488 | #ifdef CONFIG_ACPI | 496 | #ifdef CONFIG_ACPI |
489 | void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, | 497 | void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, |
490 | u32 pm1a_ctrl, u32 pm1b_ctrl)); | 498 | u32 pm1a_ctrl, u32 pm1b_ctrl)); |
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 3cd16ba82f15..cdc3bab01832 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h | |||
@@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat, | |||
53 | unsigned long size); | 53 | unsigned long size); |
54 | extern void free_bootmem(unsigned long physaddr, unsigned long size); | 54 | extern void free_bootmem(unsigned long physaddr, unsigned long size); |
55 | extern void free_bootmem_late(unsigned long physaddr, unsigned long size); | 55 | extern void free_bootmem_late(unsigned long physaddr, unsigned long size); |
56 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | ||
56 | 57 | ||
57 | /* | 58 | /* |
58 | * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, | 59 | * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, |
diff --git a/include/linux/compaction.h b/include/linux/compaction.h index cc7bddeaf553..091d72e70d8a 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h | |||
@@ -23,7 +23,7 @@ extern int fragmentation_index(struct zone *zone, unsigned int order); | |||
23 | extern unsigned long try_to_compact_pages(struct zonelist *zonelist, | 23 | extern unsigned long try_to_compact_pages(struct zonelist *zonelist, |
24 | int order, gfp_t gfp_mask, nodemask_t *mask, | 24 | int order, gfp_t gfp_mask, nodemask_t *mask, |
25 | bool sync, bool *contended); | 25 | bool sync, bool *contended); |
26 | extern int compact_pgdat(pg_data_t *pgdat, int order); | 26 | extern void compact_pgdat(pg_data_t *pgdat, int order); |
27 | extern void reset_isolation_suitable(pg_data_t *pgdat); | 27 | extern void reset_isolation_suitable(pg_data_t *pgdat); |
28 | extern unsigned long compaction_suitable(struct zone *zone, int order); | 28 | extern unsigned long compaction_suitable(struct zone *zone, int order); |
29 | 29 | ||
@@ -80,9 +80,8 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
80 | return COMPACT_CONTINUE; | 80 | return COMPACT_CONTINUE; |
81 | } | 81 | } |
82 | 82 | ||
83 | static inline int compact_pgdat(pg_data_t *pgdat, int order) | 83 | static inline void compact_pgdat(pg_data_t *pgdat, int order) |
84 | { | 84 | { |
85 | return COMPACT_CONTINUE; | ||
86 | } | 85 | } |
87 | 86 | ||
88 | static inline void reset_isolation_suitable(pg_data_t *pgdat) | 87 | static inline void reset_isolation_suitable(pg_data_t *pgdat) |
diff --git a/include/linux/firmware-map.h b/include/linux/firmware-map.h index 43fe52fcef0f..71d4fa721db9 100644 --- a/include/linux/firmware-map.h +++ b/include/linux/firmware-map.h | |||
@@ -25,6 +25,7 @@ | |||
25 | 25 | ||
26 | int firmware_map_add_early(u64 start, u64 end, const char *type); | 26 | int firmware_map_add_early(u64 start, u64 end, const char *type); |
27 | int firmware_map_add_hotplug(u64 start, u64 end, const char *type); | 27 | int firmware_map_add_hotplug(u64 start, u64 end, const char *type); |
28 | int firmware_map_remove(u64 start, u64 end, const char *type); | ||
28 | 29 | ||
29 | #else /* CONFIG_FIRMWARE_MEMMAP */ | 30 | #else /* CONFIG_FIRMWARE_MEMMAP */ |
30 | 31 | ||
@@ -38,6 +39,11 @@ static inline int firmware_map_add_hotplug(u64 start, u64 end, const char *type) | |||
38 | return 0; | 39 | return 0; |
39 | } | 40 | } |
40 | 41 | ||
42 | static inline int firmware_map_remove(u64 start, u64 end, const char *type) | ||
43 | { | ||
44 | return 0; | ||
45 | } | ||
46 | |||
41 | #endif /* CONFIG_FIRMWARE_MEMMAP */ | 47 | #endif /* CONFIG_FIRMWARE_MEMMAP */ |
42 | 48 | ||
43 | #endif /* _LINUX_FIRMWARE_MAP_H */ | 49 | #endif /* _LINUX_FIRMWARE_MAP_H */ |
diff --git a/include/linux/highmem.h b/include/linux/highmem.h index ef788b5b4a35..7fb31da45d03 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h | |||
@@ -219,12 +219,6 @@ static inline void zero_user(struct page *page, | |||
219 | zero_user_segments(page, start, start + size, 0, 0); | 219 | zero_user_segments(page, start, start + size, 0, 0); |
220 | } | 220 | } |
221 | 221 | ||
222 | static inline void __deprecated memclear_highpage_flush(struct page *page, | ||
223 | unsigned int offset, unsigned int size) | ||
224 | { | ||
225 | zero_user(page, offset, size); | ||
226 | } | ||
227 | |||
228 | #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE | 222 | #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE |
229 | 223 | ||
230 | static inline void copy_user_highpage(struct page *to, struct page *from, | 224 | static inline void copy_user_highpage(struct page *to, struct page *from, |
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1d76f8ca90f0..ee1c244a62a1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -113,7 +113,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma, | |||
113 | do { \ | 113 | do { \ |
114 | pmd_t *____pmd = (__pmd); \ | 114 | pmd_t *____pmd = (__pmd); \ |
115 | anon_vma_lock_write(__anon_vma); \ | 115 | anon_vma_lock_write(__anon_vma); \ |
116 | anon_vma_unlock(__anon_vma); \ | 116 | anon_vma_unlock_write(__anon_vma); \ |
117 | BUG_ON(pmd_trans_splitting(*____pmd) || \ | 117 | BUG_ON(pmd_trans_splitting(*____pmd) || \ |
118 | pmd_trans_huge(*____pmd)); \ | 118 | pmd_trans_huge(*____pmd)); \ |
119 | } while (0) | 119 | } while (0) |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0c80d3f57a5b..eedc334fb6f5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -43,9 +43,9 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, | |||
43 | #endif | 43 | #endif |
44 | 44 | ||
45 | int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); | 45 | int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); |
46 | int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, | 46 | long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, |
47 | struct page **, struct vm_area_struct **, | 47 | struct page **, struct vm_area_struct **, |
48 | unsigned long *, int *, int, unsigned int flags); | 48 | unsigned long *, unsigned long *, long, unsigned int); |
49 | void unmap_hugepage_range(struct vm_area_struct *, | 49 | void unmap_hugepage_range(struct vm_area_struct *, |
50 | unsigned long, unsigned long, struct page *); | 50 | unsigned long, unsigned long, struct page *); |
51 | void __unmap_hugepage_range_final(struct mmu_gather *tlb, | 51 | void __unmap_hugepage_range_final(struct mmu_gather *tlb, |
diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 3319a6967626..45c9b6a17bcb 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h | |||
@@ -16,9 +16,6 @@ | |||
16 | struct stable_node; | 16 | struct stable_node; |
17 | struct mem_cgroup; | 17 | struct mem_cgroup; |
18 | 18 | ||
19 | struct page *ksm_does_need_to_copy(struct page *page, | ||
20 | struct vm_area_struct *vma, unsigned long address); | ||
21 | |||
22 | #ifdef CONFIG_KSM | 19 | #ifdef CONFIG_KSM |
23 | int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | 20 | int ksm_madvise(struct vm_area_struct *vma, unsigned long start, |
24 | unsigned long end, int advice, unsigned long *vm_flags); | 21 | unsigned long end, int advice, unsigned long *vm_flags); |
@@ -73,15 +70,8 @@ static inline void set_page_stable_node(struct page *page, | |||
73 | * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, | 70 | * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, |
74 | * but what if the vma was unmerged while the page was swapped out? | 71 | * but what if the vma was unmerged while the page was swapped out? |
75 | */ | 72 | */ |
76 | static inline int ksm_might_need_to_copy(struct page *page, | 73 | struct page *ksm_might_need_to_copy(struct page *page, |
77 | struct vm_area_struct *vma, unsigned long address) | 74 | struct vm_area_struct *vma, unsigned long address); |
78 | { | ||
79 | struct anon_vma *anon_vma = page_anon_vma(page); | ||
80 | |||
81 | return anon_vma && | ||
82 | (anon_vma->root != vma->anon_vma->root || | ||
83 | page->index != linear_page_index(vma, address)); | ||
84 | } | ||
85 | 75 | ||
86 | int page_referenced_ksm(struct page *page, | 76 | int page_referenced_ksm(struct page *page, |
87 | struct mem_cgroup *memcg, unsigned long *vm_flags); | 77 | struct mem_cgroup *memcg, unsigned long *vm_flags); |
@@ -113,10 +103,10 @@ static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
113 | return 0; | 103 | return 0; |
114 | } | 104 | } |
115 | 105 | ||
116 | static inline int ksm_might_need_to_copy(struct page *page, | 106 | static inline struct page *ksm_might_need_to_copy(struct page *page, |
117 | struct vm_area_struct *vma, unsigned long address) | 107 | struct vm_area_struct *vma, unsigned long address) |
118 | { | 108 | { |
119 | return 0; | 109 | return page; |
120 | } | 110 | } |
121 | 111 | ||
122 | static inline int page_referenced_ksm(struct page *page, | 112 | static inline int page_referenced_ksm(struct page *page, |
diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f388203db7e8..3e5ecb2d790e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h | |||
@@ -42,6 +42,7 @@ struct memblock { | |||
42 | 42 | ||
43 | extern struct memblock memblock; | 43 | extern struct memblock memblock; |
44 | extern int memblock_debug; | 44 | extern int memblock_debug; |
45 | extern struct movablemem_map movablemem_map; | ||
45 | 46 | ||
46 | #define memblock_dbg(fmt, ...) \ | 47 | #define memblock_dbg(fmt, ...) \ |
47 | if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) | 48 | if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) |
@@ -60,6 +61,7 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size); | |||
60 | void memblock_trim_memory(phys_addr_t align); | 61 | void memblock_trim_memory(phys_addr_t align); |
61 | 62 | ||
62 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 63 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
64 | |||
63 | void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, | 65 | void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, |
64 | unsigned long *out_end_pfn, int *out_nid); | 66 | unsigned long *out_end_pfn, int *out_nid); |
65 | 67 | ||
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 28bd5fa2ff2e..d6183f06d8c1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -116,7 +116,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); | |||
116 | * For memory reclaim. | 116 | * For memory reclaim. |
117 | */ | 117 | */ |
118 | int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec); | 118 | int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec); |
119 | int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec); | ||
120 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); | 119 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); |
121 | unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); | 120 | unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); |
122 | void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); | 121 | void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); |
@@ -321,12 +320,6 @@ mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | |||
321 | return 1; | 320 | return 1; |
322 | } | 321 | } |
323 | 322 | ||
324 | static inline int | ||
325 | mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) | ||
326 | { | ||
327 | return 1; | ||
328 | } | ||
329 | |||
330 | static inline unsigned long | 323 | static inline unsigned long |
331 | mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) | 324 | mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) |
332 | { | 325 | { |
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4a45c4e50025..b6a3be7d47bf 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h | |||
@@ -96,6 +96,7 @@ extern void __online_page_free(struct page *page); | |||
96 | 96 | ||
97 | #ifdef CONFIG_MEMORY_HOTREMOVE | 97 | #ifdef CONFIG_MEMORY_HOTREMOVE |
98 | extern bool is_pageblock_removable_nolock(struct page *page); | 98 | extern bool is_pageblock_removable_nolock(struct page *page); |
99 | extern int arch_remove_memory(u64 start, u64 size); | ||
99 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 100 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
100 | 101 | ||
101 | /* reasonably generic interface to expand the physical pages in a zone */ | 102 | /* reasonably generic interface to expand the physical pages in a zone */ |
@@ -173,17 +174,16 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) | |||
173 | #endif /* CONFIG_NUMA */ | 174 | #endif /* CONFIG_NUMA */ |
174 | #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ | 175 | #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ |
175 | 176 | ||
176 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | 177 | #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE |
178 | extern void register_page_bootmem_info_node(struct pglist_data *pgdat); | ||
179 | #else | ||
177 | static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) | 180 | static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) |
178 | { | 181 | { |
179 | } | 182 | } |
180 | static inline void put_page_bootmem(struct page *page) | ||
181 | { | ||
182 | } | ||
183 | #else | ||
184 | extern void register_page_bootmem_info_node(struct pglist_data *pgdat); | ||
185 | extern void put_page_bootmem(struct page *page); | ||
186 | #endif | 183 | #endif |
184 | extern void put_page_bootmem(struct page *page); | ||
185 | extern void get_page_bootmem(unsigned long ingo, struct page *page, | ||
186 | unsigned long type); | ||
187 | 187 | ||
188 | /* | 188 | /* |
189 | * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug | 189 | * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug |
@@ -233,6 +233,7 @@ static inline void unlock_memory_hotplug(void) {} | |||
233 | #ifdef CONFIG_MEMORY_HOTREMOVE | 233 | #ifdef CONFIG_MEMORY_HOTREMOVE |
234 | 234 | ||
235 | extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); | 235 | extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); |
236 | extern void try_offline_node(int nid); | ||
236 | 237 | ||
237 | #else | 238 | #else |
238 | static inline int is_mem_section_removable(unsigned long pfn, | 239 | static inline int is_mem_section_removable(unsigned long pfn, |
@@ -240,6 +241,8 @@ static inline int is_mem_section_removable(unsigned long pfn, | |||
240 | { | 241 | { |
241 | return 0; | 242 | return 0; |
242 | } | 243 | } |
244 | |||
245 | static inline void try_offline_node(int nid) {} | ||
243 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 246 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
244 | 247 | ||
245 | extern int mem_online_node(int nid); | 248 | extern int mem_online_node(int nid); |
@@ -247,7 +250,8 @@ extern int add_memory(int nid, u64 start, u64 size); | |||
247 | extern int arch_add_memory(int nid, u64 start, u64 size); | 250 | extern int arch_add_memory(int nid, u64 start, u64 size); |
248 | extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); | 251 | extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); |
249 | extern int offline_memory_block(struct memory_block *mem); | 252 | extern int offline_memory_block(struct memory_block *mem); |
250 | extern int remove_memory(u64 start, u64 size); | 253 | extern bool is_memblock_offlined(struct memory_block *mem); |
254 | extern int remove_memory(int nid, u64 start, u64 size); | ||
251 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | 255 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, |
252 | int nr_pages); | 256 | int nr_pages); |
253 | extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms); | 257 | extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms); |
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 1e9f627967a3..a405d3dc0f61 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
@@ -40,11 +40,9 @@ extern void putback_movable_pages(struct list_head *l); | |||
40 | extern int migrate_page(struct address_space *, | 40 | extern int migrate_page(struct address_space *, |
41 | struct page *, struct page *, enum migrate_mode); | 41 | struct page *, struct page *, enum migrate_mode); |
42 | extern int migrate_pages(struct list_head *l, new_page_t x, | 42 | extern int migrate_pages(struct list_head *l, new_page_t x, |
43 | unsigned long private, bool offlining, | 43 | unsigned long private, enum migrate_mode mode, int reason); |
44 | enum migrate_mode mode, int reason); | ||
45 | extern int migrate_huge_page(struct page *, new_page_t x, | 44 | extern int migrate_huge_page(struct page *, new_page_t x, |
46 | unsigned long private, bool offlining, | 45 | unsigned long private, enum migrate_mode mode); |
47 | enum migrate_mode mode); | ||
48 | 46 | ||
49 | extern int fail_migrate_page(struct address_space *, | 47 | extern int fail_migrate_page(struct address_space *, |
50 | struct page *, struct page *); | 48 | struct page *, struct page *); |
@@ -62,11 +60,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
62 | static inline void putback_lru_pages(struct list_head *l) {} | 60 | static inline void putback_lru_pages(struct list_head *l) {} |
63 | static inline void putback_movable_pages(struct list_head *l) {} | 61 | static inline void putback_movable_pages(struct list_head *l) {} |
64 | static inline int migrate_pages(struct list_head *l, new_page_t x, | 62 | static inline int migrate_pages(struct list_head *l, new_page_t x, |
65 | unsigned long private, bool offlining, | 63 | unsigned long private, enum migrate_mode mode, int reason) |
66 | enum migrate_mode mode, int reason) { return -ENOSYS; } | 64 | { return -ENOSYS; } |
67 | static inline int migrate_huge_page(struct page *page, new_page_t x, | 65 | static inline int migrate_huge_page(struct page *page, new_page_t x, |
68 | unsigned long private, bool offlining, | 66 | unsigned long private, enum migrate_mode mode) |
69 | enum migrate_mode mode) { return -ENOSYS; } | 67 | { return -ENOSYS; } |
70 | 68 | ||
71 | static inline int migrate_prep(void) { return -ENOSYS; } | 69 | static inline int migrate_prep(void) { return -ENOSYS; } |
72 | static inline int migrate_prep_local(void) { return -ENOSYS; } | 70 | static inline int migrate_prep_local(void) { return -ENOSYS; } |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 9d9dcc35d6a1..e7c3f9a0111a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -87,6 +87,7 @@ extern unsigned int kobjsize(const void *objp); | |||
87 | #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ | 87 | #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ |
88 | #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ | 88 | #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ |
89 | 89 | ||
90 | #define VM_POPULATE 0x00001000 | ||
90 | #define VM_LOCKED 0x00002000 | 91 | #define VM_LOCKED 0x00002000 |
91 | #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ | 92 | #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ |
92 | 93 | ||
@@ -366,7 +367,7 @@ static inline struct page *compound_head(struct page *page) | |||
366 | * both from it and to it can be tracked, using atomic_inc_and_test | 367 | * both from it and to it can be tracked, using atomic_inc_and_test |
367 | * and atomic_add_negative(-1). | 368 | * and atomic_add_negative(-1). |
368 | */ | 369 | */ |
369 | static inline void reset_page_mapcount(struct page *page) | 370 | static inline void page_mapcount_reset(struct page *page) |
370 | { | 371 | { |
371 | atomic_set(&(page)->_mapcount, -1); | 372 | atomic_set(&(page)->_mapcount, -1); |
372 | } | 373 | } |
@@ -580,50 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
580 | * sets it, so none of the operations on it need to be atomic. | 581 | * sets it, so none of the operations on it need to be atomic. |
581 | */ | 582 | */ |
582 | 583 | ||
583 | 584 | /* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */ | |
584 | /* | ||
585 | * page->flags layout: | ||
586 | * | ||
587 | * There are three possibilities for how page->flags get | ||
588 | * laid out. The first is for the normal case, without | ||
589 | * sparsemem. The second is for sparsemem when there is | ||
590 | * plenty of space for node and section. The last is when | ||
591 | * we have run out of space and have to fall back to an | ||
592 | * alternate (slower) way of determining the node. | ||
593 | * | ||
594 | * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | | ||
595 | * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | | ||
596 | * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS | | ||
597 | */ | ||
598 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | ||
599 | #define SECTIONS_WIDTH SECTIONS_SHIFT | ||
600 | #else | ||
601 | #define SECTIONS_WIDTH 0 | ||
602 | #endif | ||
603 | |||
604 | #define ZONES_WIDTH ZONES_SHIFT | ||
605 | |||
606 | #if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS | ||
607 | #define NODES_WIDTH NODES_SHIFT | ||
608 | #else | ||
609 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | ||
610 | #error "Vmemmap: No space for nodes field in page flags" | ||
611 | #endif | ||
612 | #define NODES_WIDTH 0 | ||
613 | #endif | ||
614 | |||
615 | /* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */ | ||
616 | #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) | 585 | #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) |
617 | #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) | 586 | #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) |
618 | #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) | 587 | #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) |
619 | 588 | #define LAST_NID_PGOFF (ZONES_PGOFF - LAST_NID_WIDTH) | |
620 | /* | ||
621 | * We are going to use the flags for the page to node mapping if its in | ||
622 | * there. This includes the case where there is no node, so it is implicit. | ||
623 | */ | ||
624 | #if !(NODES_WIDTH > 0 || NODES_SHIFT == 0) | ||
625 | #define NODE_NOT_IN_PAGE_FLAGS | ||
626 | #endif | ||
627 | 589 | ||
628 | /* | 590 | /* |
629 | * Define the bit shifts to access each section. For non-existent | 591 | * Define the bit shifts to access each section. For non-existent |
@@ -633,6 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
633 | #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) | 595 | #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) |
634 | #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) | 596 | #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) |
635 | #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) | 597 | #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) |
598 | #define LAST_NID_PGSHIFT (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0)) | ||
636 | 599 | ||
637 | /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ | 600 | /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ |
638 | #ifdef NODE_NOT_IN_PAGE_FLAGS | 601 | #ifdef NODE_NOT_IN_PAGE_FLAGS |
@@ -654,6 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
654 | #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) | 617 | #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) |
655 | #define NODES_MASK ((1UL << NODES_WIDTH) - 1) | 618 | #define NODES_MASK ((1UL << NODES_WIDTH) - 1) |
656 | #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) | 619 | #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) |
620 | #define LAST_NID_MASK ((1UL << LAST_NID_WIDTH) - 1) | ||
657 | #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) | 621 | #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) |
658 | 622 | ||
659 | static inline enum zone_type page_zonenum(const struct page *page) | 623 | static inline enum zone_type page_zonenum(const struct page *page) |
@@ -661,6 +625,10 @@ static inline enum zone_type page_zonenum(const struct page *page) | |||
661 | return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; | 625 | return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; |
662 | } | 626 | } |
663 | 627 | ||
628 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | ||
629 | #define SECTION_IN_PAGE_FLAGS | ||
630 | #endif | ||
631 | |||
664 | /* | 632 | /* |
665 | * The identification function is only used by the buddy allocator for | 633 | * The identification function is only used by the buddy allocator for |
666 | * determining if two pages could be buddies. We are not really | 634 | * determining if two pages could be buddies. We are not really |
@@ -693,31 +661,48 @@ static inline int page_to_nid(const struct page *page) | |||
693 | #endif | 661 | #endif |
694 | 662 | ||
695 | #ifdef CONFIG_NUMA_BALANCING | 663 | #ifdef CONFIG_NUMA_BALANCING |
696 | static inline int page_xchg_last_nid(struct page *page, int nid) | 664 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS |
665 | static inline int page_nid_xchg_last(struct page *page, int nid) | ||
697 | { | 666 | { |
698 | return xchg(&page->_last_nid, nid); | 667 | return xchg(&page->_last_nid, nid); |
699 | } | 668 | } |
700 | 669 | ||
701 | static inline int page_last_nid(struct page *page) | 670 | static inline int page_nid_last(struct page *page) |
702 | { | 671 | { |
703 | return page->_last_nid; | 672 | return page->_last_nid; |
704 | } | 673 | } |
705 | static inline void reset_page_last_nid(struct page *page) | 674 | static inline void page_nid_reset_last(struct page *page) |
706 | { | 675 | { |
707 | page->_last_nid = -1; | 676 | page->_last_nid = -1; |
708 | } | 677 | } |
709 | #else | 678 | #else |
710 | static inline int page_xchg_last_nid(struct page *page, int nid) | 679 | static inline int page_nid_last(struct page *page) |
680 | { | ||
681 | return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK; | ||
682 | } | ||
683 | |||
684 | extern int page_nid_xchg_last(struct page *page, int nid); | ||
685 | |||
686 | static inline void page_nid_reset_last(struct page *page) | ||
687 | { | ||
688 | int nid = (1 << LAST_NID_SHIFT) - 1; | ||
689 | |||
690 | page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); | ||
691 | page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; | ||
692 | } | ||
693 | #endif /* LAST_NID_NOT_IN_PAGE_FLAGS */ | ||
694 | #else | ||
695 | static inline int page_nid_xchg_last(struct page *page, int nid) | ||
711 | { | 696 | { |
712 | return page_to_nid(page); | 697 | return page_to_nid(page); |
713 | } | 698 | } |
714 | 699 | ||
715 | static inline int page_last_nid(struct page *page) | 700 | static inline int page_nid_last(struct page *page) |
716 | { | 701 | { |
717 | return page_to_nid(page); | 702 | return page_to_nid(page); |
718 | } | 703 | } |
719 | 704 | ||
720 | static inline void reset_page_last_nid(struct page *page) | 705 | static inline void page_nid_reset_last(struct page *page) |
721 | { | 706 | { |
722 | } | 707 | } |
723 | #endif | 708 | #endif |
@@ -727,7 +712,7 @@ static inline struct zone *page_zone(const struct page *page) | |||
727 | return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; | 712 | return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; |
728 | } | 713 | } |
729 | 714 | ||
730 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | 715 | #ifdef SECTION_IN_PAGE_FLAGS |
731 | static inline void set_page_section(struct page *page, unsigned long section) | 716 | static inline void set_page_section(struct page *page, unsigned long section) |
732 | { | 717 | { |
733 | page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); | 718 | page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); |
@@ -757,7 +742,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone, | |||
757 | { | 742 | { |
758 | set_page_zone(page, zone); | 743 | set_page_zone(page, zone); |
759 | set_page_node(page, node); | 744 | set_page_node(page, node); |
760 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | 745 | #ifdef SECTION_IN_PAGE_FLAGS |
761 | set_page_section(page, pfn_to_section_nr(pfn)); | 746 | set_page_section(page, pfn_to_section_nr(pfn)); |
762 | #endif | 747 | #endif |
763 | } | 748 | } |
@@ -817,18 +802,7 @@ void page_address_init(void); | |||
817 | #define PAGE_MAPPING_KSM 2 | 802 | #define PAGE_MAPPING_KSM 2 |
818 | #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) | 803 | #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) |
819 | 804 | ||
820 | extern struct address_space swapper_space; | 805 | extern struct address_space *page_mapping(struct page *page); |
821 | static inline struct address_space *page_mapping(struct page *page) | ||
822 | { | ||
823 | struct address_space *mapping = page->mapping; | ||
824 | |||
825 | VM_BUG_ON(PageSlab(page)); | ||
826 | if (unlikely(PageSwapCache(page))) | ||
827 | mapping = &swapper_space; | ||
828 | else if ((unsigned long)mapping & PAGE_MAPPING_ANON) | ||
829 | mapping = NULL; | ||
830 | return mapping; | ||
831 | } | ||
832 | 806 | ||
833 | /* Neutral page->mapping pointer to address_space or anon_vma or other */ | 807 | /* Neutral page->mapping pointer to address_space or anon_vma or other */ |
834 | static inline void *page_rmapping(struct page *page) | 808 | static inline void *page_rmapping(struct page *page) |
@@ -1035,18 +1009,18 @@ static inline int fixup_user_fault(struct task_struct *tsk, | |||
1035 | } | 1009 | } |
1036 | #endif | 1010 | #endif |
1037 | 1011 | ||
1038 | extern int make_pages_present(unsigned long addr, unsigned long end); | ||
1039 | extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); | 1012 | extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); |
1040 | extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, | 1013 | extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, |
1041 | void *buf, int len, int write); | 1014 | void *buf, int len, int write); |
1042 | 1015 | ||
1043 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1016 | long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
1044 | unsigned long start, int len, unsigned int foll_flags, | 1017 | unsigned long start, unsigned long nr_pages, |
1045 | struct page **pages, struct vm_area_struct **vmas, | 1018 | unsigned int foll_flags, struct page **pages, |
1046 | int *nonblocking); | 1019 | struct vm_area_struct **vmas, int *nonblocking); |
1047 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1020 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
1048 | unsigned long start, int nr_pages, int write, int force, | 1021 | unsigned long start, unsigned long nr_pages, |
1049 | struct page **pages, struct vm_area_struct **vmas); | 1022 | int write, int force, struct page **pages, |
1023 | struct vm_area_struct **vmas); | ||
1050 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, | 1024 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, |
1051 | struct page **pages); | 1025 | struct page **pages); |
1052 | struct kvec; | 1026 | struct kvec; |
@@ -1359,6 +1333,24 @@ extern void free_bootmem_with_active_regions(int nid, | |||
1359 | unsigned long max_low_pfn); | 1333 | unsigned long max_low_pfn); |
1360 | extern void sparse_memory_present_with_active_regions(int nid); | 1334 | extern void sparse_memory_present_with_active_regions(int nid); |
1361 | 1335 | ||
1336 | #define MOVABLEMEM_MAP_MAX MAX_NUMNODES | ||
1337 | struct movablemem_entry { | ||
1338 | unsigned long start_pfn; /* start pfn of memory segment */ | ||
1339 | unsigned long end_pfn; /* end pfn of memory segment (exclusive) */ | ||
1340 | }; | ||
1341 | |||
1342 | struct movablemem_map { | ||
1343 | bool acpi; /* true if using SRAT info */ | ||
1344 | int nr_map; | ||
1345 | struct movablemem_entry map[MOVABLEMEM_MAP_MAX]; | ||
1346 | nodemask_t numa_nodes_hotplug; /* on which nodes we specify memory */ | ||
1347 | nodemask_t numa_nodes_kernel; /* on which nodes kernel resides in */ | ||
1348 | }; | ||
1349 | |||
1350 | extern void __init insert_movablemem_map(unsigned long start_pfn, | ||
1351 | unsigned long end_pfn); | ||
1352 | extern int __init movablemem_map_overlap(unsigned long start_pfn, | ||
1353 | unsigned long end_pfn); | ||
1362 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 1354 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
1363 | 1355 | ||
1364 | #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ | 1356 | #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ |
@@ -1395,6 +1387,9 @@ extern void setup_per_cpu_pageset(void); | |||
1395 | extern void zone_pcp_update(struct zone *zone); | 1387 | extern void zone_pcp_update(struct zone *zone); |
1396 | extern void zone_pcp_reset(struct zone *zone); | 1388 | extern void zone_pcp_reset(struct zone *zone); |
1397 | 1389 | ||
1390 | /* page_alloc.c */ | ||
1391 | extern int min_free_kbytes; | ||
1392 | |||
1398 | /* nommu.c */ | 1393 | /* nommu.c */ |
1399 | extern atomic_long_t mmap_pages_allocated; | 1394 | extern atomic_long_t mmap_pages_allocated; |
1400 | extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); | 1395 | extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); |
@@ -1472,13 +1467,24 @@ extern int install_special_mapping(struct mm_struct *mm, | |||
1472 | extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); | 1467 | extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); |
1473 | 1468 | ||
1474 | extern unsigned long mmap_region(struct file *file, unsigned long addr, | 1469 | extern unsigned long mmap_region(struct file *file, unsigned long addr, |
1475 | unsigned long len, unsigned long flags, | 1470 | unsigned long len, vm_flags_t vm_flags, unsigned long pgoff); |
1476 | vm_flags_t vm_flags, unsigned long pgoff); | 1471 | extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, |
1477 | extern unsigned long do_mmap_pgoff(struct file *, unsigned long, | 1472 | unsigned long len, unsigned long prot, unsigned long flags, |
1478 | unsigned long, unsigned long, | 1473 | unsigned long pgoff, unsigned long *populate); |
1479 | unsigned long, unsigned long); | ||
1480 | extern int do_munmap(struct mm_struct *, unsigned long, size_t); | 1474 | extern int do_munmap(struct mm_struct *, unsigned long, size_t); |
1481 | 1475 | ||
1476 | #ifdef CONFIG_MMU | ||
1477 | extern int __mm_populate(unsigned long addr, unsigned long len, | ||
1478 | int ignore_errors); | ||
1479 | static inline void mm_populate(unsigned long addr, unsigned long len) | ||
1480 | { | ||
1481 | /* Ignore errors */ | ||
1482 | (void) __mm_populate(addr, len, 1); | ||
1483 | } | ||
1484 | #else | ||
1485 | static inline void mm_populate(unsigned long addr, unsigned long len) {} | ||
1486 | #endif | ||
1487 | |||
1482 | /* These take the mm semaphore themselves */ | 1488 | /* These take the mm semaphore themselves */ |
1483 | extern unsigned long vm_brk(unsigned long, unsigned long); | 1489 | extern unsigned long vm_brk(unsigned long, unsigned long); |
1484 | extern int vm_munmap(unsigned long, size_t); | 1490 | extern int vm_munmap(unsigned long, size_t); |
@@ -1623,8 +1629,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
1623 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | 1629 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, |
1624 | unsigned long pfn); | 1630 | unsigned long pfn); |
1625 | 1631 | ||
1626 | struct page *follow_page(struct vm_area_struct *, unsigned long address, | 1632 | struct page *follow_page_mask(struct vm_area_struct *vma, |
1627 | unsigned int foll_flags); | 1633 | unsigned long address, unsigned int foll_flags, |
1634 | unsigned int *page_mask); | ||
1635 | |||
1636 | static inline struct page *follow_page(struct vm_area_struct *vma, | ||
1637 | unsigned long address, unsigned int foll_flags) | ||
1638 | { | ||
1639 | unsigned int unused_page_mask; | ||
1640 | return follow_page_mask(vma, address, foll_flags, &unused_page_mask); | ||
1641 | } | ||
1642 | |||
1628 | #define FOLL_WRITE 0x01 /* check pte is writable */ | 1643 | #define FOLL_WRITE 0x01 /* check pte is writable */ |
1629 | #define FOLL_TOUCH 0x02 /* mark page accessed */ | 1644 | #define FOLL_TOUCH 0x02 /* mark page accessed */ |
1630 | #define FOLL_GET 0x04 /* do get_page on page */ | 1645 | #define FOLL_GET 0x04 /* do get_page on page */ |
@@ -1636,6 +1651,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, | |||
1636 | #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ | 1651 | #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ |
1637 | #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ | 1652 | #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ |
1638 | #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ | 1653 | #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ |
1654 | #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ | ||
1639 | 1655 | ||
1640 | typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, | 1656 | typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, |
1641 | void *data); | 1657 | void *data); |
@@ -1707,7 +1723,11 @@ int vmemmap_populate_basepages(struct page *start_page, | |||
1707 | unsigned long pages, int node); | 1723 | unsigned long pages, int node); |
1708 | int vmemmap_populate(struct page *start_page, unsigned long pages, int node); | 1724 | int vmemmap_populate(struct page *start_page, unsigned long pages, int node); |
1709 | void vmemmap_populate_print_last(void); | 1725 | void vmemmap_populate_print_last(void); |
1710 | 1726 | #ifdef CONFIG_MEMORY_HOTPLUG | |
1727 | void vmemmap_free(struct page *memmap, unsigned long nr_pages); | ||
1728 | #endif | ||
1729 | void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, | ||
1730 | unsigned long size); | ||
1711 | 1731 | ||
1712 | enum mf_flags { | 1732 | enum mf_flags { |
1713 | MF_COUNT_INCREASED = 1 << 0, | 1733 | MF_COUNT_INCREASED = 1 << 0, |
@@ -1720,7 +1740,7 @@ extern int unpoison_memory(unsigned long pfn); | |||
1720 | extern int sysctl_memory_failure_early_kill; | 1740 | extern int sysctl_memory_failure_early_kill; |
1721 | extern int sysctl_memory_failure_recovery; | 1741 | extern int sysctl_memory_failure_recovery; |
1722 | extern void shake_page(struct page *p, int access); | 1742 | extern void shake_page(struct page *p, int access); |
1723 | extern atomic_long_t mce_bad_pages; | 1743 | extern atomic_long_t num_poisoned_pages; |
1724 | extern int soft_offline_page(struct page *page, int flags); | 1744 | extern int soft_offline_page(struct page *page, int flags); |
1725 | 1745 | ||
1726 | extern void dump_page(struct page *page); | 1746 | extern void dump_page(struct page *page); |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f8f5162a3571..ace9a5f01c64 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/cpumask.h> | 12 | #include <linux/cpumask.h> |
13 | #include <linux/page-debug-flags.h> | 13 | #include <linux/page-debug-flags.h> |
14 | #include <linux/uprobes.h> | 14 | #include <linux/uprobes.h> |
15 | #include <linux/page-flags-layout.h> | ||
15 | #include <asm/page.h> | 16 | #include <asm/page.h> |
16 | #include <asm/mmu.h> | 17 | #include <asm/mmu.h> |
17 | 18 | ||
@@ -173,7 +174,7 @@ struct page { | |||
173 | void *shadow; | 174 | void *shadow; |
174 | #endif | 175 | #endif |
175 | 176 | ||
176 | #ifdef CONFIG_NUMA_BALANCING | 177 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS |
177 | int _last_nid; | 178 | int _last_nid; |
178 | #endif | 179 | #endif |
179 | } | 180 | } |
@@ -414,9 +415,9 @@ struct mm_struct { | |||
414 | #endif | 415 | #endif |
415 | #ifdef CONFIG_NUMA_BALANCING | 416 | #ifdef CONFIG_NUMA_BALANCING |
416 | /* | 417 | /* |
417 | * numa_next_scan is the next time when the PTEs will me marked | 418 | * numa_next_scan is the next time that the PTEs will be marked |
418 | * pte_numa to gather statistics and migrate pages to new nodes | 419 | * pte_numa. NUMA hinting faults will gather statistics and migrate |
419 | * if necessary | 420 | * pages to new nodes if necessary. |
420 | */ | 421 | */ |
421 | unsigned long numa_next_scan; | 422 | unsigned long numa_next_scan; |
422 | 423 | ||
diff --git a/include/linux/mman.h b/include/linux/mman.h index 9aa863da287f..61c7a87e5d2b 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h | |||
@@ -79,6 +79,8 @@ calc_vm_flag_bits(unsigned long flags) | |||
79 | { | 79 | { |
80 | return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | | 80 | return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | |
81 | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | | 81 | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | |
82 | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); | 82 | ((flags & MAP_LOCKED) ? (VM_LOCKED | VM_POPULATE) : 0) | |
83 | (((flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE) ? | ||
84 | VM_POPULATE : 0); | ||
83 | } | 85 | } |
84 | #endif /* _LINUX_MMAN_H */ | 86 | #endif /* _LINUX_MMAN_H */ |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 73b64a38b984..ede274957e05 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -15,7 +15,7 @@ | |||
15 | #include <linux/seqlock.h> | 15 | #include <linux/seqlock.h> |
16 | #include <linux/nodemask.h> | 16 | #include <linux/nodemask.h> |
17 | #include <linux/pageblock-flags.h> | 17 | #include <linux/pageblock-flags.h> |
18 | #include <generated/bounds.h> | 18 | #include <linux/page-flags-layout.h> |
19 | #include <linux/atomic.h> | 19 | #include <linux/atomic.h> |
20 | #include <asm/page.h> | 20 | #include <asm/page.h> |
21 | 21 | ||
@@ -57,7 +57,9 @@ enum { | |||
57 | */ | 57 | */ |
58 | MIGRATE_CMA, | 58 | MIGRATE_CMA, |
59 | #endif | 59 | #endif |
60 | #ifdef CONFIG_MEMORY_ISOLATION | ||
60 | MIGRATE_ISOLATE, /* can't allocate from here */ | 61 | MIGRATE_ISOLATE, /* can't allocate from here */ |
62 | #endif | ||
61 | MIGRATE_TYPES | 63 | MIGRATE_TYPES |
62 | }; | 64 | }; |
63 | 65 | ||
@@ -308,24 +310,6 @@ enum zone_type { | |||
308 | 310 | ||
309 | #ifndef __GENERATING_BOUNDS_H | 311 | #ifndef __GENERATING_BOUNDS_H |
310 | 312 | ||
311 | /* | ||
312 | * When a memory allocation must conform to specific limitations (such | ||
313 | * as being suitable for DMA) the caller will pass in hints to the | ||
314 | * allocator in the gfp_mask, in the zone modifier bits. These bits | ||
315 | * are used to select a priority ordered list of memory zones which | ||
316 | * match the requested limits. See gfp_zone() in include/linux/gfp.h | ||
317 | */ | ||
318 | |||
319 | #if MAX_NR_ZONES < 2 | ||
320 | #define ZONES_SHIFT 0 | ||
321 | #elif MAX_NR_ZONES <= 2 | ||
322 | #define ZONES_SHIFT 1 | ||
323 | #elif MAX_NR_ZONES <= 4 | ||
324 | #define ZONES_SHIFT 2 | ||
325 | #else | ||
326 | #error ZONES_SHIFT -- too many zones configured adjust calculation | ||
327 | #endif | ||
328 | |||
329 | struct zone { | 313 | struct zone { |
330 | /* Fields commonly accessed by the page allocator */ | 314 | /* Fields commonly accessed by the page allocator */ |
331 | 315 | ||
@@ -543,6 +527,26 @@ static inline int zone_is_oom_locked(const struct zone *zone) | |||
543 | return test_bit(ZONE_OOM_LOCKED, &zone->flags); | 527 | return test_bit(ZONE_OOM_LOCKED, &zone->flags); |
544 | } | 528 | } |
545 | 529 | ||
530 | static inline unsigned zone_end_pfn(const struct zone *zone) | ||
531 | { | ||
532 | return zone->zone_start_pfn + zone->spanned_pages; | ||
533 | } | ||
534 | |||
535 | static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) | ||
536 | { | ||
537 | return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); | ||
538 | } | ||
539 | |||
540 | static inline bool zone_is_initialized(struct zone *zone) | ||
541 | { | ||
542 | return !!zone->wait_table; | ||
543 | } | ||
544 | |||
545 | static inline bool zone_is_empty(struct zone *zone) | ||
546 | { | ||
547 | return zone->spanned_pages == 0; | ||
548 | } | ||
549 | |||
546 | /* | 550 | /* |
547 | * The "priority" of VM scanning is how much of the queues we will scan in one | 551 | * The "priority" of VM scanning is how much of the queues we will scan in one |
548 | * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the | 552 | * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the |
@@ -752,11 +756,17 @@ typedef struct pglist_data { | |||
752 | #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) | 756 | #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) |
753 | 757 | ||
754 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) | 758 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) |
759 | #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) | ||
755 | 760 | ||
756 | #define node_end_pfn(nid) ({\ | 761 | static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) |
757 | pg_data_t *__pgdat = NODE_DATA(nid);\ | 762 | { |
758 | __pgdat->node_start_pfn + __pgdat->node_spanned_pages;\ | 763 | return pgdat->node_start_pfn + pgdat->node_spanned_pages; |
759 | }) | 764 | } |
765 | |||
766 | static inline bool pgdat_is_empty(pg_data_t *pgdat) | ||
767 | { | ||
768 | return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; | ||
769 | } | ||
760 | 770 | ||
761 | #include <linux/memory_hotplug.h> | 771 | #include <linux/memory_hotplug.h> |
762 | 772 | ||
@@ -1053,8 +1063,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn) | |||
1053 | * PA_SECTION_SHIFT physical address to/from section number | 1063 | * PA_SECTION_SHIFT physical address to/from section number |
1054 | * PFN_SECTION_SHIFT pfn to/from section number | 1064 | * PFN_SECTION_SHIFT pfn to/from section number |
1055 | */ | 1065 | */ |
1056 | #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) | ||
1057 | |||
1058 | #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) | 1066 | #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) |
1059 | #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) | 1067 | #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) |
1060 | 1068 | ||
diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h new file mode 100644 index 000000000000..93506a114034 --- /dev/null +++ b/include/linux/page-flags-layout.h | |||
@@ -0,0 +1,88 @@ | |||
1 | #ifndef PAGE_FLAGS_LAYOUT_H | ||
2 | #define PAGE_FLAGS_LAYOUT_H | ||
3 | |||
4 | #include <linux/numa.h> | ||
5 | #include <generated/bounds.h> | ||
6 | |||
7 | /* | ||
8 | * When a memory allocation must conform to specific limitations (such | ||
9 | * as being suitable for DMA) the caller will pass in hints to the | ||
10 | * allocator in the gfp_mask, in the zone modifier bits. These bits | ||
11 | * are used to select a priority ordered list of memory zones which | ||
12 | * match the requested limits. See gfp_zone() in include/linux/gfp.h | ||
13 | */ | ||
14 | #if MAX_NR_ZONES < 2 | ||
15 | #define ZONES_SHIFT 0 | ||
16 | #elif MAX_NR_ZONES <= 2 | ||
17 | #define ZONES_SHIFT 1 | ||
18 | #elif MAX_NR_ZONES <= 4 | ||
19 | #define ZONES_SHIFT 2 | ||
20 | #else | ||
21 | #error ZONES_SHIFT -- too many zones configured adjust calculation | ||
22 | #endif | ||
23 | |||
24 | #ifdef CONFIG_SPARSEMEM | ||
25 | #include <asm/sparsemem.h> | ||
26 | |||
27 | /* SECTIONS_SHIFT #bits space required to store a section # */ | ||
28 | #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) | ||
29 | |||
30 | #endif /* CONFIG_SPARSEMEM */ | ||
31 | |||
32 | /* | ||
33 | * page->flags layout: | ||
34 | * | ||
35 | * There are five possibilities for how page->flags get laid out. The first | ||
36 | * pair is for the normal case without sparsemem. The second pair is for | ||
37 | * sparsemem when there is plenty of space for node and section information. | ||
38 | * The last is when there is insufficient space in page->flags and a separate | ||
39 | * lookup is necessary. | ||
40 | * | ||
41 | * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | | ||
42 | * " plus space for last_nid: | NODE | ZONE | LAST_NID ... | FLAGS | | ||
43 | * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | | ||
44 | * " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS | | ||
45 | * classic sparse no space for node: | SECTION | ZONE | ... | FLAGS | | ||
46 | */ | ||
47 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | ||
48 | #define SECTIONS_WIDTH SECTIONS_SHIFT | ||
49 | #else | ||
50 | #define SECTIONS_WIDTH 0 | ||
51 | #endif | ||
52 | |||
53 | #define ZONES_WIDTH ZONES_SHIFT | ||
54 | |||
55 | #if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS | ||
56 | #define NODES_WIDTH NODES_SHIFT | ||
57 | #else | ||
58 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | ||
59 | #error "Vmemmap: No space for nodes field in page flags" | ||
60 | #endif | ||
61 | #define NODES_WIDTH 0 | ||
62 | #endif | ||
63 | |||
64 | #ifdef CONFIG_NUMA_BALANCING | ||
65 | #define LAST_NID_SHIFT NODES_SHIFT | ||
66 | #else | ||
67 | #define LAST_NID_SHIFT 0 | ||
68 | #endif | ||
69 | |||
70 | #if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS | ||
71 | #define LAST_NID_WIDTH LAST_NID_SHIFT | ||
72 | #else | ||
73 | #define LAST_NID_WIDTH 0 | ||
74 | #endif | ||
75 | |||
76 | /* | ||
77 | * We are going to use the flags for the page to node mapping if it's in | ||
78 | * there. This includes the case where there is no node, so it is implicit. | ||
79 | */ | ||
80 | #if !(NODES_WIDTH > 0 || NODES_SHIFT == 0) | ||
81 | #define NODE_NOT_IN_PAGE_FLAGS | ||
82 | #endif | ||
83 | |||
84 | #if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0 | ||
85 | #define LAST_NID_NOT_IN_PAGE_FLAGS | ||
86 | #endif | ||
87 | |||
88 | #endif /* PAGE_FLAGS_LAYOUT_H */ | ||
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index a92061e08d48..3fff8e774067 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h | |||
@@ -1,6 +1,25 @@ | |||
1 | #ifndef __LINUX_PAGEISOLATION_H | 1 | #ifndef __LINUX_PAGEISOLATION_H |
2 | #define __LINUX_PAGEISOLATION_H | 2 | #define __LINUX_PAGEISOLATION_H |
3 | 3 | ||
4 | #ifdef CONFIG_MEMORY_ISOLATION | ||
5 | static inline bool is_migrate_isolate_page(struct page *page) | ||
6 | { | ||
7 | return get_pageblock_migratetype(page) == MIGRATE_ISOLATE; | ||
8 | } | ||
9 | static inline bool is_migrate_isolate(int migratetype) | ||
10 | { | ||
11 | return migratetype == MIGRATE_ISOLATE; | ||
12 | } | ||
13 | #else | ||
14 | static inline bool is_migrate_isolate_page(struct page *page) | ||
15 | { | ||
16 | return false; | ||
17 | } | ||
18 | static inline bool is_migrate_isolate(int migratetype) | ||
19 | { | ||
20 | return false; | ||
21 | } | ||
22 | #endif | ||
4 | 23 | ||
5 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | 24 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count, |
6 | bool skip_hwpoisoned_pages); | 25 | bool skip_hwpoisoned_pages); |
diff --git a/include/linux/pm.h b/include/linux/pm.h index 97bcf23e045a..e5d7230332a4 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h | |||
@@ -537,6 +537,7 @@ struct dev_pm_info { | |||
537 | unsigned int irq_safe:1; | 537 | unsigned int irq_safe:1; |
538 | unsigned int use_autosuspend:1; | 538 | unsigned int use_autosuspend:1; |
539 | unsigned int timer_autosuspends:1; | 539 | unsigned int timer_autosuspends:1; |
540 | unsigned int memalloc_noio:1; | ||
540 | enum rpm_request request; | 541 | enum rpm_request request; |
541 | enum rpm_status runtime_status; | 542 | enum rpm_status runtime_status; |
542 | int runtime_error; | 543 | int runtime_error; |
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index c785c215abfc..7d7e09efff9b 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h | |||
@@ -47,6 +47,7 @@ extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay); | |||
47 | extern unsigned long pm_runtime_autosuspend_expiration(struct device *dev); | 47 | extern unsigned long pm_runtime_autosuspend_expiration(struct device *dev); |
48 | extern void pm_runtime_update_max_time_suspended(struct device *dev, | 48 | extern void pm_runtime_update_max_time_suspended(struct device *dev, |
49 | s64 delta_ns); | 49 | s64 delta_ns); |
50 | extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable); | ||
50 | 51 | ||
51 | static inline bool pm_children_suspended(struct device *dev) | 52 | static inline bool pm_children_suspended(struct device *dev) |
52 | { | 53 | { |
@@ -156,6 +157,8 @@ static inline void pm_runtime_set_autosuspend_delay(struct device *dev, | |||
156 | int delay) {} | 157 | int delay) {} |
157 | static inline unsigned long pm_runtime_autosuspend_expiration( | 158 | static inline unsigned long pm_runtime_autosuspend_expiration( |
158 | struct device *dev) { return 0; } | 159 | struct device *dev) { return 0; } |
160 | static inline void pm_runtime_set_memalloc_noio(struct device *dev, | ||
161 | bool enable){} | ||
159 | 162 | ||
160 | #endif /* !CONFIG_PM_RUNTIME */ | 163 | #endif /* !CONFIG_PM_RUNTIME */ |
161 | 164 | ||
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c20635c527a9..6dacb93a6d94 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h | |||
@@ -123,7 +123,7 @@ static inline void anon_vma_lock_write(struct anon_vma *anon_vma) | |||
123 | down_write(&anon_vma->root->rwsem); | 123 | down_write(&anon_vma->root->rwsem); |
124 | } | 124 | } |
125 | 125 | ||
126 | static inline void anon_vma_unlock(struct anon_vma *anon_vma) | 126 | static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) |
127 | { | 127 | { |
128 | up_write(&anon_vma->root->rwsem); | 128 | up_write(&anon_vma->root->rwsem); |
129 | } | 129 | } |
diff --git a/include/linux/sched.h b/include/linux/sched.h index e4112aad2964..c2182b53dace 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -51,6 +51,7 @@ struct sched_param { | |||
51 | #include <linux/cred.h> | 51 | #include <linux/cred.h> |
52 | #include <linux/llist.h> | 52 | #include <linux/llist.h> |
53 | #include <linux/uidgid.h> | 53 | #include <linux/uidgid.h> |
54 | #include <linux/gfp.h> | ||
54 | 55 | ||
55 | #include <asm/processor.h> | 56 | #include <asm/processor.h> |
56 | 57 | ||
@@ -1791,6 +1792,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, | |||
1791 | #define PF_FROZEN 0x00010000 /* frozen for system suspend */ | 1792 | #define PF_FROZEN 0x00010000 /* frozen for system suspend */ |
1792 | #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ | 1793 | #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ |
1793 | #define PF_KSWAPD 0x00040000 /* I am kswapd */ | 1794 | #define PF_KSWAPD 0x00040000 /* I am kswapd */ |
1795 | #define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */ | ||
1794 | #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ | 1796 | #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ |
1795 | #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ | 1797 | #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ |
1796 | #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ | 1798 | #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ |
@@ -1828,6 +1830,26 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, | |||
1828 | #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) | 1830 | #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) |
1829 | #define used_math() tsk_used_math(current) | 1831 | #define used_math() tsk_used_math(current) |
1830 | 1832 | ||
1833 | /* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ | ||
1834 | static inline gfp_t memalloc_noio_flags(gfp_t flags) | ||
1835 | { | ||
1836 | if (unlikely(current->flags & PF_MEMALLOC_NOIO)) | ||
1837 | flags &= ~__GFP_IO; | ||
1838 | return flags; | ||
1839 | } | ||
1840 | |||
1841 | static inline unsigned int memalloc_noio_save(void) | ||
1842 | { | ||
1843 | unsigned int flags = current->flags & PF_MEMALLOC_NOIO; | ||
1844 | current->flags |= PF_MEMALLOC_NOIO; | ||
1845 | return flags; | ||
1846 | } | ||
1847 | |||
1848 | static inline void memalloc_noio_restore(unsigned int flags) | ||
1849 | { | ||
1850 | current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; | ||
1851 | } | ||
1852 | |||
1831 | /* | 1853 | /* |
1832 | * task->jobctl flags | 1854 | * task->jobctl flags |
1833 | */ | 1855 | */ |
diff --git a/include/linux/swap.h b/include/linux/swap.h index 68df9c17fbbb..2818a123f3ea 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -8,7 +8,7 @@ | |||
8 | #include <linux/memcontrol.h> | 8 | #include <linux/memcontrol.h> |
9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
10 | #include <linux/node.h> | 10 | #include <linux/node.h> |
11 | 11 | #include <linux/fs.h> | |
12 | #include <linux/atomic.h> | 12 | #include <linux/atomic.h> |
13 | #include <asm/page.h> | 13 | #include <asm/page.h> |
14 | 14 | ||
@@ -156,7 +156,7 @@ enum { | |||
156 | SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ | 156 | SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ |
157 | }; | 157 | }; |
158 | 158 | ||
159 | #define SWAP_CLUSTER_MAX 32 | 159 | #define SWAP_CLUSTER_MAX 32UL |
160 | #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX | 160 | #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX |
161 | 161 | ||
162 | /* | 162 | /* |
@@ -202,6 +202,18 @@ struct swap_info_struct { | |||
202 | unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ | 202 | unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ |
203 | atomic_t frontswap_pages; /* frontswap pages in-use counter */ | 203 | atomic_t frontswap_pages; /* frontswap pages in-use counter */ |
204 | #endif | 204 | #endif |
205 | spinlock_t lock; /* | ||
206 | * Protects the map-scan related fields: | ||
207 | * swap_map, lowest_bit, highest_bit, | ||
208 | * inuse_pages, cluster_next, | ||
209 | * cluster_nr, lowest_alloc and | ||
210 | * highest_alloc. Other fields are only | ||
211 | * changed at swapon/swapoff, so they are | ||
212 | * protected by swap_lock. Changing | ||
213 | * flags requires holding both this lock and | ||
214 | * swap_lock. When both locks are needed, | ||
215 | * take swap_lock first. | ||
216 | */ | ||
205 | }; | 217 | }; |
206 | 218 | ||
207 | struct swap_list_t { | 219 | struct swap_list_t { |
@@ -209,15 +221,12 @@ struct swap_list_t { | |||
209 | int next; /* swapfile to be used next */ | 221 | int next; /* swapfile to be used next */ |
210 | }; | 222 | }; |
211 | 223 | ||
212 | /* Swap 50% full? Release swapcache more aggressively.. */ | ||
213 | #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) | ||
214 | |||
215 | /* linux/mm/page_alloc.c */ | 224 | /* linux/mm/page_alloc.c */ |
216 | extern unsigned long totalram_pages; | 225 | extern unsigned long totalram_pages; |
217 | extern unsigned long totalreserve_pages; | 226 | extern unsigned long totalreserve_pages; |
218 | extern unsigned long dirty_balance_reserve; | 227 | extern unsigned long dirty_balance_reserve; |
219 | extern unsigned int nr_free_buffer_pages(void); | 228 | extern unsigned long nr_free_buffer_pages(void); |
220 | extern unsigned int nr_free_pagecache_pages(void); | 229 | extern unsigned long nr_free_pagecache_pages(void); |
221 | 230 | ||
222 | /* Definition of global_page_state not available yet */ | 231 | /* Definition of global_page_state not available yet */ |
223 | #define nr_free_pages() global_page_state(NR_FREE_PAGES) | 232 | #define nr_free_pages() global_page_state(NR_FREE_PAGES) |
@@ -266,7 +275,7 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
266 | extern unsigned long shrink_all_memory(unsigned long nr_pages); | 275 | extern unsigned long shrink_all_memory(unsigned long nr_pages); |
267 | extern int vm_swappiness; | 276 | extern int vm_swappiness; |
268 | extern int remove_mapping(struct address_space *mapping, struct page *page); | 277 | extern int remove_mapping(struct address_space *mapping, struct page *page); |
269 | extern long vm_total_pages; | 278 | extern unsigned long vm_total_pages; |
270 | 279 | ||
271 | #ifdef CONFIG_NUMA | 280 | #ifdef CONFIG_NUMA |
272 | extern int zone_reclaim_mode; | 281 | extern int zone_reclaim_mode; |
@@ -330,8 +339,9 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *, | |||
330 | sector_t *); | 339 | sector_t *); |
331 | 340 | ||
332 | /* linux/mm/swap_state.c */ | 341 | /* linux/mm/swap_state.c */ |
333 | extern struct address_space swapper_space; | 342 | extern struct address_space swapper_spaces[]; |
334 | #define total_swapcache_pages swapper_space.nrpages | 343 | #define swap_address_space(entry) (&swapper_spaces[swp_type(entry)]) |
344 | extern unsigned long total_swapcache_pages(void); | ||
335 | extern void show_swap_cache_info(void); | 345 | extern void show_swap_cache_info(void); |
336 | extern int add_to_swap(struct page *); | 346 | extern int add_to_swap(struct page *); |
337 | extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); | 347 | extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); |
@@ -346,8 +356,20 @@ extern struct page *swapin_readahead(swp_entry_t, gfp_t, | |||
346 | struct vm_area_struct *vma, unsigned long addr); | 356 | struct vm_area_struct *vma, unsigned long addr); |
347 | 357 | ||
348 | /* linux/mm/swapfile.c */ | 358 | /* linux/mm/swapfile.c */ |
349 | extern long nr_swap_pages; | 359 | extern atomic_long_t nr_swap_pages; |
350 | extern long total_swap_pages; | 360 | extern long total_swap_pages; |
361 | |||
362 | /* Swap 50% full? Release swapcache more aggressively.. */ | ||
363 | static inline bool vm_swap_full(void) | ||
364 | { | ||
365 | return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages; | ||
366 | } | ||
367 | |||
368 | static inline long get_nr_swap_pages(void) | ||
369 | { | ||
370 | return atomic_long_read(&nr_swap_pages); | ||
371 | } | ||
372 | |||
351 | extern void si_swapinfo(struct sysinfo *); | 373 | extern void si_swapinfo(struct sysinfo *); |
352 | extern swp_entry_t get_swap_page(void); | 374 | extern swp_entry_t get_swap_page(void); |
353 | extern swp_entry_t get_swap_page_of_type(int); | 375 | extern swp_entry_t get_swap_page_of_type(int); |
@@ -380,9 +402,10 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
380 | 402 | ||
381 | #else /* CONFIG_SWAP */ | 403 | #else /* CONFIG_SWAP */ |
382 | 404 | ||
383 | #define nr_swap_pages 0L | 405 | #define get_nr_swap_pages() 0L |
384 | #define total_swap_pages 0L | 406 | #define total_swap_pages 0L |
385 | #define total_swapcache_pages 0UL | 407 | #define total_swapcache_pages() 0UL |
408 | #define vm_swap_full() 0 | ||
386 | 409 | ||
387 | #define si_swapinfo(val) \ | 410 | #define si_swapinfo(val) \ |
388 | do { (val)->freeswap = (val)->totalswap = 0; } while (0) | 411 | do { (val)->freeswap = (val)->totalswap = 0; } while (0) |
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index fce0a2799d43..bd6cf61142be 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h | |||
@@ -36,7 +36,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
36 | #endif | 36 | #endif |
37 | PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, | 37 | PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, |
38 | KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, | 38 | KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, |
39 | KSWAPD_SKIP_CONGESTION_WAIT, | ||
40 | PAGEOUTRUN, ALLOCSTALL, PGROTATED, | 39 | PAGEOUTRUN, ALLOCSTALL, PGROTATED, |
41 | #ifdef CONFIG_NUMA_BALANCING | 40 | #ifdef CONFIG_NUMA_BALANCING |
42 | NUMA_PTE_UPDATES, | 41 | NUMA_PTE_UPDATES, |
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index a13291f7da88..5fd71a7d0dfd 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h | |||
@@ -85,7 +85,7 @@ static inline void vm_events_fold_cpu(int cpu) | |||
85 | #define count_vm_numa_events(x, y) count_vm_events(x, y) | 85 | #define count_vm_numa_events(x, y) count_vm_events(x, y) |
86 | #else | 86 | #else |
87 | #define count_vm_numa_event(x) do {} while (0) | 87 | #define count_vm_numa_event(x) do {} while (0) |
88 | #define count_vm_numa_events(x, y) do {} while (0) | 88 | #define count_vm_numa_events(x, y) do { (void)(y); } while (0) |
89 | #endif /* CONFIG_NUMA_BALANCING */ | 89 | #endif /* CONFIG_NUMA_BALANCING */ |
90 | 90 | ||
91 | #define __count_zone_vm_events(item, zone, delta) \ | 91 | #define __count_zone_vm_events(item, zone, delta) \ |
@@ -967,11 +967,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, | |||
967 | unsigned long flags; | 967 | unsigned long flags; |
968 | unsigned long prot; | 968 | unsigned long prot; |
969 | int acc_mode; | 969 | int acc_mode; |
970 | unsigned long user_addr; | ||
971 | struct ipc_namespace *ns; | 970 | struct ipc_namespace *ns; |
972 | struct shm_file_data *sfd; | 971 | struct shm_file_data *sfd; |
973 | struct path path; | 972 | struct path path; |
974 | fmode_t f_mode; | 973 | fmode_t f_mode; |
974 | unsigned long populate = 0; | ||
975 | 975 | ||
976 | err = -EINVAL; | 976 | err = -EINVAL; |
977 | if (shmid < 0) | 977 | if (shmid < 0) |
@@ -1070,13 +1070,15 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, | |||
1070 | goto invalid; | 1070 | goto invalid; |
1071 | } | 1071 | } |
1072 | 1072 | ||
1073 | user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0); | 1073 | addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); |
1074 | *raddr = user_addr; | 1074 | *raddr = addr; |
1075 | err = 0; | 1075 | err = 0; |
1076 | if (IS_ERR_VALUE(user_addr)) | 1076 | if (IS_ERR_VALUE(addr)) |
1077 | err = (long)user_addr; | 1077 | err = (long)addr; |
1078 | invalid: | 1078 | invalid: |
1079 | up_write(&current->mm->mmap_sem); | 1079 | up_write(&current->mm->mmap_sem); |
1080 | if (populate) | ||
1081 | mm_populate(addr, populate); | ||
1080 | 1082 | ||
1081 | out_fput: | 1083 | out_fput: |
1082 | fput(file); | 1084 | fput(file); |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a673a3b0c6b..053dfd7692d1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
1132 | */ | 1132 | */ |
1133 | static int select_fallback_rq(int cpu, struct task_struct *p) | 1133 | static int select_fallback_rq(int cpu, struct task_struct *p) |
1134 | { | 1134 | { |
1135 | const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); | 1135 | int nid = cpu_to_node(cpu); |
1136 | const struct cpumask *nodemask = NULL; | ||
1136 | enum { cpuset, possible, fail } state = cpuset; | 1137 | enum { cpuset, possible, fail } state = cpuset; |
1137 | int dest_cpu; | 1138 | int dest_cpu; |
1138 | 1139 | ||
1139 | /* Look for allowed, online CPU in same node. */ | 1140 | /* |
1140 | for_each_cpu(dest_cpu, nodemask) { | 1141 | * If the node that the cpu is on has been offlined, cpu_to_node() |
1141 | if (!cpu_online(dest_cpu)) | 1142 | * will return -1. There is no cpu on the node, and we should |
1142 | continue; | 1143 | * select the cpu on the other node. |
1143 | if (!cpu_active(dest_cpu)) | 1144 | */ |
1144 | continue; | 1145 | if (nid != -1) { |
1145 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) | 1146 | nodemask = cpumask_of_node(nid); |
1146 | return dest_cpu; | 1147 | |
1148 | /* Look for allowed, online CPU in same node. */ | ||
1149 | for_each_cpu(dest_cpu, nodemask) { | ||
1150 | if (!cpu_online(dest_cpu)) | ||
1151 | continue; | ||
1152 | if (!cpu_active(dest_cpu)) | ||
1153 | continue; | ||
1154 | if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) | ||
1155 | return dest_cpu; | ||
1156 | } | ||
1147 | } | 1157 | } |
1148 | 1158 | ||
1149 | for (;;) { | 1159 | for (;;) { |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 467d8b923fcd..95e9e55602a8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -105,7 +105,6 @@ extern char core_pattern[]; | |||
105 | extern unsigned int core_pipe_limit; | 105 | extern unsigned int core_pipe_limit; |
106 | #endif | 106 | #endif |
107 | extern int pid_max; | 107 | extern int pid_max; |
108 | extern int min_free_kbytes; | ||
109 | extern int pid_max_min, pid_max_max; | 108 | extern int pid_max_min, pid_max_max; |
110 | extern int sysctl_drop_caches; | 109 | extern int sysctl_drop_caches; |
111 | extern int percpu_pagelist_fraction; | 110 | extern int percpu_pagelist_fraction; |
diff --git a/mm/Kconfig b/mm/Kconfig index 0b23db9a8791..2c7aea7106f9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -162,10 +162,16 @@ config MOVABLE_NODE | |||
162 | Say Y here if you want to hotplug a whole node. | 162 | Say Y here if you want to hotplug a whole node. |
163 | Say N here if you want kernel to use memory on all nodes evenly. | 163 | Say N here if you want kernel to use memory on all nodes evenly. |
164 | 164 | ||
165 | # | ||
166 | # Only set this on architectures that have completely implemented the memory | ||
167 | # hotplug feature. If you are not sure, don't touch it. | ||
168 | # | ||
169 | config HAVE_BOOTMEM_INFO_NODE | ||
170 | def_bool n | ||
171 | |||
165 | # eventually, we can have this option just 'select SPARSEMEM' | 172 | # eventually, we can have this option just 'select SPARSEMEM' |
166 | config MEMORY_HOTPLUG | 173 | config MEMORY_HOTPLUG |
167 | bool "Allow for memory hot-add" | 174 | bool "Allow for memory hot-add" |
168 | select MEMORY_ISOLATION | ||
169 | depends on SPARSEMEM || X86_64_ACPI_NUMA | 175 | depends on SPARSEMEM || X86_64_ACPI_NUMA |
170 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG | 176 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG |
171 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) | 177 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) |
@@ -176,6 +182,8 @@ config MEMORY_HOTPLUG_SPARSE | |||
176 | 182 | ||
177 | config MEMORY_HOTREMOVE | 183 | config MEMORY_HOTREMOVE |
178 | bool "Allow for memory hot remove" | 184 | bool "Allow for memory hot remove" |
185 | select MEMORY_ISOLATION | ||
186 | select HAVE_BOOTMEM_INFO_NODE if X86_64 | ||
179 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE | 187 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE |
180 | depends on MIGRATION | 188 | depends on MIGRATION |
181 | 189 | ||
diff --git a/mm/compaction.c b/mm/compaction.c index c62bd063d766..05ccb4cc0bdb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/sysctl.h> | 15 | #include <linux/sysctl.h> |
16 | #include <linux/sysfs.h> | 16 | #include <linux/sysfs.h> |
17 | #include <linux/balloon_compaction.h> | 17 | #include <linux/balloon_compaction.h> |
18 | #include <linux/page-isolation.h> | ||
18 | #include "internal.h" | 19 | #include "internal.h" |
19 | 20 | ||
20 | #ifdef CONFIG_COMPACTION | 21 | #ifdef CONFIG_COMPACTION |
@@ -85,7 +86,7 @@ static inline bool isolation_suitable(struct compact_control *cc, | |||
85 | static void __reset_isolation_suitable(struct zone *zone) | 86 | static void __reset_isolation_suitable(struct zone *zone) |
86 | { | 87 | { |
87 | unsigned long start_pfn = zone->zone_start_pfn; | 88 | unsigned long start_pfn = zone->zone_start_pfn; |
88 | unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 89 | unsigned long end_pfn = zone_end_pfn(zone); |
89 | unsigned long pfn; | 90 | unsigned long pfn; |
90 | 91 | ||
91 | zone->compact_cached_migrate_pfn = start_pfn; | 92 | zone->compact_cached_migrate_pfn = start_pfn; |
@@ -215,7 +216,10 @@ static bool suitable_migration_target(struct page *page) | |||
215 | int migratetype = get_pageblock_migratetype(page); | 216 | int migratetype = get_pageblock_migratetype(page); |
216 | 217 | ||
217 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | 218 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ |
218 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | 219 | if (migratetype == MIGRATE_RESERVE) |
220 | return false; | ||
221 | |||
222 | if (is_migrate_isolate(migratetype)) | ||
219 | return false; | 223 | return false; |
220 | 224 | ||
221 | /* If the page is a large free page, then allow migration */ | 225 | /* If the page is a large free page, then allow migration */ |
@@ -611,8 +615,7 @@ check_compact_cluster: | |||
611 | continue; | 615 | continue; |
612 | 616 | ||
613 | next_pageblock: | 617 | next_pageblock: |
614 | low_pfn += pageblock_nr_pages; | 618 | low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; |
615 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | ||
616 | last_pageblock_nr = pageblock_nr; | 619 | last_pageblock_nr = pageblock_nr; |
617 | } | 620 | } |
618 | 621 | ||
@@ -644,7 +647,7 @@ static void isolate_freepages(struct zone *zone, | |||
644 | struct compact_control *cc) | 647 | struct compact_control *cc) |
645 | { | 648 | { |
646 | struct page *page; | 649 | struct page *page; |
647 | unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; | 650 | unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn; |
648 | int nr_freepages = cc->nr_freepages; | 651 | int nr_freepages = cc->nr_freepages; |
649 | struct list_head *freelist = &cc->freepages; | 652 | struct list_head *freelist = &cc->freepages; |
650 | 653 | ||
@@ -663,7 +666,7 @@ static void isolate_freepages(struct zone *zone, | |||
663 | */ | 666 | */ |
664 | high_pfn = min(low_pfn, pfn); | 667 | high_pfn = min(low_pfn, pfn); |
665 | 668 | ||
666 | zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 669 | z_end_pfn = zone_end_pfn(zone); |
667 | 670 | ||
668 | /* | 671 | /* |
669 | * Isolate free pages until enough are available to migrate the | 672 | * Isolate free pages until enough are available to migrate the |
@@ -706,7 +709,7 @@ static void isolate_freepages(struct zone *zone, | |||
706 | * only scans within a pageblock | 709 | * only scans within a pageblock |
707 | */ | 710 | */ |
708 | end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | 711 | end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); |
709 | end_pfn = min(end_pfn, zone_end_pfn); | 712 | end_pfn = min(end_pfn, z_end_pfn); |
710 | isolated = isolate_freepages_block(cc, pfn, end_pfn, | 713 | isolated = isolate_freepages_block(cc, pfn, end_pfn, |
711 | freelist, false); | 714 | freelist, false); |
712 | nr_freepages += isolated; | 715 | nr_freepages += isolated; |
@@ -795,7 +798,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
795 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); | 798 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); |
796 | 799 | ||
797 | /* Only scan within a pageblock boundary */ | 800 | /* Only scan within a pageblock boundary */ |
798 | end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); | 801 | end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); |
799 | 802 | ||
800 | /* Do not cross the free scanner or scan within a memory hole */ | 803 | /* Do not cross the free scanner or scan within a memory hole */ |
801 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { | 804 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { |
@@ -920,7 +923,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
920 | { | 923 | { |
921 | int ret; | 924 | int ret; |
922 | unsigned long start_pfn = zone->zone_start_pfn; | 925 | unsigned long start_pfn = zone->zone_start_pfn; |
923 | unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 926 | unsigned long end_pfn = zone_end_pfn(zone); |
924 | 927 | ||
925 | ret = compaction_suitable(zone, cc->order); | 928 | ret = compaction_suitable(zone, cc->order); |
926 | switch (ret) { | 929 | switch (ret) { |
@@ -977,7 +980,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
977 | 980 | ||
978 | nr_migrate = cc->nr_migratepages; | 981 | nr_migrate = cc->nr_migratepages; |
979 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 982 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
980 | (unsigned long)cc, false, | 983 | (unsigned long)cc, |
981 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, | 984 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, |
982 | MR_COMPACTION); | 985 | MR_COMPACTION); |
983 | update_nr_listpages(cc); | 986 | update_nr_listpages(cc); |
@@ -1086,7 +1089,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1086 | 1089 | ||
1087 | 1090 | ||
1088 | /* Compact all zones within a node */ | 1091 | /* Compact all zones within a node */ |
1089 | static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | 1092 | static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) |
1090 | { | 1093 | { |
1091 | int zoneid; | 1094 | int zoneid; |
1092 | struct zone *zone; | 1095 | struct zone *zone; |
@@ -1119,28 +1122,26 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
1119 | VM_BUG_ON(!list_empty(&cc->freepages)); | 1122 | VM_BUG_ON(!list_empty(&cc->freepages)); |
1120 | VM_BUG_ON(!list_empty(&cc->migratepages)); | 1123 | VM_BUG_ON(!list_empty(&cc->migratepages)); |
1121 | } | 1124 | } |
1122 | |||
1123 | return 0; | ||
1124 | } | 1125 | } |
1125 | 1126 | ||
1126 | int compact_pgdat(pg_data_t *pgdat, int order) | 1127 | void compact_pgdat(pg_data_t *pgdat, int order) |
1127 | { | 1128 | { |
1128 | struct compact_control cc = { | 1129 | struct compact_control cc = { |
1129 | .order = order, | 1130 | .order = order, |
1130 | .sync = false, | 1131 | .sync = false, |
1131 | }; | 1132 | }; |
1132 | 1133 | ||
1133 | return __compact_pgdat(pgdat, &cc); | 1134 | __compact_pgdat(pgdat, &cc); |
1134 | } | 1135 | } |
1135 | 1136 | ||
1136 | static int compact_node(int nid) | 1137 | static void compact_node(int nid) |
1137 | { | 1138 | { |
1138 | struct compact_control cc = { | 1139 | struct compact_control cc = { |
1139 | .order = -1, | 1140 | .order = -1, |
1140 | .sync = true, | 1141 | .sync = true, |
1141 | }; | 1142 | }; |
1142 | 1143 | ||
1143 | return __compact_pgdat(NODE_DATA(nid), &cc); | 1144 | __compact_pgdat(NODE_DATA(nid), &cc); |
1144 | } | 1145 | } |
1145 | 1146 | ||
1146 | /* Compact all nodes in the system */ | 1147 | /* Compact all nodes in the system */ |
diff --git a/mm/fadvise.c b/mm/fadvise.c index a47f0f50c89f..909ec558625c 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/fadvise.h> | 17 | #include <linux/fadvise.h> |
18 | #include <linux/writeback.h> | 18 | #include <linux/writeback.h> |
19 | #include <linux/syscalls.h> | 19 | #include <linux/syscalls.h> |
20 | #include <linux/swap.h> | ||
20 | 21 | ||
21 | #include <asm/unistd.h> | 22 | #include <asm/unistd.h> |
22 | 23 | ||
@@ -120,9 +121,22 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
120 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; | 121 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; |
121 | end_index = (endbyte >> PAGE_CACHE_SHIFT); | 122 | end_index = (endbyte >> PAGE_CACHE_SHIFT); |
122 | 123 | ||
123 | if (end_index >= start_index) | 124 | if (end_index >= start_index) { |
124 | invalidate_mapping_pages(mapping, start_index, | 125 | unsigned long count = invalidate_mapping_pages(mapping, |
126 | start_index, end_index); | ||
127 | |||
128 | /* | ||
129 | * If fewer pages were invalidated than expected then | ||
130 | * it is possible that some of the pages were on | ||
131 | * a per-cpu pagevec for a remote CPU. Drain all | ||
132 | * pagevecs and try again. | ||
133 | */ | ||
134 | if (count < (end_index - start_index + 1)) { | ||
135 | lru_add_drain_all(); | ||
136 | invalidate_mapping_pages(mapping, start_index, | ||
125 | end_index); | 137 | end_index); |
138 | } | ||
139 | } | ||
126 | break; | 140 | break; |
127 | default: | 141 | default: |
128 | ret = -EINVAL; | 142 | ret = -EINVAL; |
diff --git a/mm/fremap.c b/mm/fremap.c index a0aaf0e56800..0cd4c11488ed 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -129,6 +129,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
129 | struct vm_area_struct *vma; | 129 | struct vm_area_struct *vma; |
130 | int err = -EINVAL; | 130 | int err = -EINVAL; |
131 | int has_write_lock = 0; | 131 | int has_write_lock = 0; |
132 | vm_flags_t vm_flags; | ||
132 | 133 | ||
133 | if (prot) | 134 | if (prot) |
134 | return err; | 135 | return err; |
@@ -160,15 +161,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
160 | /* | 161 | /* |
161 | * Make sure the vma is shared, that it supports prefaulting, | 162 | * Make sure the vma is shared, that it supports prefaulting, |
162 | * and that the remapped range is valid and fully within | 163 | * and that the remapped range is valid and fully within |
163 | * the single existing vma. vm_private_data is used as a | 164 | * the single existing vma. |
164 | * swapout cursor in a VM_NONLINEAR vma. | ||
165 | */ | 165 | */ |
166 | if (!vma || !(vma->vm_flags & VM_SHARED)) | 166 | if (!vma || !(vma->vm_flags & VM_SHARED)) |
167 | goto out; | 167 | goto out; |
168 | 168 | ||
169 | if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) | ||
170 | goto out; | ||
171 | |||
172 | if (!vma->vm_ops || !vma->vm_ops->remap_pages) | 169 | if (!vma->vm_ops || !vma->vm_ops->remap_pages) |
173 | goto out; | 170 | goto out; |
174 | 171 | ||
@@ -177,6 +174,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
177 | 174 | ||
178 | /* Must set VM_NONLINEAR before any pages are populated. */ | 175 | /* Must set VM_NONLINEAR before any pages are populated. */ |
179 | if (!(vma->vm_flags & VM_NONLINEAR)) { | 176 | if (!(vma->vm_flags & VM_NONLINEAR)) { |
177 | /* | ||
178 | * vm_private_data is used as a swapout cursor | ||
179 | * in a VM_NONLINEAR vma. | ||
180 | */ | ||
181 | if (vma->vm_private_data) | ||
182 | goto out; | ||
183 | |||
180 | /* Don't need a nonlinear mapping, exit success */ | 184 | /* Don't need a nonlinear mapping, exit success */ |
181 | if (pgoff == linear_page_index(vma, start)) { | 185 | if (pgoff == linear_page_index(vma, start)) { |
182 | err = 0; | 186 | err = 0; |
@@ -184,6 +188,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
184 | } | 188 | } |
185 | 189 | ||
186 | if (!has_write_lock) { | 190 | if (!has_write_lock) { |
191 | get_write_lock: | ||
187 | up_read(&mm->mmap_sem); | 192 | up_read(&mm->mmap_sem); |
188 | down_write(&mm->mmap_sem); | 193 | down_write(&mm->mmap_sem); |
189 | has_write_lock = 1; | 194 | has_write_lock = 1; |
@@ -199,9 +204,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
199 | unsigned long addr; | 204 | unsigned long addr; |
200 | struct file *file = get_file(vma->vm_file); | 205 | struct file *file = get_file(vma->vm_file); |
201 | 206 | ||
202 | flags &= MAP_NONBLOCK; | 207 | vm_flags = vma->vm_flags; |
203 | addr = mmap_region(file, start, size, | 208 | if (!(flags & MAP_NONBLOCK)) |
204 | flags, vma->vm_flags, pgoff); | 209 | vm_flags |= VM_POPULATE; |
210 | addr = mmap_region(file, start, size, vm_flags, pgoff); | ||
205 | fput(file); | 211 | fput(file); |
206 | if (IS_ERR_VALUE(addr)) { | 212 | if (IS_ERR_VALUE(addr)) { |
207 | err = addr; | 213 | err = addr; |
@@ -220,32 +226,26 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
220 | mutex_unlock(&mapping->i_mmap_mutex); | 226 | mutex_unlock(&mapping->i_mmap_mutex); |
221 | } | 227 | } |
222 | 228 | ||
229 | if (!(flags & MAP_NONBLOCK) && !(vma->vm_flags & VM_POPULATE)) { | ||
230 | if (!has_write_lock) | ||
231 | goto get_write_lock; | ||
232 | vma->vm_flags |= VM_POPULATE; | ||
233 | } | ||
234 | |||
223 | if (vma->vm_flags & VM_LOCKED) { | 235 | if (vma->vm_flags & VM_LOCKED) { |
224 | /* | 236 | /* |
225 | * drop PG_Mlocked flag for over-mapped range | 237 | * drop PG_Mlocked flag for over-mapped range |
226 | */ | 238 | */ |
227 | vm_flags_t saved_flags = vma->vm_flags; | 239 | if (!has_write_lock) |
240 | goto get_write_lock; | ||
241 | vm_flags = vma->vm_flags; | ||
228 | munlock_vma_pages_range(vma, start, start + size); | 242 | munlock_vma_pages_range(vma, start, start + size); |
229 | vma->vm_flags = saved_flags; | 243 | vma->vm_flags = vm_flags; |
230 | } | 244 | } |
231 | 245 | ||
232 | mmu_notifier_invalidate_range_start(mm, start, start + size); | 246 | mmu_notifier_invalidate_range_start(mm, start, start + size); |
233 | err = vma->vm_ops->remap_pages(vma, start, size, pgoff); | 247 | err = vma->vm_ops->remap_pages(vma, start, size, pgoff); |
234 | mmu_notifier_invalidate_range_end(mm, start, start + size); | 248 | mmu_notifier_invalidate_range_end(mm, start, start + size); |
235 | if (!err && !(flags & MAP_NONBLOCK)) { | ||
236 | if (vma->vm_flags & VM_LOCKED) { | ||
237 | /* | ||
238 | * might be mapping previously unmapped range of file | ||
239 | */ | ||
240 | mlock_vma_pages_range(vma, start, start + size); | ||
241 | } else { | ||
242 | if (unlikely(has_write_lock)) { | ||
243 | downgrade_write(&mm->mmap_sem); | ||
244 | has_write_lock = 0; | ||
245 | } | ||
246 | make_pages_present(start, start+size); | ||
247 | } | ||
248 | } | ||
249 | 249 | ||
250 | /* | 250 | /* |
251 | * We can't clear VM_NONLINEAR because we'd have to do | 251 | * We can't clear VM_NONLINEAR because we'd have to do |
@@ -254,10 +254,13 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
254 | */ | 254 | */ |
255 | 255 | ||
256 | out: | 256 | out: |
257 | vm_flags = vma->vm_flags; | ||
257 | if (likely(!has_write_lock)) | 258 | if (likely(!has_write_lock)) |
258 | up_read(&mm->mmap_sem); | 259 | up_read(&mm->mmap_sem); |
259 | else | 260 | else |
260 | up_write(&mm->mmap_sem); | 261 | up_write(&mm->mmap_sem); |
262 | if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) | ||
263 | mm_populate(start, size); | ||
261 | 264 | ||
262 | return err; | 265 | return err; |
263 | } | 266 | } |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b5783d81eda9..bfa142e67b1c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/mman.h> | 20 | #include <linux/mman.h> |
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/migrate.h> | 22 | #include <linux/migrate.h> |
23 | #include <linux/hashtable.h> | ||
23 | 24 | ||
24 | #include <asm/tlb.h> | 25 | #include <asm/tlb.h> |
25 | #include <asm/pgalloc.h> | 26 | #include <asm/pgalloc.h> |
@@ -62,12 +63,11 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); | |||
62 | static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; | 63 | static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; |
63 | 64 | ||
64 | static int khugepaged(void *none); | 65 | static int khugepaged(void *none); |
65 | static int mm_slots_hash_init(void); | ||
66 | static int khugepaged_slab_init(void); | 66 | static int khugepaged_slab_init(void); |
67 | static void khugepaged_slab_free(void); | ||
68 | 67 | ||
69 | #define MM_SLOTS_HASH_HEADS 1024 | 68 | #define MM_SLOTS_HASH_BITS 10 |
70 | static struct hlist_head *mm_slots_hash __read_mostly; | 69 | static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); |
70 | |||
71 | static struct kmem_cache *mm_slot_cache __read_mostly; | 71 | static struct kmem_cache *mm_slot_cache __read_mostly; |
72 | 72 | ||
73 | /** | 73 | /** |
@@ -105,7 +105,6 @@ static int set_recommended_min_free_kbytes(void) | |||
105 | struct zone *zone; | 105 | struct zone *zone; |
106 | int nr_zones = 0; | 106 | int nr_zones = 0; |
107 | unsigned long recommended_min; | 107 | unsigned long recommended_min; |
108 | extern int min_free_kbytes; | ||
109 | 108 | ||
110 | if (!khugepaged_enabled()) | 109 | if (!khugepaged_enabled()) |
111 | return 0; | 110 | return 0; |
@@ -634,12 +633,6 @@ static int __init hugepage_init(void) | |||
634 | if (err) | 633 | if (err) |
635 | goto out; | 634 | goto out; |
636 | 635 | ||
637 | err = mm_slots_hash_init(); | ||
638 | if (err) { | ||
639 | khugepaged_slab_free(); | ||
640 | goto out; | ||
641 | } | ||
642 | |||
643 | register_shrinker(&huge_zero_page_shrinker); | 636 | register_shrinker(&huge_zero_page_shrinker); |
644 | 637 | ||
645 | /* | 638 | /* |
@@ -1302,7 +1295,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1302 | int target_nid; | 1295 | int target_nid; |
1303 | int current_nid = -1; | 1296 | int current_nid = -1; |
1304 | bool migrated; | 1297 | bool migrated; |
1305 | bool page_locked = false; | ||
1306 | 1298 | ||
1307 | spin_lock(&mm->page_table_lock); | 1299 | spin_lock(&mm->page_table_lock); |
1308 | if (unlikely(!pmd_same(pmd, *pmdp))) | 1300 | if (unlikely(!pmd_same(pmd, *pmdp))) |
@@ -1324,7 +1316,6 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1324 | /* Acquire the page lock to serialise THP migrations */ | 1316 | /* Acquire the page lock to serialise THP migrations */ |
1325 | spin_unlock(&mm->page_table_lock); | 1317 | spin_unlock(&mm->page_table_lock); |
1326 | lock_page(page); | 1318 | lock_page(page); |
1327 | page_locked = true; | ||
1328 | 1319 | ||
1329 | /* Confirm the PTE did not while locked */ | 1320 | /* Confirm the PTE did not while locked */ |
1330 | spin_lock(&mm->page_table_lock); | 1321 | spin_lock(&mm->page_table_lock); |
@@ -1337,34 +1328,26 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1337 | 1328 | ||
1338 | /* Migrate the THP to the requested node */ | 1329 | /* Migrate the THP to the requested node */ |
1339 | migrated = migrate_misplaced_transhuge_page(mm, vma, | 1330 | migrated = migrate_misplaced_transhuge_page(mm, vma, |
1340 | pmdp, pmd, addr, | 1331 | pmdp, pmd, addr, page, target_nid); |
1341 | page, target_nid); | 1332 | if (!migrated) |
1342 | if (migrated) | 1333 | goto check_same; |
1343 | current_nid = target_nid; | ||
1344 | else { | ||
1345 | spin_lock(&mm->page_table_lock); | ||
1346 | if (unlikely(!pmd_same(pmd, *pmdp))) { | ||
1347 | unlock_page(page); | ||
1348 | goto out_unlock; | ||
1349 | } | ||
1350 | goto clear_pmdnuma; | ||
1351 | } | ||
1352 | 1334 | ||
1353 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | 1335 | task_numa_fault(target_nid, HPAGE_PMD_NR, true); |
1354 | return 0; | 1336 | return 0; |
1355 | 1337 | ||
1338 | check_same: | ||
1339 | spin_lock(&mm->page_table_lock); | ||
1340 | if (unlikely(!pmd_same(pmd, *pmdp))) | ||
1341 | goto out_unlock; | ||
1356 | clear_pmdnuma: | 1342 | clear_pmdnuma: |
1357 | pmd = pmd_mknonnuma(pmd); | 1343 | pmd = pmd_mknonnuma(pmd); |
1358 | set_pmd_at(mm, haddr, pmdp, pmd); | 1344 | set_pmd_at(mm, haddr, pmdp, pmd); |
1359 | VM_BUG_ON(pmd_numa(*pmdp)); | 1345 | VM_BUG_ON(pmd_numa(*pmdp)); |
1360 | update_mmu_cache_pmd(vma, addr, pmdp); | 1346 | update_mmu_cache_pmd(vma, addr, pmdp); |
1361 | if (page_locked) | ||
1362 | unlock_page(page); | ||
1363 | |||
1364 | out_unlock: | 1347 | out_unlock: |
1365 | spin_unlock(&mm->page_table_lock); | 1348 | spin_unlock(&mm->page_table_lock); |
1366 | if (current_nid != -1) | 1349 | if (current_nid != -1) |
1367 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | 1350 | task_numa_fault(current_nid, HPAGE_PMD_NR, false); |
1368 | return 0; | 1351 | return 0; |
1369 | } | 1352 | } |
1370 | 1353 | ||
@@ -1656,7 +1639,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1656 | page_tail->mapping = page->mapping; | 1639 | page_tail->mapping = page->mapping; |
1657 | 1640 | ||
1658 | page_tail->index = page->index + i; | 1641 | page_tail->index = page->index + i; |
1659 | page_xchg_last_nid(page_tail, page_last_nid(page)); | 1642 | page_nid_xchg_last(page_tail, page_nid_last(page)); |
1660 | 1643 | ||
1661 | BUG_ON(!PageAnon(page_tail)); | 1644 | BUG_ON(!PageAnon(page_tail)); |
1662 | BUG_ON(!PageUptodate(page_tail)); | 1645 | BUG_ON(!PageUptodate(page_tail)); |
@@ -1846,7 +1829,7 @@ int split_huge_page(struct page *page) | |||
1846 | 1829 | ||
1847 | BUG_ON(PageCompound(page)); | 1830 | BUG_ON(PageCompound(page)); |
1848 | out_unlock: | 1831 | out_unlock: |
1849 | anon_vma_unlock(anon_vma); | 1832 | anon_vma_unlock_write(anon_vma); |
1850 | put_anon_vma(anon_vma); | 1833 | put_anon_vma(anon_vma); |
1851 | out: | 1834 | out: |
1852 | return ret; | 1835 | return ret; |
@@ -1908,12 +1891,6 @@ static int __init khugepaged_slab_init(void) | |||
1908 | return 0; | 1891 | return 0; |
1909 | } | 1892 | } |
1910 | 1893 | ||
1911 | static void __init khugepaged_slab_free(void) | ||
1912 | { | ||
1913 | kmem_cache_destroy(mm_slot_cache); | ||
1914 | mm_slot_cache = NULL; | ||
1915 | } | ||
1916 | |||
1917 | static inline struct mm_slot *alloc_mm_slot(void) | 1894 | static inline struct mm_slot *alloc_mm_slot(void) |
1918 | { | 1895 | { |
1919 | if (!mm_slot_cache) /* initialization failed */ | 1896 | if (!mm_slot_cache) /* initialization failed */ |
@@ -1926,47 +1903,23 @@ static inline void free_mm_slot(struct mm_slot *mm_slot) | |||
1926 | kmem_cache_free(mm_slot_cache, mm_slot); | 1903 | kmem_cache_free(mm_slot_cache, mm_slot); |
1927 | } | 1904 | } |
1928 | 1905 | ||
1929 | static int __init mm_slots_hash_init(void) | ||
1930 | { | ||
1931 | mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), | ||
1932 | GFP_KERNEL); | ||
1933 | if (!mm_slots_hash) | ||
1934 | return -ENOMEM; | ||
1935 | return 0; | ||
1936 | } | ||
1937 | |||
1938 | #if 0 | ||
1939 | static void __init mm_slots_hash_free(void) | ||
1940 | { | ||
1941 | kfree(mm_slots_hash); | ||
1942 | mm_slots_hash = NULL; | ||
1943 | } | ||
1944 | #endif | ||
1945 | |||
1946 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | 1906 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) |
1947 | { | 1907 | { |
1948 | struct mm_slot *mm_slot; | 1908 | struct mm_slot *mm_slot; |
1949 | struct hlist_head *bucket; | ||
1950 | struct hlist_node *node; | 1909 | struct hlist_node *node; |
1951 | 1910 | ||
1952 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | 1911 | hash_for_each_possible(mm_slots_hash, mm_slot, node, hash, (unsigned long)mm) |
1953 | % MM_SLOTS_HASH_HEADS]; | ||
1954 | hlist_for_each_entry(mm_slot, node, bucket, hash) { | ||
1955 | if (mm == mm_slot->mm) | 1912 | if (mm == mm_slot->mm) |
1956 | return mm_slot; | 1913 | return mm_slot; |
1957 | } | 1914 | |
1958 | return NULL; | 1915 | return NULL; |
1959 | } | 1916 | } |
1960 | 1917 | ||
1961 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | 1918 | static void insert_to_mm_slots_hash(struct mm_struct *mm, |
1962 | struct mm_slot *mm_slot) | 1919 | struct mm_slot *mm_slot) |
1963 | { | 1920 | { |
1964 | struct hlist_head *bucket; | ||
1965 | |||
1966 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | ||
1967 | % MM_SLOTS_HASH_HEADS]; | ||
1968 | mm_slot->mm = mm; | 1921 | mm_slot->mm = mm; |
1969 | hlist_add_head(&mm_slot->hash, bucket); | 1922 | hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); |
1970 | } | 1923 | } |
1971 | 1924 | ||
1972 | static inline int khugepaged_test_exit(struct mm_struct *mm) | 1925 | static inline int khugepaged_test_exit(struct mm_struct *mm) |
@@ -2035,7 +1988,7 @@ void __khugepaged_exit(struct mm_struct *mm) | |||
2035 | spin_lock(&khugepaged_mm_lock); | 1988 | spin_lock(&khugepaged_mm_lock); |
2036 | mm_slot = get_mm_slot(mm); | 1989 | mm_slot = get_mm_slot(mm); |
2037 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { | 1990 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { |
2038 | hlist_del(&mm_slot->hash); | 1991 | hash_del(&mm_slot->hash); |
2039 | list_del(&mm_slot->mm_node); | 1992 | list_del(&mm_slot->mm_node); |
2040 | free = 1; | 1993 | free = 1; |
2041 | } | 1994 | } |
@@ -2368,7 +2321,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2368 | BUG_ON(!pmd_none(*pmd)); | 2321 | BUG_ON(!pmd_none(*pmd)); |
2369 | set_pmd_at(mm, address, pmd, _pmd); | 2322 | set_pmd_at(mm, address, pmd, _pmd); |
2370 | spin_unlock(&mm->page_table_lock); | 2323 | spin_unlock(&mm->page_table_lock); |
2371 | anon_vma_unlock(vma->anon_vma); | 2324 | anon_vma_unlock_write(vma->anon_vma); |
2372 | goto out; | 2325 | goto out; |
2373 | } | 2326 | } |
2374 | 2327 | ||
@@ -2376,7 +2329,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2376 | * All pages are isolated and locked so anon_vma rmap | 2329 | * All pages are isolated and locked so anon_vma rmap |
2377 | * can't run anymore. | 2330 | * can't run anymore. |
2378 | */ | 2331 | */ |
2379 | anon_vma_unlock(vma->anon_vma); | 2332 | anon_vma_unlock_write(vma->anon_vma); |
2380 | 2333 | ||
2381 | __collapse_huge_page_copy(pte, new_page, vma, address, ptl); | 2334 | __collapse_huge_page_copy(pte, new_page, vma, address, ptl); |
2382 | pte_unmap(pte); | 2335 | pte_unmap(pte); |
@@ -2423,7 +2376,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2423 | struct page *page; | 2376 | struct page *page; |
2424 | unsigned long _address; | 2377 | unsigned long _address; |
2425 | spinlock_t *ptl; | 2378 | spinlock_t *ptl; |
2426 | int node = -1; | 2379 | int node = NUMA_NO_NODE; |
2427 | 2380 | ||
2428 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2381 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
2429 | 2382 | ||
@@ -2453,7 +2406,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2453 | * be more sophisticated and look at more pages, | 2406 | * be more sophisticated and look at more pages, |
2454 | * but isn't for now. | 2407 | * but isn't for now. |
2455 | */ | 2408 | */ |
2456 | if (node == -1) | 2409 | if (node == NUMA_NO_NODE) |
2457 | node = page_to_nid(page); | 2410 | node = page_to_nid(page); |
2458 | VM_BUG_ON(PageCompound(page)); | 2411 | VM_BUG_ON(PageCompound(page)); |
2459 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | 2412 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) |
@@ -2484,7 +2437,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) | |||
2484 | 2437 | ||
2485 | if (khugepaged_test_exit(mm)) { | 2438 | if (khugepaged_test_exit(mm)) { |
2486 | /* free mm_slot */ | 2439 | /* free mm_slot */ |
2487 | hlist_del(&mm_slot->hash); | 2440 | hash_del(&mm_slot->hash); |
2488 | list_del(&mm_slot->mm_node); | 2441 | list_del(&mm_slot->mm_node); |
2489 | 2442 | ||
2490 | /* | 2443 | /* |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 546db81820e4..cdb64e4d238a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -1293,8 +1293,7 @@ static void __init report_hugepages(void) | |||
1293 | 1293 | ||
1294 | for_each_hstate(h) { | 1294 | for_each_hstate(h) { |
1295 | char buf[32]; | 1295 | char buf[32]; |
1296 | printk(KERN_INFO "HugeTLB registered %s page size, " | 1296 | pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", |
1297 | "pre-allocated %ld pages\n", | ||
1298 | memfmt(buf, huge_page_size(h)), | 1297 | memfmt(buf, huge_page_size(h)), |
1299 | h->free_huge_pages); | 1298 | h->free_huge_pages); |
1300 | } | 1299 | } |
@@ -1702,8 +1701,7 @@ static void __init hugetlb_sysfs_init(void) | |||
1702 | err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, | 1701 | err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, |
1703 | hstate_kobjs, &hstate_attr_group); | 1702 | hstate_kobjs, &hstate_attr_group); |
1704 | if (err) | 1703 | if (err) |
1705 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", | 1704 | pr_err("Hugetlb: Unable to add hstate %s", h->name); |
1706 | h->name); | ||
1707 | } | 1705 | } |
1708 | } | 1706 | } |
1709 | 1707 | ||
@@ -1826,9 +1824,8 @@ void hugetlb_register_node(struct node *node) | |||
1826 | nhs->hstate_kobjs, | 1824 | nhs->hstate_kobjs, |
1827 | &per_node_hstate_attr_group); | 1825 | &per_node_hstate_attr_group); |
1828 | if (err) { | 1826 | if (err) { |
1829 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s" | 1827 | pr_err("Hugetlb: Unable to add hstate %s for node %d\n", |
1830 | " for node %d\n", | 1828 | h->name, node->dev.id); |
1831 | h->name, node->dev.id); | ||
1832 | hugetlb_unregister_node(node); | 1829 | hugetlb_unregister_node(node); |
1833 | break; | 1830 | break; |
1834 | } | 1831 | } |
@@ -1924,7 +1921,7 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1924 | unsigned long i; | 1921 | unsigned long i; |
1925 | 1922 | ||
1926 | if (size_to_hstate(PAGE_SIZE << order)) { | 1923 | if (size_to_hstate(PAGE_SIZE << order)) { |
1927 | printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); | 1924 | pr_warning("hugepagesz= specified twice, ignoring\n"); |
1928 | return; | 1925 | return; |
1929 | } | 1926 | } |
1930 | BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); | 1927 | BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); |
@@ -1960,8 +1957,8 @@ static int __init hugetlb_nrpages_setup(char *s) | |||
1960 | mhp = &parsed_hstate->max_huge_pages; | 1957 | mhp = &parsed_hstate->max_huge_pages; |
1961 | 1958 | ||
1962 | if (mhp == last_mhp) { | 1959 | if (mhp == last_mhp) { |
1963 | printk(KERN_WARNING "hugepages= specified twice without " | 1960 | pr_warning("hugepages= specified twice without " |
1964 | "interleaving hugepagesz=, ignoring\n"); | 1961 | "interleaving hugepagesz=, ignoring\n"); |
1965 | return 1; | 1962 | return 1; |
1966 | } | 1963 | } |
1967 | 1964 | ||
@@ -2692,9 +2689,8 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2692 | * COW. Warn that such a situation has occurred as it may not be obvious | 2689 | * COW. Warn that such a situation has occurred as it may not be obvious |
2693 | */ | 2690 | */ |
2694 | if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { | 2691 | if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { |
2695 | printk(KERN_WARNING | 2692 | pr_warning("PID %d killed due to inadequate hugepage pool\n", |
2696 | "PID %d killed due to inadequate hugepage pool\n", | 2693 | current->pid); |
2697 | current->pid); | ||
2698 | return ret; | 2694 | return ret; |
2699 | } | 2695 | } |
2700 | 2696 | ||
@@ -2924,14 +2920,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, | |||
2924 | return NULL; | 2920 | return NULL; |
2925 | } | 2921 | } |
2926 | 2922 | ||
2927 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2923 | long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, |
2928 | struct page **pages, struct vm_area_struct **vmas, | 2924 | struct page **pages, struct vm_area_struct **vmas, |
2929 | unsigned long *position, int *length, int i, | 2925 | unsigned long *position, unsigned long *nr_pages, |
2930 | unsigned int flags) | 2926 | long i, unsigned int flags) |
2931 | { | 2927 | { |
2932 | unsigned long pfn_offset; | 2928 | unsigned long pfn_offset; |
2933 | unsigned long vaddr = *position; | 2929 | unsigned long vaddr = *position; |
2934 | int remainder = *length; | 2930 | unsigned long remainder = *nr_pages; |
2935 | struct hstate *h = hstate_vma(vma); | 2931 | struct hstate *h = hstate_vma(vma); |
2936 | 2932 | ||
2937 | spin_lock(&mm->page_table_lock); | 2933 | spin_lock(&mm->page_table_lock); |
@@ -3001,7 +2997,7 @@ same_page: | |||
3001 | } | 2997 | } |
3002 | } | 2998 | } |
3003 | spin_unlock(&mm->page_table_lock); | 2999 | spin_unlock(&mm->page_table_lock); |
3004 | *length = remainder; | 3000 | *nr_pages = remainder; |
3005 | *position = vaddr; | 3001 | *position = vaddr; |
3006 | 3002 | ||
3007 | return i ? i : -EFAULT; | 3003 | return i ? i : -EFAULT; |
diff --git a/mm/internal.h b/mm/internal.h index 9ba21100ebf3..1c0c4cc0fcf7 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -162,8 +162,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | |||
162 | struct vm_area_struct *prev, struct rb_node *rb_parent); | 162 | struct vm_area_struct *prev, struct rb_node *rb_parent); |
163 | 163 | ||
164 | #ifdef CONFIG_MMU | 164 | #ifdef CONFIG_MMU |
165 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, | 165 | extern long __mlock_vma_pages_range(struct vm_area_struct *vma, |
166 | unsigned long start, unsigned long end); | 166 | unsigned long start, unsigned long end, int *nonblocking); |
167 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, | 167 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, |
168 | unsigned long start, unsigned long end); | 168 | unsigned long start, unsigned long end); |
169 | static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | 169 | static inline void munlock_vma_pages_all(struct vm_area_struct *vma) |
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 752a705c77c2..83dd5fbf5e60 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -1300,9 +1300,8 @@ static void kmemleak_scan(void) | |||
1300 | */ | 1300 | */ |
1301 | lock_memory_hotplug(); | 1301 | lock_memory_hotplug(); |
1302 | for_each_online_node(i) { | 1302 | for_each_online_node(i) { |
1303 | pg_data_t *pgdat = NODE_DATA(i); | 1303 | unsigned long start_pfn = node_start_pfn(i); |
1304 | unsigned long start_pfn = pgdat->node_start_pfn; | 1304 | unsigned long end_pfn = node_end_pfn(i); |
1305 | unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; | ||
1306 | unsigned long pfn; | 1305 | unsigned long pfn; |
1307 | 1306 | ||
1308 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 1307 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
@@ -33,13 +33,22 @@ | |||
33 | #include <linux/mmu_notifier.h> | 33 | #include <linux/mmu_notifier.h> |
34 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
35 | #include <linux/ksm.h> | 35 | #include <linux/ksm.h> |
36 | #include <linux/hash.h> | 36 | #include <linux/hashtable.h> |
37 | #include <linux/freezer.h> | 37 | #include <linux/freezer.h> |
38 | #include <linux/oom.h> | 38 | #include <linux/oom.h> |
39 | #include <linux/numa.h> | ||
39 | 40 | ||
40 | #include <asm/tlbflush.h> | 41 | #include <asm/tlbflush.h> |
41 | #include "internal.h" | 42 | #include "internal.h" |
42 | 43 | ||
44 | #ifdef CONFIG_NUMA | ||
45 | #define NUMA(x) (x) | ||
46 | #define DO_NUMA(x) do { (x); } while (0) | ||
47 | #else | ||
48 | #define NUMA(x) (0) | ||
49 | #define DO_NUMA(x) do { } while (0) | ||
50 | #endif | ||
51 | |||
43 | /* | 52 | /* |
44 | * A few notes about the KSM scanning process, | 53 | * A few notes about the KSM scanning process, |
45 | * to make it easier to understand the data structures below: | 54 | * to make it easier to understand the data structures below: |
@@ -78,6 +87,9 @@ | |||
78 | * take 10 attempts to find a page in the unstable tree, once it is found, | 87 | * take 10 attempts to find a page in the unstable tree, once it is found, |
79 | * it is secured in the stable tree. (When we scan a new page, we first | 88 | * it is secured in the stable tree. (When we scan a new page, we first |
80 | * compare it against the stable tree, and then against the unstable tree.) | 89 | * compare it against the stable tree, and then against the unstable tree.) |
90 | * | ||
91 | * If the merge_across_nodes tunable is unset, then KSM maintains multiple | ||
92 | * stable trees and multiple unstable trees: one of each for each NUMA node. | ||
81 | */ | 93 | */ |
82 | 94 | ||
83 | /** | 95 | /** |
@@ -113,19 +125,32 @@ struct ksm_scan { | |||
113 | /** | 125 | /** |
114 | * struct stable_node - node of the stable rbtree | 126 | * struct stable_node - node of the stable rbtree |
115 | * @node: rb node of this ksm page in the stable tree | 127 | * @node: rb node of this ksm page in the stable tree |
128 | * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list | ||
129 | * @list: linked into migrate_nodes, pending placement in the proper node tree | ||
116 | * @hlist: hlist head of rmap_items using this ksm page | 130 | * @hlist: hlist head of rmap_items using this ksm page |
117 | * @kpfn: page frame number of this ksm page | 131 | * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) |
132 | * @nid: NUMA node id of stable tree in which linked (may not match kpfn) | ||
118 | */ | 133 | */ |
119 | struct stable_node { | 134 | struct stable_node { |
120 | struct rb_node node; | 135 | union { |
136 | struct rb_node node; /* when node of stable tree */ | ||
137 | struct { /* when listed for migration */ | ||
138 | struct list_head *head; | ||
139 | struct list_head list; | ||
140 | }; | ||
141 | }; | ||
121 | struct hlist_head hlist; | 142 | struct hlist_head hlist; |
122 | unsigned long kpfn; | 143 | unsigned long kpfn; |
144 | #ifdef CONFIG_NUMA | ||
145 | int nid; | ||
146 | #endif | ||
123 | }; | 147 | }; |
124 | 148 | ||
125 | /** | 149 | /** |
126 | * struct rmap_item - reverse mapping item for virtual addresses | 150 | * struct rmap_item - reverse mapping item for virtual addresses |
127 | * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list | 151 | * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list |
128 | * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree | 152 | * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree |
153 | * @nid: NUMA node id of unstable tree in which linked (may not match page) | ||
129 | * @mm: the memory structure this rmap_item is pointing into | 154 | * @mm: the memory structure this rmap_item is pointing into |
130 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) | 155 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) |
131 | * @oldchecksum: previous checksum of the page at that virtual address | 156 | * @oldchecksum: previous checksum of the page at that virtual address |
@@ -135,7 +160,12 @@ struct stable_node { | |||
135 | */ | 160 | */ |
136 | struct rmap_item { | 161 | struct rmap_item { |
137 | struct rmap_item *rmap_list; | 162 | struct rmap_item *rmap_list; |
138 | struct anon_vma *anon_vma; /* when stable */ | 163 | union { |
164 | struct anon_vma *anon_vma; /* when stable */ | ||
165 | #ifdef CONFIG_NUMA | ||
166 | int nid; /* when node of unstable tree */ | ||
167 | #endif | ||
168 | }; | ||
139 | struct mm_struct *mm; | 169 | struct mm_struct *mm; |
140 | unsigned long address; /* + low bits used for flags below */ | 170 | unsigned long address; /* + low bits used for flags below */ |
141 | unsigned int oldchecksum; /* when unstable */ | 171 | unsigned int oldchecksum; /* when unstable */ |
@@ -153,12 +183,16 @@ struct rmap_item { | |||
153 | #define STABLE_FLAG 0x200 /* is listed from the stable tree */ | 183 | #define STABLE_FLAG 0x200 /* is listed from the stable tree */ |
154 | 184 | ||
155 | /* The stable and unstable tree heads */ | 185 | /* The stable and unstable tree heads */ |
156 | static struct rb_root root_stable_tree = RB_ROOT; | 186 | static struct rb_root one_stable_tree[1] = { RB_ROOT }; |
157 | static struct rb_root root_unstable_tree = RB_ROOT; | 187 | static struct rb_root one_unstable_tree[1] = { RB_ROOT }; |
188 | static struct rb_root *root_stable_tree = one_stable_tree; | ||
189 | static struct rb_root *root_unstable_tree = one_unstable_tree; | ||
158 | 190 | ||
159 | #define MM_SLOTS_HASH_SHIFT 10 | 191 | /* Recently migrated nodes of stable tree, pending proper placement */ |
160 | #define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) | 192 | static LIST_HEAD(migrate_nodes); |
161 | static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; | 193 | |
194 | #define MM_SLOTS_HASH_BITS 10 | ||
195 | static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); | ||
162 | 196 | ||
163 | static struct mm_slot ksm_mm_head = { | 197 | static struct mm_slot ksm_mm_head = { |
164 | .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), | 198 | .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), |
@@ -189,10 +223,21 @@ static unsigned int ksm_thread_pages_to_scan = 100; | |||
189 | /* Milliseconds ksmd should sleep between batches */ | 223 | /* Milliseconds ksmd should sleep between batches */ |
190 | static unsigned int ksm_thread_sleep_millisecs = 20; | 224 | static unsigned int ksm_thread_sleep_millisecs = 20; |
191 | 225 | ||
226 | #ifdef CONFIG_NUMA | ||
227 | /* Zeroed when merging across nodes is not allowed */ | ||
228 | static unsigned int ksm_merge_across_nodes = 1; | ||
229 | static int ksm_nr_node_ids = 1; | ||
230 | #else | ||
231 | #define ksm_merge_across_nodes 1U | ||
232 | #define ksm_nr_node_ids 1 | ||
233 | #endif | ||
234 | |||
192 | #define KSM_RUN_STOP 0 | 235 | #define KSM_RUN_STOP 0 |
193 | #define KSM_RUN_MERGE 1 | 236 | #define KSM_RUN_MERGE 1 |
194 | #define KSM_RUN_UNMERGE 2 | 237 | #define KSM_RUN_UNMERGE 2 |
195 | static unsigned int ksm_run = KSM_RUN_STOP; | 238 | #define KSM_RUN_OFFLINE 4 |
239 | static unsigned long ksm_run = KSM_RUN_STOP; | ||
240 | static void wait_while_offlining(void); | ||
196 | 241 | ||
197 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); | 242 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); |
198 | static DEFINE_MUTEX(ksm_thread_mutex); | 243 | static DEFINE_MUTEX(ksm_thread_mutex); |
@@ -275,31 +320,21 @@ static inline void free_mm_slot(struct mm_slot *mm_slot) | |||
275 | 320 | ||
276 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | 321 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) |
277 | { | 322 | { |
278 | struct mm_slot *mm_slot; | ||
279 | struct hlist_head *bucket; | ||
280 | struct hlist_node *node; | 323 | struct hlist_node *node; |
324 | struct mm_slot *slot; | ||
325 | |||
326 | hash_for_each_possible(mm_slots_hash, slot, node, link, (unsigned long)mm) | ||
327 | if (slot->mm == mm) | ||
328 | return slot; | ||
281 | 329 | ||
282 | bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; | ||
283 | hlist_for_each_entry(mm_slot, node, bucket, link) { | ||
284 | if (mm == mm_slot->mm) | ||
285 | return mm_slot; | ||
286 | } | ||
287 | return NULL; | 330 | return NULL; |
288 | } | 331 | } |
289 | 332 | ||
290 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | 333 | static void insert_to_mm_slots_hash(struct mm_struct *mm, |
291 | struct mm_slot *mm_slot) | 334 | struct mm_slot *mm_slot) |
292 | { | 335 | { |
293 | struct hlist_head *bucket; | ||
294 | |||
295 | bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; | ||
296 | mm_slot->mm = mm; | 336 | mm_slot->mm = mm; |
297 | hlist_add_head(&mm_slot->link, bucket); | 337 | hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); |
298 | } | ||
299 | |||
300 | static inline int in_stable_tree(struct rmap_item *rmap_item) | ||
301 | { | ||
302 | return rmap_item->address & STABLE_FLAG; | ||
303 | } | 338 | } |
304 | 339 | ||
305 | /* | 340 | /* |
@@ -333,7 +368,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
333 | 368 | ||
334 | do { | 369 | do { |
335 | cond_resched(); | 370 | cond_resched(); |
336 | page = follow_page(vma, addr, FOLL_GET); | 371 | page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION); |
337 | if (IS_ERR_OR_NULL(page)) | 372 | if (IS_ERR_OR_NULL(page)) |
338 | break; | 373 | break; |
339 | if (PageKsm(page)) | 374 | if (PageKsm(page)) |
@@ -447,6 +482,17 @@ out: page = NULL; | |||
447 | return page; | 482 | return page; |
448 | } | 483 | } |
449 | 484 | ||
485 | /* | ||
486 | * This helper is used for getting right index into array of tree roots. | ||
487 | * When merge_across_nodes knob is set to 1, there are only two rb-trees for | ||
488 | * stable and unstable pages from all nodes with roots in index 0. Otherwise, | ||
489 | * every node has its own stable and unstable tree. | ||
490 | */ | ||
491 | static inline int get_kpfn_nid(unsigned long kpfn) | ||
492 | { | ||
493 | return ksm_merge_across_nodes ? 0 : pfn_to_nid(kpfn); | ||
494 | } | ||
495 | |||
450 | static void remove_node_from_stable_tree(struct stable_node *stable_node) | 496 | static void remove_node_from_stable_tree(struct stable_node *stable_node) |
451 | { | 497 | { |
452 | struct rmap_item *rmap_item; | 498 | struct rmap_item *rmap_item; |
@@ -462,7 +508,11 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) | |||
462 | cond_resched(); | 508 | cond_resched(); |
463 | } | 509 | } |
464 | 510 | ||
465 | rb_erase(&stable_node->node, &root_stable_tree); | 511 | if (stable_node->head == &migrate_nodes) |
512 | list_del(&stable_node->list); | ||
513 | else | ||
514 | rb_erase(&stable_node->node, | ||
515 | root_stable_tree + NUMA(stable_node->nid)); | ||
466 | free_stable_node(stable_node); | 516 | free_stable_node(stable_node); |
467 | } | 517 | } |
468 | 518 | ||
@@ -472,6 +522,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) | |||
472 | * In which case we can trust the content of the page, and it | 522 | * In which case we can trust the content of the page, and it |
473 | * returns the gotten page; but if the page has now been zapped, | 523 | * returns the gotten page; but if the page has now been zapped, |
474 | * remove the stale node from the stable tree and return NULL. | 524 | * remove the stale node from the stable tree and return NULL. |
525 | * But beware, the stable node's page might be being migrated. | ||
475 | * | 526 | * |
476 | * You would expect the stable_node to hold a reference to the ksm page. | 527 | * You would expect the stable_node to hold a reference to the ksm page. |
477 | * But if it increments the page's count, swapping out has to wait for | 528 | * But if it increments the page's count, swapping out has to wait for |
@@ -482,40 +533,77 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) | |||
482 | * pointing back to this stable node. This relies on freeing a PageAnon | 533 | * pointing back to this stable node. This relies on freeing a PageAnon |
483 | * page to reset its page->mapping to NULL, and relies on no other use of | 534 | * page to reset its page->mapping to NULL, and relies on no other use of |
484 | * a page to put something that might look like our key in page->mapping. | 535 | * a page to put something that might look like our key in page->mapping. |
485 | * | ||
486 | * include/linux/pagemap.h page_cache_get_speculative() is a good reference, | ||
487 | * but this is different - made simpler by ksm_thread_mutex being held, but | ||
488 | * interesting for assuming that no other use of the struct page could ever | ||
489 | * put our expected_mapping into page->mapping (or a field of the union which | ||
490 | * coincides with page->mapping). The RCU calls are not for KSM at all, but | ||
491 | * to keep the page_count protocol described with page_cache_get_speculative. | ||
492 | * | ||
493 | * Note: it is possible that get_ksm_page() will return NULL one moment, | ||
494 | * then page the next, if the page is in between page_freeze_refs() and | ||
495 | * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page | ||
496 | * is on its way to being freed; but it is an anomaly to bear in mind. | 536 | * is on its way to being freed; but it is an anomaly to bear in mind. |
497 | */ | 537 | */ |
498 | static struct page *get_ksm_page(struct stable_node *stable_node) | 538 | static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) |
499 | { | 539 | { |
500 | struct page *page; | 540 | struct page *page; |
501 | void *expected_mapping; | 541 | void *expected_mapping; |
542 | unsigned long kpfn; | ||
502 | 543 | ||
503 | page = pfn_to_page(stable_node->kpfn); | ||
504 | expected_mapping = (void *)stable_node + | 544 | expected_mapping = (void *)stable_node + |
505 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | 545 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); |
506 | rcu_read_lock(); | 546 | again: |
507 | if (page->mapping != expected_mapping) | 547 | kpfn = ACCESS_ONCE(stable_node->kpfn); |
508 | goto stale; | 548 | page = pfn_to_page(kpfn); |
509 | if (!get_page_unless_zero(page)) | 549 | |
550 | /* | ||
551 | * page is computed from kpfn, so on most architectures reading | ||
552 | * page->mapping is naturally ordered after reading node->kpfn, | ||
553 | * but on Alpha we need to be more careful. | ||
554 | */ | ||
555 | smp_read_barrier_depends(); | ||
556 | if (ACCESS_ONCE(page->mapping) != expected_mapping) | ||
510 | goto stale; | 557 | goto stale; |
511 | if (page->mapping != expected_mapping) { | 558 | |
559 | /* | ||
560 | * We cannot do anything with the page while its refcount is 0. | ||
561 | * Usually 0 means free, or tail of a higher-order page: in which | ||
562 | * case this node is no longer referenced, and should be freed; | ||
563 | * however, it might mean that the page is under page_freeze_refs(). | ||
564 | * The __remove_mapping() case is easy, again the node is now stale; | ||
565 | * but if page is swapcache in migrate_page_move_mapping(), it might | ||
566 | * still be our page, in which case it's essential to keep the node. | ||
567 | */ | ||
568 | while (!get_page_unless_zero(page)) { | ||
569 | /* | ||
570 | * Another check for page->mapping != expected_mapping would | ||
571 | * work here too. We have chosen the !PageSwapCache test to | ||
572 | * optimize the common case, when the page is or is about to | ||
573 | * be freed: PageSwapCache is cleared (under spin_lock_irq) | ||
574 | * in the freeze_refs section of __remove_mapping(); but Anon | ||
575 | * page->mapping reset to NULL later, in free_pages_prepare(). | ||
576 | */ | ||
577 | if (!PageSwapCache(page)) | ||
578 | goto stale; | ||
579 | cpu_relax(); | ||
580 | } | ||
581 | |||
582 | if (ACCESS_ONCE(page->mapping) != expected_mapping) { | ||
512 | put_page(page); | 583 | put_page(page); |
513 | goto stale; | 584 | goto stale; |
514 | } | 585 | } |
515 | rcu_read_unlock(); | 586 | |
587 | if (lock_it) { | ||
588 | lock_page(page); | ||
589 | if (ACCESS_ONCE(page->mapping) != expected_mapping) { | ||
590 | unlock_page(page); | ||
591 | put_page(page); | ||
592 | goto stale; | ||
593 | } | ||
594 | } | ||
516 | return page; | 595 | return page; |
596 | |||
517 | stale: | 597 | stale: |
518 | rcu_read_unlock(); | 598 | /* |
599 | * We come here from above when page->mapping or !PageSwapCache | ||
600 | * suggests that the node is stale; but it might be under migration. | ||
601 | * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), | ||
602 | * before checking whether node->kpfn has been changed. | ||
603 | */ | ||
604 | smp_rmb(); | ||
605 | if (ACCESS_ONCE(stable_node->kpfn) != kpfn) | ||
606 | goto again; | ||
519 | remove_node_from_stable_tree(stable_node); | 607 | remove_node_from_stable_tree(stable_node); |
520 | return NULL; | 608 | return NULL; |
521 | } | 609 | } |
@@ -531,11 +619,10 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |||
531 | struct page *page; | 619 | struct page *page; |
532 | 620 | ||
533 | stable_node = rmap_item->head; | 621 | stable_node = rmap_item->head; |
534 | page = get_ksm_page(stable_node); | 622 | page = get_ksm_page(stable_node, true); |
535 | if (!page) | 623 | if (!page) |
536 | goto out; | 624 | goto out; |
537 | 625 | ||
538 | lock_page(page); | ||
539 | hlist_del(&rmap_item->hlist); | 626 | hlist_del(&rmap_item->hlist); |
540 | unlock_page(page); | 627 | unlock_page(page); |
541 | put_page(page); | 628 | put_page(page); |
@@ -560,8 +647,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |||
560 | age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); | 647 | age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); |
561 | BUG_ON(age > 1); | 648 | BUG_ON(age > 1); |
562 | if (!age) | 649 | if (!age) |
563 | rb_erase(&rmap_item->node, &root_unstable_tree); | 650 | rb_erase(&rmap_item->node, |
564 | 651 | root_unstable_tree + NUMA(rmap_item->nid)); | |
565 | ksm_pages_unshared--; | 652 | ksm_pages_unshared--; |
566 | rmap_item->address &= PAGE_MASK; | 653 | rmap_item->address &= PAGE_MASK; |
567 | } | 654 | } |
@@ -581,7 +668,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot, | |||
581 | } | 668 | } |
582 | 669 | ||
583 | /* | 670 | /* |
584 | * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather | 671 | * Though it's very tempting to unmerge rmap_items from stable tree rather |
585 | * than check every pte of a given vma, the locking doesn't quite work for | 672 | * than check every pte of a given vma, the locking doesn't quite work for |
586 | * that - an rmap_item is assigned to the stable tree after inserting ksm | 673 | * that - an rmap_item is assigned to the stable tree after inserting ksm |
587 | * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing | 674 | * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing |
@@ -614,6 +701,71 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, | |||
614 | /* | 701 | /* |
615 | * Only called through the sysfs control interface: | 702 | * Only called through the sysfs control interface: |
616 | */ | 703 | */ |
704 | static int remove_stable_node(struct stable_node *stable_node) | ||
705 | { | ||
706 | struct page *page; | ||
707 | int err; | ||
708 | |||
709 | page = get_ksm_page(stable_node, true); | ||
710 | if (!page) { | ||
711 | /* | ||
712 | * get_ksm_page did remove_node_from_stable_tree itself. | ||
713 | */ | ||
714 | return 0; | ||
715 | } | ||
716 | |||
717 | if (WARN_ON_ONCE(page_mapped(page))) { | ||
718 | /* | ||
719 | * This should not happen: but if it does, just refuse to let | ||
720 | * merge_across_nodes be switched - there is no need to panic. | ||
721 | */ | ||
722 | err = -EBUSY; | ||
723 | } else { | ||
724 | /* | ||
725 | * The stable node did not yet appear stale to get_ksm_page(), | ||
726 | * since that allows for an unmapped ksm page to be recognized | ||
727 | * right up until it is freed; but the node is safe to remove. | ||
728 | * This page might be in a pagevec waiting to be freed, | ||
729 | * or it might be PageSwapCache (perhaps under writeback), | ||
730 | * or it might have been removed from swapcache a moment ago. | ||
731 | */ | ||
732 | set_page_stable_node(page, NULL); | ||
733 | remove_node_from_stable_tree(stable_node); | ||
734 | err = 0; | ||
735 | } | ||
736 | |||
737 | unlock_page(page); | ||
738 | put_page(page); | ||
739 | return err; | ||
740 | } | ||
741 | |||
742 | static int remove_all_stable_nodes(void) | ||
743 | { | ||
744 | struct stable_node *stable_node; | ||
745 | struct list_head *this, *next; | ||
746 | int nid; | ||
747 | int err = 0; | ||
748 | |||
749 | for (nid = 0; nid < ksm_nr_node_ids; nid++) { | ||
750 | while (root_stable_tree[nid].rb_node) { | ||
751 | stable_node = rb_entry(root_stable_tree[nid].rb_node, | ||
752 | struct stable_node, node); | ||
753 | if (remove_stable_node(stable_node)) { | ||
754 | err = -EBUSY; | ||
755 | break; /* proceed to next nid */ | ||
756 | } | ||
757 | cond_resched(); | ||
758 | } | ||
759 | } | ||
760 | list_for_each_safe(this, next, &migrate_nodes) { | ||
761 | stable_node = list_entry(this, struct stable_node, list); | ||
762 | if (remove_stable_node(stable_node)) | ||
763 | err = -EBUSY; | ||
764 | cond_resched(); | ||
765 | } | ||
766 | return err; | ||
767 | } | ||
768 | |||
617 | static int unmerge_and_remove_all_rmap_items(void) | 769 | static int unmerge_and_remove_all_rmap_items(void) |
618 | { | 770 | { |
619 | struct mm_slot *mm_slot; | 771 | struct mm_slot *mm_slot; |
@@ -647,7 +799,7 @@ static int unmerge_and_remove_all_rmap_items(void) | |||
647 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, | 799 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, |
648 | struct mm_slot, mm_list); | 800 | struct mm_slot, mm_list); |
649 | if (ksm_test_exit(mm)) { | 801 | if (ksm_test_exit(mm)) { |
650 | hlist_del(&mm_slot->link); | 802 | hash_del(&mm_slot->link); |
651 | list_del(&mm_slot->mm_list); | 803 | list_del(&mm_slot->mm_list); |
652 | spin_unlock(&ksm_mmlist_lock); | 804 | spin_unlock(&ksm_mmlist_lock); |
653 | 805 | ||
@@ -661,6 +813,8 @@ static int unmerge_and_remove_all_rmap_items(void) | |||
661 | } | 813 | } |
662 | } | 814 | } |
663 | 815 | ||
816 | /* Clean up stable nodes, but don't worry if some are still busy */ | ||
817 | remove_all_stable_nodes(); | ||
664 | ksm_scan.seqnr = 0; | 818 | ksm_scan.seqnr = 0; |
665 | return 0; | 819 | return 0; |
666 | 820 | ||
@@ -946,6 +1100,9 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, | |||
946 | if (err) | 1100 | if (err) |
947 | goto out; | 1101 | goto out; |
948 | 1102 | ||
1103 | /* Unstable nid is in union with stable anon_vma: remove first */ | ||
1104 | remove_rmap_item_from_tree(rmap_item); | ||
1105 | |||
949 | /* Must get reference to anon_vma while still holding mmap_sem */ | 1106 | /* Must get reference to anon_vma while still holding mmap_sem */ |
950 | rmap_item->anon_vma = vma->anon_vma; | 1107 | rmap_item->anon_vma = vma->anon_vma; |
951 | get_anon_vma(vma->anon_vma); | 1108 | get_anon_vma(vma->anon_vma); |
@@ -996,42 +1153,99 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, | |||
996 | */ | 1153 | */ |
997 | static struct page *stable_tree_search(struct page *page) | 1154 | static struct page *stable_tree_search(struct page *page) |
998 | { | 1155 | { |
999 | struct rb_node *node = root_stable_tree.rb_node; | 1156 | int nid; |
1157 | struct rb_root *root; | ||
1158 | struct rb_node **new; | ||
1159 | struct rb_node *parent; | ||
1000 | struct stable_node *stable_node; | 1160 | struct stable_node *stable_node; |
1161 | struct stable_node *page_node; | ||
1001 | 1162 | ||
1002 | stable_node = page_stable_node(page); | 1163 | page_node = page_stable_node(page); |
1003 | if (stable_node) { /* ksm page forked */ | 1164 | if (page_node && page_node->head != &migrate_nodes) { |
1165 | /* ksm page forked */ | ||
1004 | get_page(page); | 1166 | get_page(page); |
1005 | return page; | 1167 | return page; |
1006 | } | 1168 | } |
1007 | 1169 | ||
1008 | while (node) { | 1170 | nid = get_kpfn_nid(page_to_pfn(page)); |
1171 | root = root_stable_tree + nid; | ||
1172 | again: | ||
1173 | new = &root->rb_node; | ||
1174 | parent = NULL; | ||
1175 | |||
1176 | while (*new) { | ||
1009 | struct page *tree_page; | 1177 | struct page *tree_page; |
1010 | int ret; | 1178 | int ret; |
1011 | 1179 | ||
1012 | cond_resched(); | 1180 | cond_resched(); |
1013 | stable_node = rb_entry(node, struct stable_node, node); | 1181 | stable_node = rb_entry(*new, struct stable_node, node); |
1014 | tree_page = get_ksm_page(stable_node); | 1182 | tree_page = get_ksm_page(stable_node, false); |
1015 | if (!tree_page) | 1183 | if (!tree_page) |
1016 | return NULL; | 1184 | return NULL; |
1017 | 1185 | ||
1018 | ret = memcmp_pages(page, tree_page); | 1186 | ret = memcmp_pages(page, tree_page); |
1187 | put_page(tree_page); | ||
1019 | 1188 | ||
1020 | if (ret < 0) { | 1189 | parent = *new; |
1021 | put_page(tree_page); | 1190 | if (ret < 0) |
1022 | node = node->rb_left; | 1191 | new = &parent->rb_left; |
1023 | } else if (ret > 0) { | 1192 | else if (ret > 0) |
1024 | put_page(tree_page); | 1193 | new = &parent->rb_right; |
1025 | node = node->rb_right; | 1194 | else { |
1026 | } else | 1195 | /* |
1027 | return tree_page; | 1196 | * Lock and unlock the stable_node's page (which |
1197 | * might already have been migrated) so that page | ||
1198 | * migration is sure to notice its raised count. | ||
1199 | * It would be more elegant to return stable_node | ||
1200 | * than kpage, but that involves more changes. | ||
1201 | */ | ||
1202 | tree_page = get_ksm_page(stable_node, true); | ||
1203 | if (tree_page) { | ||
1204 | unlock_page(tree_page); | ||
1205 | if (get_kpfn_nid(stable_node->kpfn) != | ||
1206 | NUMA(stable_node->nid)) { | ||
1207 | put_page(tree_page); | ||
1208 | goto replace; | ||
1209 | } | ||
1210 | return tree_page; | ||
1211 | } | ||
1212 | /* | ||
1213 | * There is now a place for page_node, but the tree may | ||
1214 | * have been rebalanced, so re-evaluate parent and new. | ||
1215 | */ | ||
1216 | if (page_node) | ||
1217 | goto again; | ||
1218 | return NULL; | ||
1219 | } | ||
1028 | } | 1220 | } |
1029 | 1221 | ||
1030 | return NULL; | 1222 | if (!page_node) |
1223 | return NULL; | ||
1224 | |||
1225 | list_del(&page_node->list); | ||
1226 | DO_NUMA(page_node->nid = nid); | ||
1227 | rb_link_node(&page_node->node, parent, new); | ||
1228 | rb_insert_color(&page_node->node, root); | ||
1229 | get_page(page); | ||
1230 | return page; | ||
1231 | |||
1232 | replace: | ||
1233 | if (page_node) { | ||
1234 | list_del(&page_node->list); | ||
1235 | DO_NUMA(page_node->nid = nid); | ||
1236 | rb_replace_node(&stable_node->node, &page_node->node, root); | ||
1237 | get_page(page); | ||
1238 | } else { | ||
1239 | rb_erase(&stable_node->node, root); | ||
1240 | page = NULL; | ||
1241 | } | ||
1242 | stable_node->head = &migrate_nodes; | ||
1243 | list_add(&stable_node->list, stable_node->head); | ||
1244 | return page; | ||
1031 | } | 1245 | } |
1032 | 1246 | ||
1033 | /* | 1247 | /* |
1034 | * stable_tree_insert - insert rmap_item pointing to new ksm page | 1248 | * stable_tree_insert - insert stable tree node pointing to new ksm page |
1035 | * into the stable tree. | 1249 | * into the stable tree. |
1036 | * | 1250 | * |
1037 | * This function returns the stable tree node just allocated on success, | 1251 | * This function returns the stable tree node just allocated on success, |
@@ -1039,17 +1253,25 @@ static struct page *stable_tree_search(struct page *page) | |||
1039 | */ | 1253 | */ |
1040 | static struct stable_node *stable_tree_insert(struct page *kpage) | 1254 | static struct stable_node *stable_tree_insert(struct page *kpage) |
1041 | { | 1255 | { |
1042 | struct rb_node **new = &root_stable_tree.rb_node; | 1256 | int nid; |
1257 | unsigned long kpfn; | ||
1258 | struct rb_root *root; | ||
1259 | struct rb_node **new; | ||
1043 | struct rb_node *parent = NULL; | 1260 | struct rb_node *parent = NULL; |
1044 | struct stable_node *stable_node; | 1261 | struct stable_node *stable_node; |
1045 | 1262 | ||
1263 | kpfn = page_to_pfn(kpage); | ||
1264 | nid = get_kpfn_nid(kpfn); | ||
1265 | root = root_stable_tree + nid; | ||
1266 | new = &root->rb_node; | ||
1267 | |||
1046 | while (*new) { | 1268 | while (*new) { |
1047 | struct page *tree_page; | 1269 | struct page *tree_page; |
1048 | int ret; | 1270 | int ret; |
1049 | 1271 | ||
1050 | cond_resched(); | 1272 | cond_resched(); |
1051 | stable_node = rb_entry(*new, struct stable_node, node); | 1273 | stable_node = rb_entry(*new, struct stable_node, node); |
1052 | tree_page = get_ksm_page(stable_node); | 1274 | tree_page = get_ksm_page(stable_node, false); |
1053 | if (!tree_page) | 1275 | if (!tree_page) |
1054 | return NULL; | 1276 | return NULL; |
1055 | 1277 | ||
@@ -1075,13 +1297,12 @@ static struct stable_node *stable_tree_insert(struct page *kpage) | |||
1075 | if (!stable_node) | 1297 | if (!stable_node) |
1076 | return NULL; | 1298 | return NULL; |
1077 | 1299 | ||
1078 | rb_link_node(&stable_node->node, parent, new); | ||
1079 | rb_insert_color(&stable_node->node, &root_stable_tree); | ||
1080 | |||
1081 | INIT_HLIST_HEAD(&stable_node->hlist); | 1300 | INIT_HLIST_HEAD(&stable_node->hlist); |
1082 | 1301 | stable_node->kpfn = kpfn; | |
1083 | stable_node->kpfn = page_to_pfn(kpage); | ||
1084 | set_page_stable_node(kpage, stable_node); | 1302 | set_page_stable_node(kpage, stable_node); |
1303 | DO_NUMA(stable_node->nid = nid); | ||
1304 | rb_link_node(&stable_node->node, parent, new); | ||
1305 | rb_insert_color(&stable_node->node, root); | ||
1085 | 1306 | ||
1086 | return stable_node; | 1307 | return stable_node; |
1087 | } | 1308 | } |
@@ -1104,10 +1325,15 @@ static | |||
1104 | struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | 1325 | struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, |
1105 | struct page *page, | 1326 | struct page *page, |
1106 | struct page **tree_pagep) | 1327 | struct page **tree_pagep) |
1107 | |||
1108 | { | 1328 | { |
1109 | struct rb_node **new = &root_unstable_tree.rb_node; | 1329 | struct rb_node **new; |
1330 | struct rb_root *root; | ||
1110 | struct rb_node *parent = NULL; | 1331 | struct rb_node *parent = NULL; |
1332 | int nid; | ||
1333 | |||
1334 | nid = get_kpfn_nid(page_to_pfn(page)); | ||
1335 | root = root_unstable_tree + nid; | ||
1336 | new = &root->rb_node; | ||
1111 | 1337 | ||
1112 | while (*new) { | 1338 | while (*new) { |
1113 | struct rmap_item *tree_rmap_item; | 1339 | struct rmap_item *tree_rmap_item; |
@@ -1137,6 +1363,15 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | |||
1137 | } else if (ret > 0) { | 1363 | } else if (ret > 0) { |
1138 | put_page(tree_page); | 1364 | put_page(tree_page); |
1139 | new = &parent->rb_right; | 1365 | new = &parent->rb_right; |
1366 | } else if (!ksm_merge_across_nodes && | ||
1367 | page_to_nid(tree_page) != nid) { | ||
1368 | /* | ||
1369 | * If tree_page has been migrated to another NUMA node, | ||
1370 | * it will be flushed out and put in the right unstable | ||
1371 | * tree next time: only merge with it when across_nodes. | ||
1372 | */ | ||
1373 | put_page(tree_page); | ||
1374 | return NULL; | ||
1140 | } else { | 1375 | } else { |
1141 | *tree_pagep = tree_page; | 1376 | *tree_pagep = tree_page; |
1142 | return tree_rmap_item; | 1377 | return tree_rmap_item; |
@@ -1145,8 +1380,9 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | |||
1145 | 1380 | ||
1146 | rmap_item->address |= UNSTABLE_FLAG; | 1381 | rmap_item->address |= UNSTABLE_FLAG; |
1147 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); | 1382 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); |
1383 | DO_NUMA(rmap_item->nid = nid); | ||
1148 | rb_link_node(&rmap_item->node, parent, new); | 1384 | rb_link_node(&rmap_item->node, parent, new); |
1149 | rb_insert_color(&rmap_item->node, &root_unstable_tree); | 1385 | rb_insert_color(&rmap_item->node, root); |
1150 | 1386 | ||
1151 | ksm_pages_unshared++; | 1387 | ksm_pages_unshared++; |
1152 | return NULL; | 1388 | return NULL; |
@@ -1188,10 +1424,29 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
1188 | unsigned int checksum; | 1424 | unsigned int checksum; |
1189 | int err; | 1425 | int err; |
1190 | 1426 | ||
1191 | remove_rmap_item_from_tree(rmap_item); | 1427 | stable_node = page_stable_node(page); |
1428 | if (stable_node) { | ||
1429 | if (stable_node->head != &migrate_nodes && | ||
1430 | get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { | ||
1431 | rb_erase(&stable_node->node, | ||
1432 | root_stable_tree + NUMA(stable_node->nid)); | ||
1433 | stable_node->head = &migrate_nodes; | ||
1434 | list_add(&stable_node->list, stable_node->head); | ||
1435 | } | ||
1436 | if (stable_node->head != &migrate_nodes && | ||
1437 | rmap_item->head == stable_node) | ||
1438 | return; | ||
1439 | } | ||
1192 | 1440 | ||
1193 | /* We first start with searching the page inside the stable tree */ | 1441 | /* We first start with searching the page inside the stable tree */ |
1194 | kpage = stable_tree_search(page); | 1442 | kpage = stable_tree_search(page); |
1443 | if (kpage == page && rmap_item->head == stable_node) { | ||
1444 | put_page(kpage); | ||
1445 | return; | ||
1446 | } | ||
1447 | |||
1448 | remove_rmap_item_from_tree(rmap_item); | ||
1449 | |||
1195 | if (kpage) { | 1450 | if (kpage) { |
1196 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); | 1451 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); |
1197 | if (!err) { | 1452 | if (!err) { |
@@ -1225,14 +1480,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
1225 | kpage = try_to_merge_two_pages(rmap_item, page, | 1480 | kpage = try_to_merge_two_pages(rmap_item, page, |
1226 | tree_rmap_item, tree_page); | 1481 | tree_rmap_item, tree_page); |
1227 | put_page(tree_page); | 1482 | put_page(tree_page); |
1228 | /* | ||
1229 | * As soon as we merge this page, we want to remove the | ||
1230 | * rmap_item of the page we have merged with from the unstable | ||
1231 | * tree, and insert it instead as new node in the stable tree. | ||
1232 | */ | ||
1233 | if (kpage) { | 1483 | if (kpage) { |
1234 | remove_rmap_item_from_tree(tree_rmap_item); | 1484 | /* |
1235 | 1485 | * The pages were successfully merged: insert new | |
1486 | * node in the stable tree and add both rmap_items. | ||
1487 | */ | ||
1236 | lock_page(kpage); | 1488 | lock_page(kpage); |
1237 | stable_node = stable_tree_insert(kpage); | 1489 | stable_node = stable_tree_insert(kpage); |
1238 | if (stable_node) { | 1490 | if (stable_node) { |
@@ -1289,6 +1541,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
1289 | struct mm_slot *slot; | 1541 | struct mm_slot *slot; |
1290 | struct vm_area_struct *vma; | 1542 | struct vm_area_struct *vma; |
1291 | struct rmap_item *rmap_item; | 1543 | struct rmap_item *rmap_item; |
1544 | int nid; | ||
1292 | 1545 | ||
1293 | if (list_empty(&ksm_mm_head.mm_list)) | 1546 | if (list_empty(&ksm_mm_head.mm_list)) |
1294 | return NULL; | 1547 | return NULL; |
@@ -1307,7 +1560,29 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
1307 | */ | 1560 | */ |
1308 | lru_add_drain_all(); | 1561 | lru_add_drain_all(); |
1309 | 1562 | ||
1310 | root_unstable_tree = RB_ROOT; | 1563 | /* |
1564 | * Whereas stale stable_nodes on the stable_tree itself | ||
1565 | * get pruned in the regular course of stable_tree_search(), | ||
1566 | * those moved out to the migrate_nodes list can accumulate: | ||
1567 | * so prune them once before each full scan. | ||
1568 | */ | ||
1569 | if (!ksm_merge_across_nodes) { | ||
1570 | struct stable_node *stable_node; | ||
1571 | struct list_head *this, *next; | ||
1572 | struct page *page; | ||
1573 | |||
1574 | list_for_each_safe(this, next, &migrate_nodes) { | ||
1575 | stable_node = list_entry(this, | ||
1576 | struct stable_node, list); | ||
1577 | page = get_ksm_page(stable_node, false); | ||
1578 | if (page) | ||
1579 | put_page(page); | ||
1580 | cond_resched(); | ||
1581 | } | ||
1582 | } | ||
1583 | |||
1584 | for (nid = 0; nid < ksm_nr_node_ids; nid++) | ||
1585 | root_unstable_tree[nid] = RB_ROOT; | ||
1311 | 1586 | ||
1312 | spin_lock(&ksm_mmlist_lock); | 1587 | spin_lock(&ksm_mmlist_lock); |
1313 | slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); | 1588 | slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); |
@@ -1392,7 +1667,7 @@ next_mm: | |||
1392 | * or when all VM_MERGEABLE areas have been unmapped (and | 1667 | * or when all VM_MERGEABLE areas have been unmapped (and |
1393 | * mmap_sem then protects against race with MADV_MERGEABLE). | 1668 | * mmap_sem then protects against race with MADV_MERGEABLE). |
1394 | */ | 1669 | */ |
1395 | hlist_del(&slot->link); | 1670 | hash_del(&slot->link); |
1396 | list_del(&slot->mm_list); | 1671 | list_del(&slot->mm_list); |
1397 | spin_unlock(&ksm_mmlist_lock); | 1672 | spin_unlock(&ksm_mmlist_lock); |
1398 | 1673 | ||
@@ -1428,8 +1703,7 @@ static void ksm_do_scan(unsigned int scan_npages) | |||
1428 | rmap_item = scan_get_next_rmap_item(&page); | 1703 | rmap_item = scan_get_next_rmap_item(&page); |
1429 | if (!rmap_item) | 1704 | if (!rmap_item) |
1430 | return; | 1705 | return; |
1431 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) | 1706 | cmp_and_merge_page(page, rmap_item); |
1432 | cmp_and_merge_page(page, rmap_item); | ||
1433 | put_page(page); | 1707 | put_page(page); |
1434 | } | 1708 | } |
1435 | } | 1709 | } |
@@ -1446,6 +1720,7 @@ static int ksm_scan_thread(void *nothing) | |||
1446 | 1720 | ||
1447 | while (!kthread_should_stop()) { | 1721 | while (!kthread_should_stop()) { |
1448 | mutex_lock(&ksm_thread_mutex); | 1722 | mutex_lock(&ksm_thread_mutex); |
1723 | wait_while_offlining(); | ||
1449 | if (ksmd_should_run()) | 1724 | if (ksmd_should_run()) |
1450 | ksm_do_scan(ksm_thread_pages_to_scan); | 1725 | ksm_do_scan(ksm_thread_pages_to_scan); |
1451 | mutex_unlock(&ksm_thread_mutex); | 1726 | mutex_unlock(&ksm_thread_mutex); |
@@ -1525,11 +1800,19 @@ int __ksm_enter(struct mm_struct *mm) | |||
1525 | spin_lock(&ksm_mmlist_lock); | 1800 | spin_lock(&ksm_mmlist_lock); |
1526 | insert_to_mm_slots_hash(mm, mm_slot); | 1801 | insert_to_mm_slots_hash(mm, mm_slot); |
1527 | /* | 1802 | /* |
1528 | * Insert just behind the scanning cursor, to let the area settle | 1803 | * When KSM_RUN_MERGE (or KSM_RUN_STOP), |
1804 | * insert just behind the scanning cursor, to let the area settle | ||
1529 | * down a little; when fork is followed by immediate exec, we don't | 1805 | * down a little; when fork is followed by immediate exec, we don't |
1530 | * want ksmd to waste time setting up and tearing down an rmap_list. | 1806 | * want ksmd to waste time setting up and tearing down an rmap_list. |
1807 | * | ||
1808 | * But when KSM_RUN_UNMERGE, it's important to insert ahead of its | ||
1809 | * scanning cursor, otherwise KSM pages in newly forked mms will be | ||
1810 | * missed: then we might as well insert at the end of the list. | ||
1531 | */ | 1811 | */ |
1532 | list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); | 1812 | if (ksm_run & KSM_RUN_UNMERGE) |
1813 | list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list); | ||
1814 | else | ||
1815 | list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); | ||
1533 | spin_unlock(&ksm_mmlist_lock); | 1816 | spin_unlock(&ksm_mmlist_lock); |
1534 | 1817 | ||
1535 | set_bit(MMF_VM_MERGEABLE, &mm->flags); | 1818 | set_bit(MMF_VM_MERGEABLE, &mm->flags); |
@@ -1559,7 +1842,7 @@ void __ksm_exit(struct mm_struct *mm) | |||
1559 | mm_slot = get_mm_slot(mm); | 1842 | mm_slot = get_mm_slot(mm); |
1560 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { | 1843 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { |
1561 | if (!mm_slot->rmap_list) { | 1844 | if (!mm_slot->rmap_list) { |
1562 | hlist_del(&mm_slot->link); | 1845 | hash_del(&mm_slot->link); |
1563 | list_del(&mm_slot->mm_list); | 1846 | list_del(&mm_slot->mm_list); |
1564 | easy_to_free = 1; | 1847 | easy_to_free = 1; |
1565 | } else { | 1848 | } else { |
@@ -1579,24 +1862,32 @@ void __ksm_exit(struct mm_struct *mm) | |||
1579 | } | 1862 | } |
1580 | } | 1863 | } |
1581 | 1864 | ||
1582 | struct page *ksm_does_need_to_copy(struct page *page, | 1865 | struct page *ksm_might_need_to_copy(struct page *page, |
1583 | struct vm_area_struct *vma, unsigned long address) | 1866 | struct vm_area_struct *vma, unsigned long address) |
1584 | { | 1867 | { |
1868 | struct anon_vma *anon_vma = page_anon_vma(page); | ||
1585 | struct page *new_page; | 1869 | struct page *new_page; |
1586 | 1870 | ||
1871 | if (PageKsm(page)) { | ||
1872 | if (page_stable_node(page) && | ||
1873 | !(ksm_run & KSM_RUN_UNMERGE)) | ||
1874 | return page; /* no need to copy it */ | ||
1875 | } else if (!anon_vma) { | ||
1876 | return page; /* no need to copy it */ | ||
1877 | } else if (anon_vma->root == vma->anon_vma->root && | ||
1878 | page->index == linear_page_index(vma, address)) { | ||
1879 | return page; /* still no need to copy it */ | ||
1880 | } | ||
1881 | if (!PageUptodate(page)) | ||
1882 | return page; /* let do_swap_page report the error */ | ||
1883 | |||
1587 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 1884 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
1588 | if (new_page) { | 1885 | if (new_page) { |
1589 | copy_user_highpage(new_page, page, address, vma); | 1886 | copy_user_highpage(new_page, page, address, vma); |
1590 | 1887 | ||
1591 | SetPageDirty(new_page); | 1888 | SetPageDirty(new_page); |
1592 | __SetPageUptodate(new_page); | 1889 | __SetPageUptodate(new_page); |
1593 | SetPageSwapBacked(new_page); | ||
1594 | __set_page_locked(new_page); | 1890 | __set_page_locked(new_page); |
1595 | |||
1596 | if (!mlocked_vma_newpage(vma, new_page)) | ||
1597 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); | ||
1598 | else | ||
1599 | add_page_to_unevictable_list(new_page); | ||
1600 | } | 1891 | } |
1601 | 1892 | ||
1602 | return new_page; | 1893 | return new_page; |
@@ -1773,64 +2064,115 @@ void ksm_migrate_page(struct page *newpage, struct page *oldpage) | |||
1773 | if (stable_node) { | 2064 | if (stable_node) { |
1774 | VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); | 2065 | VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); |
1775 | stable_node->kpfn = page_to_pfn(newpage); | 2066 | stable_node->kpfn = page_to_pfn(newpage); |
2067 | /* | ||
2068 | * newpage->mapping was set in advance; now we need smp_wmb() | ||
2069 | * to make sure that the new stable_node->kpfn is visible | ||
2070 | * to get_ksm_page() before it can see that oldpage->mapping | ||
2071 | * has gone stale (or that PageSwapCache has been cleared). | ||
2072 | */ | ||
2073 | smp_wmb(); | ||
2074 | set_page_stable_node(oldpage, NULL); | ||
1776 | } | 2075 | } |
1777 | } | 2076 | } |
1778 | #endif /* CONFIG_MIGRATION */ | 2077 | #endif /* CONFIG_MIGRATION */ |
1779 | 2078 | ||
1780 | #ifdef CONFIG_MEMORY_HOTREMOVE | 2079 | #ifdef CONFIG_MEMORY_HOTREMOVE |
1781 | static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, | 2080 | static int just_wait(void *word) |
1782 | unsigned long end_pfn) | ||
1783 | { | 2081 | { |
1784 | struct rb_node *node; | 2082 | schedule(); |
2083 | return 0; | ||
2084 | } | ||
1785 | 2085 | ||
1786 | for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { | 2086 | static void wait_while_offlining(void) |
1787 | struct stable_node *stable_node; | 2087 | { |
2088 | while (ksm_run & KSM_RUN_OFFLINE) { | ||
2089 | mutex_unlock(&ksm_thread_mutex); | ||
2090 | wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE), | ||
2091 | just_wait, TASK_UNINTERRUPTIBLE); | ||
2092 | mutex_lock(&ksm_thread_mutex); | ||
2093 | } | ||
2094 | } | ||
1788 | 2095 | ||
1789 | stable_node = rb_entry(node, struct stable_node, node); | 2096 | static void ksm_check_stable_tree(unsigned long start_pfn, |
2097 | unsigned long end_pfn) | ||
2098 | { | ||
2099 | struct stable_node *stable_node; | ||
2100 | struct list_head *this, *next; | ||
2101 | struct rb_node *node; | ||
2102 | int nid; | ||
2103 | |||
2104 | for (nid = 0; nid < ksm_nr_node_ids; nid++) { | ||
2105 | node = rb_first(root_stable_tree + nid); | ||
2106 | while (node) { | ||
2107 | stable_node = rb_entry(node, struct stable_node, node); | ||
2108 | if (stable_node->kpfn >= start_pfn && | ||
2109 | stable_node->kpfn < end_pfn) { | ||
2110 | /* | ||
2111 | * Don't get_ksm_page, page has already gone: | ||
2112 | * which is why we keep kpfn instead of page* | ||
2113 | */ | ||
2114 | remove_node_from_stable_tree(stable_node); | ||
2115 | node = rb_first(root_stable_tree + nid); | ||
2116 | } else | ||
2117 | node = rb_next(node); | ||
2118 | cond_resched(); | ||
2119 | } | ||
2120 | } | ||
2121 | list_for_each_safe(this, next, &migrate_nodes) { | ||
2122 | stable_node = list_entry(this, struct stable_node, list); | ||
1790 | if (stable_node->kpfn >= start_pfn && | 2123 | if (stable_node->kpfn >= start_pfn && |
1791 | stable_node->kpfn < end_pfn) | 2124 | stable_node->kpfn < end_pfn) |
1792 | return stable_node; | 2125 | remove_node_from_stable_tree(stable_node); |
2126 | cond_resched(); | ||
1793 | } | 2127 | } |
1794 | return NULL; | ||
1795 | } | 2128 | } |
1796 | 2129 | ||
1797 | static int ksm_memory_callback(struct notifier_block *self, | 2130 | static int ksm_memory_callback(struct notifier_block *self, |
1798 | unsigned long action, void *arg) | 2131 | unsigned long action, void *arg) |
1799 | { | 2132 | { |
1800 | struct memory_notify *mn = arg; | 2133 | struct memory_notify *mn = arg; |
1801 | struct stable_node *stable_node; | ||
1802 | 2134 | ||
1803 | switch (action) { | 2135 | switch (action) { |
1804 | case MEM_GOING_OFFLINE: | 2136 | case MEM_GOING_OFFLINE: |
1805 | /* | 2137 | /* |
1806 | * Keep it very simple for now: just lock out ksmd and | 2138 | * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items() |
1807 | * MADV_UNMERGEABLE while any memory is going offline. | 2139 | * and remove_all_stable_nodes() while memory is going offline: |
1808 | * mutex_lock_nested() is necessary because lockdep was alarmed | 2140 | * it is unsafe for them to touch the stable tree at this time. |
1809 | * that here we take ksm_thread_mutex inside notifier chain | 2141 | * But unmerge_ksm_pages(), rmap lookups and other entry points |
1810 | * mutex, and later take notifier chain mutex inside | 2142 | * which do not need the ksm_thread_mutex are all safe. |
1811 | * ksm_thread_mutex to unlock it. But that's safe because both | ||
1812 | * are inside mem_hotplug_mutex. | ||
1813 | */ | 2143 | */ |
1814 | mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); | 2144 | mutex_lock(&ksm_thread_mutex); |
2145 | ksm_run |= KSM_RUN_OFFLINE; | ||
2146 | mutex_unlock(&ksm_thread_mutex); | ||
1815 | break; | 2147 | break; |
1816 | 2148 | ||
1817 | case MEM_OFFLINE: | 2149 | case MEM_OFFLINE: |
1818 | /* | 2150 | /* |
1819 | * Most of the work is done by page migration; but there might | 2151 | * Most of the work is done by page migration; but there might |
1820 | * be a few stable_nodes left over, still pointing to struct | 2152 | * be a few stable_nodes left over, still pointing to struct |
1821 | * pages which have been offlined: prune those from the tree. | 2153 | * pages which have been offlined: prune those from the tree, |
2154 | * otherwise get_ksm_page() might later try to access a | ||
2155 | * non-existent struct page. | ||
1822 | */ | 2156 | */ |
1823 | while ((stable_node = ksm_check_stable_tree(mn->start_pfn, | 2157 | ksm_check_stable_tree(mn->start_pfn, |
1824 | mn->start_pfn + mn->nr_pages)) != NULL) | 2158 | mn->start_pfn + mn->nr_pages); |
1825 | remove_node_from_stable_tree(stable_node); | ||
1826 | /* fallthrough */ | 2159 | /* fallthrough */ |
1827 | 2160 | ||
1828 | case MEM_CANCEL_OFFLINE: | 2161 | case MEM_CANCEL_OFFLINE: |
2162 | mutex_lock(&ksm_thread_mutex); | ||
2163 | ksm_run &= ~KSM_RUN_OFFLINE; | ||
1829 | mutex_unlock(&ksm_thread_mutex); | 2164 | mutex_unlock(&ksm_thread_mutex); |
2165 | |||
2166 | smp_mb(); /* wake_up_bit advises this */ | ||
2167 | wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE)); | ||
1830 | break; | 2168 | break; |
1831 | } | 2169 | } |
1832 | return NOTIFY_OK; | 2170 | return NOTIFY_OK; |
1833 | } | 2171 | } |
2172 | #else | ||
2173 | static void wait_while_offlining(void) | ||
2174 | { | ||
2175 | } | ||
1834 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 2176 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
1835 | 2177 | ||
1836 | #ifdef CONFIG_SYSFS | 2178 | #ifdef CONFIG_SYSFS |
@@ -1893,7 +2235,7 @@ KSM_ATTR(pages_to_scan); | |||
1893 | static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, | 2235 | static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, |
1894 | char *buf) | 2236 | char *buf) |
1895 | { | 2237 | { |
1896 | return sprintf(buf, "%u\n", ksm_run); | 2238 | return sprintf(buf, "%lu\n", ksm_run); |
1897 | } | 2239 | } |
1898 | 2240 | ||
1899 | static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | 2241 | static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, |
@@ -1916,6 +2258,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1916 | */ | 2258 | */ |
1917 | 2259 | ||
1918 | mutex_lock(&ksm_thread_mutex); | 2260 | mutex_lock(&ksm_thread_mutex); |
2261 | wait_while_offlining(); | ||
1919 | if (ksm_run != flags) { | 2262 | if (ksm_run != flags) { |
1920 | ksm_run = flags; | 2263 | ksm_run = flags; |
1921 | if (flags & KSM_RUN_UNMERGE) { | 2264 | if (flags & KSM_RUN_UNMERGE) { |
@@ -1937,6 +2280,64 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
1937 | } | 2280 | } |
1938 | KSM_ATTR(run); | 2281 | KSM_ATTR(run); |
1939 | 2282 | ||
2283 | #ifdef CONFIG_NUMA | ||
2284 | static ssize_t merge_across_nodes_show(struct kobject *kobj, | ||
2285 | struct kobj_attribute *attr, char *buf) | ||
2286 | { | ||
2287 | return sprintf(buf, "%u\n", ksm_merge_across_nodes); | ||
2288 | } | ||
2289 | |||
2290 | static ssize_t merge_across_nodes_store(struct kobject *kobj, | ||
2291 | struct kobj_attribute *attr, | ||
2292 | const char *buf, size_t count) | ||
2293 | { | ||
2294 | int err; | ||
2295 | unsigned long knob; | ||
2296 | |||
2297 | err = kstrtoul(buf, 10, &knob); | ||
2298 | if (err) | ||
2299 | return err; | ||
2300 | if (knob > 1) | ||
2301 | return -EINVAL; | ||
2302 | |||
2303 | mutex_lock(&ksm_thread_mutex); | ||
2304 | wait_while_offlining(); | ||
2305 | if (ksm_merge_across_nodes != knob) { | ||
2306 | if (ksm_pages_shared || remove_all_stable_nodes()) | ||
2307 | err = -EBUSY; | ||
2308 | else if (root_stable_tree == one_stable_tree) { | ||
2309 | struct rb_root *buf; | ||
2310 | /* | ||
2311 | * This is the first time that we switch away from the | ||
2312 | * default of merging across nodes: must now allocate | ||
2313 | * a buffer to hold as many roots as may be needed. | ||
2314 | * Allocate stable and unstable together: | ||
2315 | * MAXSMP NODES_SHIFT 10 will use 16kB. | ||
2316 | */ | ||
2317 | buf = kcalloc(nr_node_ids + nr_node_ids, | ||
2318 | sizeof(*buf), GFP_KERNEL | __GFP_ZERO); | ||
2319 | /* Let us assume that RB_ROOT is NULL, and that NULL is zero */ | ||
2320 | if (!buf) | ||
2321 | err = -ENOMEM; | ||
2322 | else { | ||
2323 | root_stable_tree = buf; | ||
2324 | root_unstable_tree = buf + nr_node_ids; | ||
2325 | /* Stable tree is empty but not the unstable */ | ||
2326 | root_unstable_tree[0] = one_unstable_tree[0]; | ||
2327 | } | ||
2328 | } | ||
2329 | if (!err) { | ||
2330 | ksm_merge_across_nodes = knob; | ||
2331 | ksm_nr_node_ids = knob ? 1 : nr_node_ids; | ||
2332 | } | ||
2333 | } | ||
2334 | mutex_unlock(&ksm_thread_mutex); | ||
2335 | |||
2336 | return err ? err : count; | ||
2337 | } | ||
2338 | KSM_ATTR(merge_across_nodes); | ||
2339 | #endif | ||
2340 | |||
1940 | static ssize_t pages_shared_show(struct kobject *kobj, | 2341 | static ssize_t pages_shared_show(struct kobject *kobj, |
1941 | struct kobj_attribute *attr, char *buf) | 2342 | struct kobj_attribute *attr, char *buf) |
1942 | { | 2343 | { |
@@ -1991,6 +2392,9 @@ static struct attribute *ksm_attrs[] = { | |||
1991 | &pages_unshared_attr.attr, | 2392 | &pages_unshared_attr.attr, |
1992 | &pages_volatile_attr.attr, | 2393 | &pages_volatile_attr.attr, |
1993 | &full_scans_attr.attr, | 2394 | &full_scans_attr.attr, |
2395 | #ifdef CONFIG_NUMA | ||
2396 | &merge_across_nodes_attr.attr, | ||
2397 | #endif | ||
1994 | NULL, | 2398 | NULL, |
1995 | }; | 2399 | }; |
1996 | 2400 | ||
@@ -2029,10 +2433,7 @@ static int __init ksm_init(void) | |||
2029 | #endif /* CONFIG_SYSFS */ | 2433 | #endif /* CONFIG_SYSFS */ |
2030 | 2434 | ||
2031 | #ifdef CONFIG_MEMORY_HOTREMOVE | 2435 | #ifdef CONFIG_MEMORY_HOTREMOVE |
2032 | /* | 2436 | /* There is no significance to this priority 100 */ |
2033 | * Choose a high priority since the callback takes ksm_thread_mutex: | ||
2034 | * later callbacks could only be taking locks which nest within that. | ||
2035 | */ | ||
2036 | hotplug_memory_notifier(ksm_memory_callback, 100); | 2437 | hotplug_memory_notifier(ksm_memory_callback, 100); |
2037 | #endif | 2438 | #endif |
2038 | return 0; | 2439 | return 0; |
diff --git a/mm/madvise.c b/mm/madvise.c index 03dfa5c7adb3..c58c94b56c3d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -16,6 +16,9 @@ | |||
16 | #include <linux/ksm.h> | 16 | #include <linux/ksm.h> |
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
18 | #include <linux/file.h> | 18 | #include <linux/file.h> |
19 | #include <linux/blkdev.h> | ||
20 | #include <linux/swap.h> | ||
21 | #include <linux/swapops.h> | ||
19 | 22 | ||
20 | /* | 23 | /* |
21 | * Any behaviour which results in changes to the vma->vm_flags needs to | 24 | * Any behaviour which results in changes to the vma->vm_flags needs to |
@@ -131,6 +134,84 @@ out: | |||
131 | return error; | 134 | return error; |
132 | } | 135 | } |
133 | 136 | ||
137 | #ifdef CONFIG_SWAP | ||
138 | static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, | ||
139 | unsigned long end, struct mm_walk *walk) | ||
140 | { | ||
141 | pte_t *orig_pte; | ||
142 | struct vm_area_struct *vma = walk->private; | ||
143 | unsigned long index; | ||
144 | |||
145 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | ||
146 | return 0; | ||
147 | |||
148 | for (index = start; index != end; index += PAGE_SIZE) { | ||
149 | pte_t pte; | ||
150 | swp_entry_t entry; | ||
151 | struct page *page; | ||
152 | spinlock_t *ptl; | ||
153 | |||
154 | orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); | ||
155 | pte = *(orig_pte + ((index - start) / PAGE_SIZE)); | ||
156 | pte_unmap_unlock(orig_pte, ptl); | ||
157 | |||
158 | if (pte_present(pte) || pte_none(pte) || pte_file(pte)) | ||
159 | continue; | ||
160 | entry = pte_to_swp_entry(pte); | ||
161 | if (unlikely(non_swap_entry(entry))) | ||
162 | continue; | ||
163 | |||
164 | page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, | ||
165 | vma, index); | ||
166 | if (page) | ||
167 | page_cache_release(page); | ||
168 | } | ||
169 | |||
170 | return 0; | ||
171 | } | ||
172 | |||
173 | static void force_swapin_readahead(struct vm_area_struct *vma, | ||
174 | unsigned long start, unsigned long end) | ||
175 | { | ||
176 | struct mm_walk walk = { | ||
177 | .mm = vma->vm_mm, | ||
178 | .pmd_entry = swapin_walk_pmd_entry, | ||
179 | .private = vma, | ||
180 | }; | ||
181 | |||
182 | walk_page_range(start, end, &walk); | ||
183 | |||
184 | lru_add_drain(); /* Push any new pages onto the LRU now */ | ||
185 | } | ||
186 | |||
187 | static void force_shm_swapin_readahead(struct vm_area_struct *vma, | ||
188 | unsigned long start, unsigned long end, | ||
189 | struct address_space *mapping) | ||
190 | { | ||
191 | pgoff_t index; | ||
192 | struct page *page; | ||
193 | swp_entry_t swap; | ||
194 | |||
195 | for (; start < end; start += PAGE_SIZE) { | ||
196 | index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | ||
197 | |||
198 | page = find_get_page(mapping, index); | ||
199 | if (!radix_tree_exceptional_entry(page)) { | ||
200 | if (page) | ||
201 | page_cache_release(page); | ||
202 | continue; | ||
203 | } | ||
204 | swap = radix_to_swp_entry(page); | ||
205 | page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, | ||
206 | NULL, 0); | ||
207 | if (page) | ||
208 | page_cache_release(page); | ||
209 | } | ||
210 | |||
211 | lru_add_drain(); /* Push any new pages onto the LRU now */ | ||
212 | } | ||
213 | #endif /* CONFIG_SWAP */ | ||
214 | |||
134 | /* | 215 | /* |
135 | * Schedule all required I/O operations. Do not wait for completion. | 216 | * Schedule all required I/O operations. Do not wait for completion. |
136 | */ | 217 | */ |
@@ -140,6 +221,18 @@ static long madvise_willneed(struct vm_area_struct * vma, | |||
140 | { | 221 | { |
141 | struct file *file = vma->vm_file; | 222 | struct file *file = vma->vm_file; |
142 | 223 | ||
224 | #ifdef CONFIG_SWAP | ||
225 | if (!file || mapping_cap_swap_backed(file->f_mapping)) { | ||
226 | *prev = vma; | ||
227 | if (!file) | ||
228 | force_swapin_readahead(vma, start, end); | ||
229 | else | ||
230 | force_shm_swapin_readahead(vma, start, end, | ||
231 | file->f_mapping); | ||
232 | return 0; | ||
233 | } | ||
234 | #endif | ||
235 | |||
143 | if (!file) | 236 | if (!file) |
144 | return -EBADF; | 237 | return -EBADF; |
145 | 238 | ||
@@ -371,6 +464,7 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
371 | int error = -EINVAL; | 464 | int error = -EINVAL; |
372 | int write; | 465 | int write; |
373 | size_t len; | 466 | size_t len; |
467 | struct blk_plug plug; | ||
374 | 468 | ||
375 | #ifdef CONFIG_MEMORY_FAILURE | 469 | #ifdef CONFIG_MEMORY_FAILURE |
376 | if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) | 470 | if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) |
@@ -410,18 +504,19 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
410 | if (vma && start > vma->vm_start) | 504 | if (vma && start > vma->vm_start) |
411 | prev = vma; | 505 | prev = vma; |
412 | 506 | ||
507 | blk_start_plug(&plug); | ||
413 | for (;;) { | 508 | for (;;) { |
414 | /* Still start < end. */ | 509 | /* Still start < end. */ |
415 | error = -ENOMEM; | 510 | error = -ENOMEM; |
416 | if (!vma) | 511 | if (!vma) |
417 | goto out; | 512 | goto out_plug; |
418 | 513 | ||
419 | /* Here start < (end|vma->vm_end). */ | 514 | /* Here start < (end|vma->vm_end). */ |
420 | if (start < vma->vm_start) { | 515 | if (start < vma->vm_start) { |
421 | unmapped_error = -ENOMEM; | 516 | unmapped_error = -ENOMEM; |
422 | start = vma->vm_start; | 517 | start = vma->vm_start; |
423 | if (start >= end) | 518 | if (start >= end) |
424 | goto out; | 519 | goto out_plug; |
425 | } | 520 | } |
426 | 521 | ||
427 | /* Here vma->vm_start <= start < (end|vma->vm_end) */ | 522 | /* Here vma->vm_start <= start < (end|vma->vm_end) */ |
@@ -432,18 +527,20 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
432 | /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ | 527 | /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ |
433 | error = madvise_vma(vma, &prev, start, tmp, behavior); | 528 | error = madvise_vma(vma, &prev, start, tmp, behavior); |
434 | if (error) | 529 | if (error) |
435 | goto out; | 530 | goto out_plug; |
436 | start = tmp; | 531 | start = tmp; |
437 | if (prev && start < prev->vm_end) | 532 | if (prev && start < prev->vm_end) |
438 | start = prev->vm_end; | 533 | start = prev->vm_end; |
439 | error = unmapped_error; | 534 | error = unmapped_error; |
440 | if (start >= end) | 535 | if (start >= end) |
441 | goto out; | 536 | goto out_plug; |
442 | if (prev) | 537 | if (prev) |
443 | vma = prev->vm_next; | 538 | vma = prev->vm_next; |
444 | else /* madvise_remove dropped mmap_sem */ | 539 | else /* madvise_remove dropped mmap_sem */ |
445 | vma = find_vma(current->mm, start); | 540 | vma = find_vma(current->mm, start); |
446 | } | 541 | } |
542 | out_plug: | ||
543 | blk_finish_plug(&plug); | ||
447 | out: | 544 | out: |
448 | if (write) | 545 | if (write) |
449 | up_write(&current->mm->mmap_sem); | 546 | up_write(&current->mm->mmap_sem); |
diff --git a/mm/memblock.c b/mm/memblock.c index b8d9147e5c08..1bcd9b970564 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -92,9 +92,58 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, | |||
92 | * | 92 | * |
93 | * Find @size free area aligned to @align in the specified range and node. | 93 | * Find @size free area aligned to @align in the specified range and node. |
94 | * | 94 | * |
95 | * If we have CONFIG_HAVE_MEMBLOCK_NODE_MAP defined, we need to check if the | ||
96 | * memory we found is not in hotpluggable ranges. | |
97 | * | ||
95 | * RETURNS: | 98 | * RETURNS: |
96 | * Found address on success, %0 on failure. | 99 | * Found address on success, %0 on failure. |
97 | */ | 100 | */ |
101 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
102 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | ||
103 | phys_addr_t end, phys_addr_t size, | ||
104 | phys_addr_t align, int nid) | ||
105 | { | ||
106 | phys_addr_t this_start, this_end, cand; | ||
107 | u64 i; | ||
108 | int curr = movablemem_map.nr_map - 1; | ||
109 | |||
110 | /* pump up @end */ | ||
111 | if (end == MEMBLOCK_ALLOC_ACCESSIBLE) | ||
112 | end = memblock.current_limit; | ||
113 | |||
114 | /* avoid allocating the first page */ | ||
115 | start = max_t(phys_addr_t, start, PAGE_SIZE); | ||
116 | end = max(start, end); | ||
117 | |||
118 | for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { | ||
119 | this_start = clamp(this_start, start, end); | ||
120 | this_end = clamp(this_end, start, end); | ||
121 | |||
122 | restart: | ||
123 | if (this_end <= this_start || this_end < size) | ||
124 | continue; | ||
125 | |||
126 | for (; curr >= 0; curr--) { | ||
127 | if ((movablemem_map.map[curr].start_pfn << PAGE_SHIFT) | ||
128 | < this_end) | ||
129 | break; | ||
130 | } | ||
131 | |||
132 | cand = round_down(this_end - size, align); | ||
133 | if (curr >= 0 && | ||
134 | cand < movablemem_map.map[curr].end_pfn << PAGE_SHIFT) { | ||
135 | this_end = movablemem_map.map[curr].start_pfn | ||
136 | << PAGE_SHIFT; | ||
137 | goto restart; | ||
138 | } | ||
139 | |||
140 | if (cand >= this_start) | ||
141 | return cand; | ||
142 | } | ||
143 | |||
144 | return 0; | ||
145 | } | ||
146 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | ||
98 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | 147 | phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, |
99 | phys_addr_t end, phys_addr_t size, | 148 | phys_addr_t end, phys_addr_t size, |
100 | phys_addr_t align, int nid) | 149 | phys_addr_t align, int nid) |
@@ -123,6 +172,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, | |||
123 | } | 172 | } |
124 | return 0; | 173 | return 0; |
125 | } | 174 | } |
175 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | ||
126 | 176 | ||
127 | /** | 177 | /** |
128 | * memblock_find_in_range - find free area in given range | 178 | * memblock_find_in_range - find free area in given range |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fbb60b103e64..53b8201b31eb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -120,6 +120,14 @@ static const char * const mem_cgroup_events_names[] = { | |||
120 | "pgmajfault", | 120 | "pgmajfault", |
121 | }; | 121 | }; |
122 | 122 | ||
123 | static const char * const mem_cgroup_lru_names[] = { | ||
124 | "inactive_anon", | ||
125 | "active_anon", | ||
126 | "inactive_file", | ||
127 | "active_file", | ||
128 | "unevictable", | ||
129 | }; | ||
130 | |||
123 | /* | 131 | /* |
124 | * Per memcg event counter is incremented at every pagein/pageout. With THP, | 132 | * Per memcg event counter is incremented at every pagein/pageout. With THP, |
125 | * it will be incremented by the number of pages. This counter is used for | 133 | * it will be incremented by the number of pages. This counter is used for |
@@ -172,7 +180,7 @@ struct mem_cgroup_per_node { | |||
172 | }; | 180 | }; |
173 | 181 | ||
174 | struct mem_cgroup_lru_info { | 182 | struct mem_cgroup_lru_info { |
175 | struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; | 183 | struct mem_cgroup_per_node *nodeinfo[0]; |
176 | }; | 184 | }; |
177 | 185 | ||
178 | /* | 186 | /* |
@@ -276,17 +284,6 @@ struct mem_cgroup { | |||
276 | */ | 284 | */ |
277 | struct res_counter kmem; | 285 | struct res_counter kmem; |
278 | /* | 286 | /* |
279 | * Per cgroup active and inactive list, similar to the | ||
280 | * per zone LRU lists. | ||
281 | */ | ||
282 | struct mem_cgroup_lru_info info; | ||
283 | int last_scanned_node; | ||
284 | #if MAX_NUMNODES > 1 | ||
285 | nodemask_t scan_nodes; | ||
286 | atomic_t numainfo_events; | ||
287 | atomic_t numainfo_updating; | ||
288 | #endif | ||
289 | /* | ||
290 | * Should the accounting and control be hierarchical, per subtree? | 287 | * Should the accounting and control be hierarchical, per subtree? |
291 | */ | 288 | */ |
292 | bool use_hierarchy; | 289 | bool use_hierarchy; |
@@ -349,8 +346,29 @@ struct mem_cgroup { | |||
349 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ | 346 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ |
350 | int kmemcg_id; | 347 | int kmemcg_id; |
351 | #endif | 348 | #endif |
349 | |||
350 | int last_scanned_node; | ||
351 | #if MAX_NUMNODES > 1 | ||
352 | nodemask_t scan_nodes; | ||
353 | atomic_t numainfo_events; | ||
354 | atomic_t numainfo_updating; | ||
355 | #endif | ||
356 | /* | ||
357 | * Per cgroup active and inactive list, similar to the | ||
358 | * per zone LRU lists. | ||
359 | * | ||
360 | * WARNING: This has to be the last element of the struct. Don't | ||
361 | * add new fields after this point. | ||
362 | */ | ||
363 | struct mem_cgroup_lru_info info; | ||
352 | }; | 364 | }; |
353 | 365 | ||
366 | static size_t memcg_size(void) | ||
367 | { | ||
368 | return sizeof(struct mem_cgroup) + | ||
369 | nr_node_ids * sizeof(struct mem_cgroup_per_node); | ||
370 | } | ||
371 | |||
354 | /* internal only representation about the status of kmem accounting. */ | 372 | /* internal only representation about the status of kmem accounting. */ |
355 | enum { | 373 | enum { |
356 | KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ | 374 | KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ |
@@ -398,8 +416,8 @@ static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) | |||
398 | 416 | ||
399 | /* Stuffs for move charges at task migration. */ | 417 | /* Stuffs for move charges at task migration. */ |
400 | /* | 418 | /* |
401 | * Types of charges to be moved. "move_charge_at_immigrate" is treated as a | 419 | * Types of charges to be moved. "move_charge_at_immigrate" and |
402 | * left-shifted bitmap of these types. | 420 | * "immigrate_flags" are treated as a left-shifted bitmap of these types. |
403 | */ | 421 | */ |
404 | enum move_type { | 422 | enum move_type { |
405 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | 423 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ |
@@ -412,6 +430,7 @@ static struct move_charge_struct { | |||
412 | spinlock_t lock; /* for from, to */ | 430 | spinlock_t lock; /* for from, to */ |
413 | struct mem_cgroup *from; | 431 | struct mem_cgroup *from; |
414 | struct mem_cgroup *to; | 432 | struct mem_cgroup *to; |
433 | unsigned long immigrate_flags; | ||
415 | unsigned long precharge; | 434 | unsigned long precharge; |
416 | unsigned long moved_charge; | 435 | unsigned long moved_charge; |
417 | unsigned long moved_swap; | 436 | unsigned long moved_swap; |
@@ -424,14 +443,12 @@ static struct move_charge_struct { | |||
424 | 443 | ||
425 | static bool move_anon(void) | 444 | static bool move_anon(void) |
426 | { | 445 | { |
427 | return test_bit(MOVE_CHARGE_TYPE_ANON, | 446 | return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); |
428 | &mc.to->move_charge_at_immigrate); | ||
429 | } | 447 | } |
430 | 448 | ||
431 | static bool move_file(void) | 449 | static bool move_file(void) |
432 | { | 450 | { |
433 | return test_bit(MOVE_CHARGE_TYPE_FILE, | 451 | return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); |
434 | &mc.to->move_charge_at_immigrate); | ||
435 | } | 452 | } |
436 | 453 | ||
437 | /* | 454 | /* |
@@ -471,6 +488,13 @@ enum res_type { | |||
471 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 | 488 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 |
472 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) | 489 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) |
473 | 490 | ||
491 | /* | ||
492 | * The memcg_create_mutex will be held whenever a new cgroup is created. | ||
493 | * As a consequence, any change that needs to protect against new child cgroups | ||
494 | * appearing has to hold it as well. | ||
495 | */ | ||
496 | static DEFINE_MUTEX(memcg_create_mutex); | ||
497 | |||
474 | static void mem_cgroup_get(struct mem_cgroup *memcg); | 498 | static void mem_cgroup_get(struct mem_cgroup *memcg); |
475 | static void mem_cgroup_put(struct mem_cgroup *memcg); | 499 | static void mem_cgroup_put(struct mem_cgroup *memcg); |
476 | 500 | ||
@@ -627,6 +651,7 @@ static void drain_all_stock_async(struct mem_cgroup *memcg); | |||
627 | static struct mem_cgroup_per_zone * | 651 | static struct mem_cgroup_per_zone * |
628 | mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) | 652 | mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) |
629 | { | 653 | { |
654 | VM_BUG_ON((unsigned)nid >= nr_node_ids); | ||
630 | return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; | 655 | return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; |
631 | } | 656 | } |
632 | 657 | ||
@@ -1371,17 +1396,6 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | |||
1371 | return inactive * inactive_ratio < active; | 1396 | return inactive * inactive_ratio < active; |
1372 | } | 1397 | } |
1373 | 1398 | ||
1374 | int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) | ||
1375 | { | ||
1376 | unsigned long active; | ||
1377 | unsigned long inactive; | ||
1378 | |||
1379 | inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE); | ||
1380 | active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE); | ||
1381 | |||
1382 | return (active > inactive); | ||
1383 | } | ||
1384 | |||
1385 | #define mem_cgroup_from_res_counter(counter, member) \ | 1399 | #define mem_cgroup_from_res_counter(counter, member) \ |
1386 | container_of(counter, struct mem_cgroup, member) | 1400 | container_of(counter, struct mem_cgroup, member) |
1387 | 1401 | ||
@@ -1524,8 +1538,9 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, | |||
1524 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | 1538 | spin_unlock_irqrestore(&memcg->move_lock, *flags); |
1525 | } | 1539 | } |
1526 | 1540 | ||
1541 | #define K(x) ((x) << (PAGE_SHIFT-10)) | ||
1527 | /** | 1542 | /** |
1528 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. | 1543 | * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. |
1529 | * @memcg: The memory cgroup that went over limit | 1544 | * @memcg: The memory cgroup that went over limit |
1530 | * @p: Task that is going to be killed | 1545 | * @p: Task that is going to be killed |
1531 | * | 1546 | * |
@@ -1543,8 +1558,10 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1543 | */ | 1558 | */ |
1544 | static char memcg_name[PATH_MAX]; | 1559 | static char memcg_name[PATH_MAX]; |
1545 | int ret; | 1560 | int ret; |
1561 | struct mem_cgroup *iter; | ||
1562 | unsigned int i; | ||
1546 | 1563 | ||
1547 | if (!memcg || !p) | 1564 | if (!p) |
1548 | return; | 1565 | return; |
1549 | 1566 | ||
1550 | rcu_read_lock(); | 1567 | rcu_read_lock(); |
@@ -1563,7 +1580,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1563 | } | 1580 | } |
1564 | rcu_read_unlock(); | 1581 | rcu_read_unlock(); |
1565 | 1582 | ||
1566 | printk(KERN_INFO "Task in %s killed", memcg_name); | 1583 | pr_info("Task in %s killed", memcg_name); |
1567 | 1584 | ||
1568 | rcu_read_lock(); | 1585 | rcu_read_lock(); |
1569 | ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); | 1586 | ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); |
@@ -1576,22 +1593,45 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1576 | /* | 1593 | /* |
1577 | * Continues from above, so we don't need a KERN_ level | 1594 | * Continues from above, so we don't need a KERN_ level |
1578 | */ | 1595 | */ |
1579 | printk(KERN_CONT " as a result of limit of %s\n", memcg_name); | 1596 | pr_cont(" as a result of limit of %s\n", memcg_name); |
1580 | done: | 1597 | done: |
1581 | 1598 | ||
1582 | printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", | 1599 | pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", |
1583 | res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, | 1600 | res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, |
1584 | res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, | 1601 | res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, |
1585 | res_counter_read_u64(&memcg->res, RES_FAILCNT)); | 1602 | res_counter_read_u64(&memcg->res, RES_FAILCNT)); |
1586 | printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " | 1603 | pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", |
1587 | "failcnt %llu\n", | ||
1588 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, | 1604 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, |
1589 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, | 1605 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, |
1590 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); | 1606 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); |
1591 | printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", | 1607 | pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", |
1592 | res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, | 1608 | res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, |
1593 | res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, | 1609 | res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, |
1594 | res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); | 1610 | res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); |
1611 | |||
1612 | for_each_mem_cgroup_tree(iter, memcg) { | ||
1613 | pr_info("Memory cgroup stats"); | ||
1614 | |||
1615 | rcu_read_lock(); | ||
1616 | ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX); | ||
1617 | if (!ret) | ||
1618 | pr_cont(" for %s", memcg_name); | ||
1619 | rcu_read_unlock(); | ||
1620 | pr_cont(":"); | ||
1621 | |||
1622 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | ||
1623 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) | ||
1624 | continue; | ||
1625 | pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], | ||
1626 | K(mem_cgroup_read_stat(iter, i))); | ||
1627 | } | ||
1628 | |||
1629 | for (i = 0; i < NR_LRU_LISTS; i++) | ||
1630 | pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], | ||
1631 | K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); | ||
1632 | |||
1633 | pr_cont("\n"); | ||
1634 | } | ||
1595 | } | 1635 | } |
1596 | 1636 | ||
1597 | /* | 1637 | /* |
@@ -2256,6 +2296,17 @@ static void drain_local_stock(struct work_struct *dummy) | |||
2256 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); | 2296 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); |
2257 | } | 2297 | } |
2258 | 2298 | ||
2299 | static void __init memcg_stock_init(void) | ||
2300 | { | ||
2301 | int cpu; | ||
2302 | |||
2303 | for_each_possible_cpu(cpu) { | ||
2304 | struct memcg_stock_pcp *stock = | ||
2305 | &per_cpu(memcg_stock, cpu); | ||
2306 | INIT_WORK(&stock->work, drain_local_stock); | ||
2307 | } | ||
2308 | } | ||
2309 | |||
2259 | /* | 2310 | /* |
2260 | * Cache charges(val) which is from res_counter, to local per_cpu area. | 2311 | * Cache charges(val) which is from res_counter, to local per_cpu area. |
2261 | * This will be consumed by consume_stock() function, later. | 2312 | * This will be consumed by consume_stock() function, later. |
@@ -4391,8 +4442,8 @@ void mem_cgroup_print_bad_page(struct page *page) | |||
4391 | 4442 | ||
4392 | pc = lookup_page_cgroup_used(page); | 4443 | pc = lookup_page_cgroup_used(page); |
4393 | if (pc) { | 4444 | if (pc) { |
4394 | printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", | 4445 | pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", |
4395 | pc, pc->flags, pc->mem_cgroup); | 4446 | pc, pc->flags, pc->mem_cgroup); |
4396 | } | 4447 | } |
4397 | } | 4448 | } |
4398 | #endif | 4449 | #endif |
@@ -4719,6 +4770,33 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) | |||
4719 | } | 4770 | } |
4720 | 4771 | ||
4721 | /* | 4772 | /* |
4773 | * This mainly exists for tests during the setting of use_hierarchy. | |
4774 | * Since this is the very setting we are changing, the current hierarchy value | ||
4775 | * is meaningless | ||
4776 | */ | ||
4777 | static inline bool __memcg_has_children(struct mem_cgroup *memcg) | ||
4778 | { | ||
4779 | struct cgroup *pos; | ||
4780 | |||
4781 | /* bounce at first found */ | ||
4782 | cgroup_for_each_child(pos, memcg->css.cgroup) | ||
4783 | return true; | ||
4784 | return false; | ||
4785 | } | ||
4786 | |||
4787 | /* | ||
4788 | * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed | ||
4789 | * to be already dead (as in mem_cgroup_force_empty, for instance). This is | ||
4790 | * different from mem_cgroup_count_children(), in the sense that we don't really care how | |
4791 | * many children we have; we only need to know if we have any. It also counts | ||
4792 | * any memcg without hierarchy as infertile. | ||
4793 | */ | ||
4794 | static inline bool memcg_has_children(struct mem_cgroup *memcg) | ||
4795 | { | ||
4796 | return memcg->use_hierarchy && __memcg_has_children(memcg); | ||
4797 | } | ||
4798 | |||
4799 | /* | ||
4722 | * Reclaims as many pages from the given memcg as possible and moves | 4800 | * Reclaims as many pages from the given memcg as possible and moves |
4723 | * the rest to the parent. | 4801 | * the rest to the parent. |
4724 | * | 4802 | * |
@@ -4788,7 +4866,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
4788 | if (parent) | 4866 | if (parent) |
4789 | parent_memcg = mem_cgroup_from_cont(parent); | 4867 | parent_memcg = mem_cgroup_from_cont(parent); |
4790 | 4868 | ||
4791 | cgroup_lock(); | 4869 | mutex_lock(&memcg_create_mutex); |
4792 | 4870 | ||
4793 | if (memcg->use_hierarchy == val) | 4871 | if (memcg->use_hierarchy == val) |
4794 | goto out; | 4872 | goto out; |
@@ -4803,7 +4881,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
4803 | */ | 4881 | */ |
4804 | if ((!parent_memcg || !parent_memcg->use_hierarchy) && | 4882 | if ((!parent_memcg || !parent_memcg->use_hierarchy) && |
4805 | (val == 1 || val == 0)) { | 4883 | (val == 1 || val == 0)) { |
4806 | if (list_empty(&cont->children)) | 4884 | if (!__memcg_has_children(memcg)) |
4807 | memcg->use_hierarchy = val; | 4885 | memcg->use_hierarchy = val; |
4808 | else | 4886 | else |
4809 | retval = -EBUSY; | 4887 | retval = -EBUSY; |
@@ -4811,7 +4889,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
4811 | retval = -EINVAL; | 4889 | retval = -EINVAL; |
4812 | 4890 | ||
4813 | out: | 4891 | out: |
4814 | cgroup_unlock(); | 4892 | mutex_unlock(&memcg_create_mutex); |
4815 | 4893 | ||
4816 | return retval; | 4894 | return retval; |
4817 | } | 4895 | } |
@@ -4896,8 +4974,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | |||
4896 | { | 4974 | { |
4897 | int ret = -EINVAL; | 4975 | int ret = -EINVAL; |
4898 | #ifdef CONFIG_MEMCG_KMEM | 4976 | #ifdef CONFIG_MEMCG_KMEM |
4899 | bool must_inc_static_branch = false; | ||
4900 | |||
4901 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 4977 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4902 | /* | 4978 | /* |
4903 | * For simplicity, we won't allow this to be disabled. It also can't | 4979 | * For simplicity, we won't allow this to be disabled. It also can't |
@@ -4910,18 +4986,11 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | |||
4910 | * | 4986 | * |
4911 | * After it first became limited, changes in the value of the limit are | 4987 | * After it first became limited, changes in the value of the limit are |
4912 | * of course permitted. | 4988 | * of course permitted. |
4913 | * | ||
4914 | * Taking the cgroup_lock is really offensive, but it is so far the only | ||
4915 | * way to guarantee that no children will appear. There are plenty of | ||
4916 | * other offenders, and they should all go away. Fine grained locking | ||
4917 | * is probably the way to go here. When we are fully hierarchical, we | ||
4918 | * can also get rid of the use_hierarchy check. | ||
4919 | */ | 4989 | */ |
4920 | cgroup_lock(); | 4990 | mutex_lock(&memcg_create_mutex); |
4921 | mutex_lock(&set_limit_mutex); | 4991 | mutex_lock(&set_limit_mutex); |
4922 | if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { | 4992 | if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { |
4923 | if (cgroup_task_count(cont) || (memcg->use_hierarchy && | 4993 | if (cgroup_task_count(cont) || memcg_has_children(memcg)) { |
4924 | !list_empty(&cont->children))) { | ||
4925 | ret = -EBUSY; | 4994 | ret = -EBUSY; |
4926 | goto out; | 4995 | goto out; |
4927 | } | 4996 | } |
@@ -4933,7 +5002,13 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | |||
4933 | res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); | 5002 | res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); |
4934 | goto out; | 5003 | goto out; |
4935 | } | 5004 | } |
4936 | must_inc_static_branch = true; | 5005 | static_key_slow_inc(&memcg_kmem_enabled_key); |
5006 | /* | ||
5007 | * setting the active bit after the inc will guarantee no one | ||
5008 | * starts accounting before all call sites are patched | ||
5009 | */ | ||
5010 | memcg_kmem_set_active(memcg); | ||
5011 | |||
4937 | /* | 5012 | /* |
4938 | * kmem charges can outlive the cgroup. In the case of slab | 5013 | * kmem charges can outlive the cgroup. In the case of slab |
4939 | * pages, for instance, a page contains objects from various | 5014 | * pages, for instance, a page contains objects from various |
@@ -4945,32 +5020,12 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | |||
4945 | ret = res_counter_set_limit(&memcg->kmem, val); | 5020 | ret = res_counter_set_limit(&memcg->kmem, val); |
4946 | out: | 5021 | out: |
4947 | mutex_unlock(&set_limit_mutex); | 5022 | mutex_unlock(&set_limit_mutex); |
4948 | cgroup_unlock(); | 5023 | mutex_unlock(&memcg_create_mutex); |
4949 | |||
4950 | /* | ||
4951 | * We are by now familiar with the fact that we can't inc the static | ||
4952 | * branch inside cgroup_lock. See disarm functions for details. A | ||
4953 | * worker here is overkill, but also wrong: After the limit is set, we | ||
4954 | * must start accounting right away. Since this operation can't fail, | ||
4955 | * we can safely defer it to here - no rollback will be needed. | ||
4956 | * | ||
4957 | * The boolean used to control this is also safe, because | ||
4958 | * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be | ||
4959 | * able to set it to true; | ||
4960 | */ | ||
4961 | if (must_inc_static_branch) { | ||
4962 | static_key_slow_inc(&memcg_kmem_enabled_key); | ||
4963 | /* | ||
4964 | * setting the active bit after the inc will guarantee no one | ||
4965 | * starts accounting before all call sites are patched | ||
4966 | */ | ||
4967 | memcg_kmem_set_active(memcg); | ||
4968 | } | ||
4969 | |||
4970 | #endif | 5024 | #endif |
4971 | return ret; | 5025 | return ret; |
4972 | } | 5026 | } |
4973 | 5027 | ||
5028 | #ifdef CONFIG_MEMCG_KMEM | ||
4974 | static int memcg_propagate_kmem(struct mem_cgroup *memcg) | 5029 | static int memcg_propagate_kmem(struct mem_cgroup *memcg) |
4975 | { | 5030 | { |
4976 | int ret = 0; | 5031 | int ret = 0; |
@@ -4979,7 +5034,6 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) | |||
4979 | goto out; | 5034 | goto out; |
4980 | 5035 | ||
4981 | memcg->kmem_account_flags = parent->kmem_account_flags; | 5036 | memcg->kmem_account_flags = parent->kmem_account_flags; |
4982 | #ifdef CONFIG_MEMCG_KMEM | ||
4983 | /* | 5037 | /* |
4984 | * When that happen, we need to disable the static branch only on those | 5038 | * When that happen, we need to disable the static branch only on those |
4985 | * memcgs that enabled it. To achieve this, we would be forced to | 5039 | * memcgs that enabled it. To achieve this, we would be forced to |
@@ -5005,10 +5059,10 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) | |||
5005 | mutex_lock(&set_limit_mutex); | 5059 | mutex_lock(&set_limit_mutex); |
5006 | ret = memcg_update_cache_sizes(memcg); | 5060 | ret = memcg_update_cache_sizes(memcg); |
5007 | mutex_unlock(&set_limit_mutex); | 5061 | mutex_unlock(&set_limit_mutex); |
5008 | #endif | ||
5009 | out: | 5062 | out: |
5010 | return ret; | 5063 | return ret; |
5011 | } | 5064 | } |
5065 | #endif /* CONFIG_MEMCG_KMEM */ | ||
5012 | 5066 | ||
5013 | /* | 5067 | /* |
5014 | * The user of this function is... | 5068 | * The user of this function is... |
@@ -5148,15 +5202,14 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | |||
5148 | 5202 | ||
5149 | if (val >= (1 << NR_MOVE_TYPE)) | 5203 | if (val >= (1 << NR_MOVE_TYPE)) |
5150 | return -EINVAL; | 5204 | return -EINVAL; |
5205 | |||
5151 | /* | 5206 | /* |
5152 | * We check this value several times in both in can_attach() and | 5207 | * No kind of locking is needed in here, because ->can_attach() will |
5153 | * attach(), so we need cgroup lock to prevent this value from being | 5208 | * check this value once in the beginning of the process, and then carry |
5154 | * inconsistent. | 5209 | * on with stale data. This means that changes to this value will only |
5210 | * affect task migrations starting after the change. | ||
5155 | */ | 5211 | */ |
5156 | cgroup_lock(); | ||
5157 | memcg->move_charge_at_immigrate = val; | 5212 | memcg->move_charge_at_immigrate = val; |
5158 | cgroup_unlock(); | ||
5159 | |||
5160 | return 0; | 5213 | return 0; |
5161 | } | 5214 | } |
5162 | #else | 5215 | #else |
@@ -5214,14 +5267,6 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
5214 | } | 5267 | } |
5215 | #endif /* CONFIG_NUMA */ | 5268 | #endif /* CONFIG_NUMA */ |
5216 | 5269 | ||
5217 | static const char * const mem_cgroup_lru_names[] = { | ||
5218 | "inactive_anon", | ||
5219 | "active_anon", | ||
5220 | "inactive_file", | ||
5221 | "active_file", | ||
5222 | "unevictable", | ||
5223 | }; | ||
5224 | |||
5225 | static inline void mem_cgroup_lru_names_not_uptodate(void) | 5270 | static inline void mem_cgroup_lru_names_not_uptodate(void) |
5226 | { | 5271 | { |
5227 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 5272 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); |
@@ -5335,18 +5380,17 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | |||
5335 | 5380 | ||
5336 | parent = mem_cgroup_from_cont(cgrp->parent); | 5381 | parent = mem_cgroup_from_cont(cgrp->parent); |
5337 | 5382 | ||
5338 | cgroup_lock(); | 5383 | mutex_lock(&memcg_create_mutex); |
5339 | 5384 | ||
5340 | /* If under hierarchy, only empty-root can set this value */ | 5385 | /* If under hierarchy, only empty-root can set this value */ |
5341 | if ((parent->use_hierarchy) || | 5386 | if ((parent->use_hierarchy) || memcg_has_children(memcg)) { |
5342 | (memcg->use_hierarchy && !list_empty(&cgrp->children))) { | 5387 | mutex_unlock(&memcg_create_mutex); |
5343 | cgroup_unlock(); | ||
5344 | return -EINVAL; | 5388 | return -EINVAL; |
5345 | } | 5389 | } |
5346 | 5390 | ||
5347 | memcg->swappiness = val; | 5391 | memcg->swappiness = val; |
5348 | 5392 | ||
5349 | cgroup_unlock(); | 5393 | mutex_unlock(&memcg_create_mutex); |
5350 | 5394 | ||
5351 | return 0; | 5395 | return 0; |
5352 | } | 5396 | } |
@@ -5672,17 +5716,16 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | |||
5672 | 5716 | ||
5673 | parent = mem_cgroup_from_cont(cgrp->parent); | 5717 | parent = mem_cgroup_from_cont(cgrp->parent); |
5674 | 5718 | ||
5675 | cgroup_lock(); | 5719 | mutex_lock(&memcg_create_mutex); |
5676 | /* oom-kill-disable is a flag for subhierarchy. */ | 5720 | /* oom-kill-disable is a flag for subhierarchy. */ |
5677 | if ((parent->use_hierarchy) || | 5721 | if ((parent->use_hierarchy) || memcg_has_children(memcg)) { |
5678 | (memcg->use_hierarchy && !list_empty(&cgrp->children))) { | 5722 | mutex_unlock(&memcg_create_mutex); |
5679 | cgroup_unlock(); | ||
5680 | return -EINVAL; | 5723 | return -EINVAL; |
5681 | } | 5724 | } |
5682 | memcg->oom_kill_disable = val; | 5725 | memcg->oom_kill_disable = val; |
5683 | if (!val) | 5726 | if (!val) |
5684 | memcg_oom_recover(memcg); | 5727 | memcg_oom_recover(memcg); |
5685 | cgroup_unlock(); | 5728 | mutex_unlock(&memcg_create_mutex); |
5686 | return 0; | 5729 | return 0; |
5687 | } | 5730 | } |
5688 | 5731 | ||
@@ -5797,33 +5840,6 @@ static struct cftype mem_cgroup_files[] = { | |||
5797 | .read_seq_string = memcg_numa_stat_show, | 5840 | .read_seq_string = memcg_numa_stat_show, |
5798 | }, | 5841 | }, |
5799 | #endif | 5842 | #endif |
5800 | #ifdef CONFIG_MEMCG_SWAP | ||
5801 | { | ||
5802 | .name = "memsw.usage_in_bytes", | ||
5803 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
5804 | .read = mem_cgroup_read, | ||
5805 | .register_event = mem_cgroup_usage_register_event, | ||
5806 | .unregister_event = mem_cgroup_usage_unregister_event, | ||
5807 | }, | ||
5808 | { | ||
5809 | .name = "memsw.max_usage_in_bytes", | ||
5810 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
5811 | .trigger = mem_cgroup_reset, | ||
5812 | .read = mem_cgroup_read, | ||
5813 | }, | ||
5814 | { | ||
5815 | .name = "memsw.limit_in_bytes", | ||
5816 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
5817 | .write_string = mem_cgroup_write, | ||
5818 | .read = mem_cgroup_read, | ||
5819 | }, | ||
5820 | { | ||
5821 | .name = "memsw.failcnt", | ||
5822 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
5823 | .trigger = mem_cgroup_reset, | ||
5824 | .read = mem_cgroup_read, | ||
5825 | }, | ||
5826 | #endif | ||
5827 | #ifdef CONFIG_MEMCG_KMEM | 5843 | #ifdef CONFIG_MEMCG_KMEM |
5828 | { | 5844 | { |
5829 | .name = "kmem.limit_in_bytes", | 5845 | .name = "kmem.limit_in_bytes", |
@@ -5858,6 +5874,36 @@ static struct cftype mem_cgroup_files[] = { | |||
5858 | { }, /* terminate */ | 5874 | { }, /* terminate */ |
5859 | }; | 5875 | }; |
5860 | 5876 | ||
5877 | #ifdef CONFIG_MEMCG_SWAP | ||
5878 | static struct cftype memsw_cgroup_files[] = { | ||
5879 | { | ||
5880 | .name = "memsw.usage_in_bytes", | ||
5881 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
5882 | .read = mem_cgroup_read, | ||
5883 | .register_event = mem_cgroup_usage_register_event, | ||
5884 | .unregister_event = mem_cgroup_usage_unregister_event, | ||
5885 | }, | ||
5886 | { | ||
5887 | .name = "memsw.max_usage_in_bytes", | ||
5888 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
5889 | .trigger = mem_cgroup_reset, | ||
5890 | .read = mem_cgroup_read, | ||
5891 | }, | ||
5892 | { | ||
5893 | .name = "memsw.limit_in_bytes", | ||
5894 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
5895 | .write_string = mem_cgroup_write, | ||
5896 | .read = mem_cgroup_read, | ||
5897 | }, | ||
5898 | { | ||
5899 | .name = "memsw.failcnt", | ||
5900 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
5901 | .trigger = mem_cgroup_reset, | ||
5902 | .read = mem_cgroup_read, | ||
5903 | }, | ||
5904 | { }, /* terminate */ | ||
5905 | }; | ||
5906 | #endif | ||
5861 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | 5907 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
5862 | { | 5908 | { |
5863 | struct mem_cgroup_per_node *pn; | 5909 | struct mem_cgroup_per_node *pn; |
@@ -5896,9 +5942,9 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
5896 | static struct mem_cgroup *mem_cgroup_alloc(void) | 5942 | static struct mem_cgroup *mem_cgroup_alloc(void) |
5897 | { | 5943 | { |
5898 | struct mem_cgroup *memcg; | 5944 | struct mem_cgroup *memcg; |
5899 | int size = sizeof(struct mem_cgroup); | 5945 | size_t size = memcg_size(); |
5900 | 5946 | ||
5901 | /* Can be very big if MAX_NUMNODES is very big */ | 5947 | /* Can be very big if nr_node_ids is very big */ |
5902 | if (size < PAGE_SIZE) | 5948 | if (size < PAGE_SIZE) |
5903 | memcg = kzalloc(size, GFP_KERNEL); | 5949 | memcg = kzalloc(size, GFP_KERNEL); |
5904 | else | 5950 | else |
@@ -5935,7 +5981,7 @@ out_free: | |||
5935 | static void __mem_cgroup_free(struct mem_cgroup *memcg) | 5981 | static void __mem_cgroup_free(struct mem_cgroup *memcg) |
5936 | { | 5982 | { |
5937 | int node; | 5983 | int node; |
5938 | int size = sizeof(struct mem_cgroup); | 5984 | size_t size = memcg_size(); |
5939 | 5985 | ||
5940 | mem_cgroup_remove_from_trees(memcg); | 5986 | mem_cgroup_remove_from_trees(memcg); |
5941 | free_css_id(&mem_cgroup_subsys, &memcg->css); | 5987 | free_css_id(&mem_cgroup_subsys, &memcg->css); |
@@ -6017,19 +6063,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
6017 | } | 6063 | } |
6018 | EXPORT_SYMBOL(parent_mem_cgroup); | 6064 | EXPORT_SYMBOL(parent_mem_cgroup); |
6019 | 6065 | ||
6020 | #ifdef CONFIG_MEMCG_SWAP | 6066 | static void __init mem_cgroup_soft_limit_tree_init(void) |
6021 | static void __init enable_swap_cgroup(void) | ||
6022 | { | ||
6023 | if (!mem_cgroup_disabled() && really_do_swap_account) | ||
6024 | do_swap_account = 1; | ||
6025 | } | ||
6026 | #else | ||
6027 | static void __init enable_swap_cgroup(void) | ||
6028 | { | ||
6029 | } | ||
6030 | #endif | ||
6031 | |||
6032 | static int mem_cgroup_soft_limit_tree_init(void) | ||
6033 | { | 6067 | { |
6034 | struct mem_cgroup_tree_per_node *rtpn; | 6068 | struct mem_cgroup_tree_per_node *rtpn; |
6035 | struct mem_cgroup_tree_per_zone *rtpz; | 6069 | struct mem_cgroup_tree_per_zone *rtpz; |
@@ -6040,8 +6074,7 @@ static int mem_cgroup_soft_limit_tree_init(void) | |||
6040 | if (!node_state(node, N_NORMAL_MEMORY)) | 6074 | if (!node_state(node, N_NORMAL_MEMORY)) |
6041 | tmp = -1; | 6075 | tmp = -1; |
6042 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | 6076 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); |
6043 | if (!rtpn) | 6077 | BUG_ON(!rtpn); |
6044 | goto err_cleanup; | ||
6045 | 6078 | ||
6046 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | 6079 | soft_limit_tree.rb_tree_per_node[node] = rtpn; |
6047 | 6080 | ||
@@ -6051,23 +6084,12 @@ static int mem_cgroup_soft_limit_tree_init(void) | |||
6051 | spin_lock_init(&rtpz->lock); | 6084 | spin_lock_init(&rtpz->lock); |
6052 | } | 6085 | } |
6053 | } | 6086 | } |
6054 | return 0; | ||
6055 | |||
6056 | err_cleanup: | ||
6057 | for_each_node(node) { | ||
6058 | if (!soft_limit_tree.rb_tree_per_node[node]) | ||
6059 | break; | ||
6060 | kfree(soft_limit_tree.rb_tree_per_node[node]); | ||
6061 | soft_limit_tree.rb_tree_per_node[node] = NULL; | ||
6062 | } | ||
6063 | return 1; | ||
6064 | |||
6065 | } | 6087 | } |
6066 | 6088 | ||
6067 | static struct cgroup_subsys_state * __ref | 6089 | static struct cgroup_subsys_state * __ref |
6068 | mem_cgroup_css_alloc(struct cgroup *cont) | 6090 | mem_cgroup_css_alloc(struct cgroup *cont) |
6069 | { | 6091 | { |
6070 | struct mem_cgroup *memcg, *parent; | 6092 | struct mem_cgroup *memcg; |
6071 | long error = -ENOMEM; | 6093 | long error = -ENOMEM; |
6072 | int node; | 6094 | int node; |
6073 | 6095 | ||
@@ -6081,24 +6103,44 @@ mem_cgroup_css_alloc(struct cgroup *cont) | |||
6081 | 6103 | ||
6082 | /* root ? */ | 6104 | /* root ? */ |
6083 | if (cont->parent == NULL) { | 6105 | if (cont->parent == NULL) { |
6084 | int cpu; | ||
6085 | enable_swap_cgroup(); | ||
6086 | parent = NULL; | ||
6087 | if (mem_cgroup_soft_limit_tree_init()) | ||
6088 | goto free_out; | ||
6089 | root_mem_cgroup = memcg; | 6106 | root_mem_cgroup = memcg; |
6090 | for_each_possible_cpu(cpu) { | 6107 | res_counter_init(&memcg->res, NULL); |
6091 | struct memcg_stock_pcp *stock = | 6108 | res_counter_init(&memcg->memsw, NULL); |
6092 | &per_cpu(memcg_stock, cpu); | 6109 | res_counter_init(&memcg->kmem, NULL); |
6093 | INIT_WORK(&stock->work, drain_local_stock); | ||
6094 | } | ||
6095 | } else { | ||
6096 | parent = mem_cgroup_from_cont(cont->parent); | ||
6097 | memcg->use_hierarchy = parent->use_hierarchy; | ||
6098 | memcg->oom_kill_disable = parent->oom_kill_disable; | ||
6099 | } | 6110 | } |
6100 | 6111 | ||
6101 | if (parent && parent->use_hierarchy) { | 6112 | memcg->last_scanned_node = MAX_NUMNODES; |
6113 | INIT_LIST_HEAD(&memcg->oom_notify); | ||
6114 | atomic_set(&memcg->refcnt, 1); | ||
6115 | memcg->move_charge_at_immigrate = 0; | ||
6116 | mutex_init(&memcg->thresholds_lock); | ||
6117 | spin_lock_init(&memcg->move_lock); | ||
6118 | |||
6119 | return &memcg->css; | ||
6120 | |||
6121 | free_out: | ||
6122 | __mem_cgroup_free(memcg); | ||
6123 | return ERR_PTR(error); | ||
6124 | } | ||
6125 | |||
6126 | static int | ||
6127 | mem_cgroup_css_online(struct cgroup *cont) | ||
6128 | { | ||
6129 | struct mem_cgroup *memcg, *parent; | ||
6130 | int error = 0; | ||
6131 | |||
6132 | if (!cont->parent) | ||
6133 | return 0; | ||
6134 | |||
6135 | mutex_lock(&memcg_create_mutex); | ||
6136 | memcg = mem_cgroup_from_cont(cont); | ||
6137 | parent = mem_cgroup_from_cont(cont->parent); | ||
6138 | |||
6139 | memcg->use_hierarchy = parent->use_hierarchy; | ||
6140 | memcg->oom_kill_disable = parent->oom_kill_disable; | ||
6141 | memcg->swappiness = mem_cgroup_swappiness(parent); | ||
6142 | |||
6143 | if (parent->use_hierarchy) { | ||
6102 | res_counter_init(&memcg->res, &parent->res); | 6144 | res_counter_init(&memcg->res, &parent->res); |
6103 | res_counter_init(&memcg->memsw, &parent->memsw); | 6145 | res_counter_init(&memcg->memsw, &parent->memsw); |
6104 | res_counter_init(&memcg->kmem, &parent->kmem); | 6146 | res_counter_init(&memcg->kmem, &parent->kmem); |
@@ -6119,20 +6161,12 @@ mem_cgroup_css_alloc(struct cgroup *cont) | |||
6119 | * much sense so let cgroup subsystem know about this | 6161 | * much sense so let cgroup subsystem know about this |
6120 | * unfortunate state in our controller. | 6162 | * unfortunate state in our controller. |
6121 | */ | 6163 | */ |
6122 | if (parent && parent != root_mem_cgroup) | 6164 | if (parent != root_mem_cgroup) |
6123 | mem_cgroup_subsys.broken_hierarchy = true; | 6165 | mem_cgroup_subsys.broken_hierarchy = true; |
6124 | } | 6166 | } |
6125 | memcg->last_scanned_node = MAX_NUMNODES; | ||
6126 | INIT_LIST_HEAD(&memcg->oom_notify); | ||
6127 | |||
6128 | if (parent) | ||
6129 | memcg->swappiness = mem_cgroup_swappiness(parent); | ||
6130 | atomic_set(&memcg->refcnt, 1); | ||
6131 | memcg->move_charge_at_immigrate = 0; | ||
6132 | mutex_init(&memcg->thresholds_lock); | ||
6133 | spin_lock_init(&memcg->move_lock); | ||
6134 | 6167 | ||
6135 | error = memcg_init_kmem(memcg, &mem_cgroup_subsys); | 6168 | error = memcg_init_kmem(memcg, &mem_cgroup_subsys); |
6169 | mutex_unlock(&memcg_create_mutex); | ||
6136 | if (error) { | 6170 | if (error) { |
6137 | /* | 6171 | /* |
6138 | * We call put now because our (and parent's) refcnts | 6172 | * We call put now because our (and parent's) refcnts |
@@ -6140,12 +6174,10 @@ mem_cgroup_css_alloc(struct cgroup *cont) | |||
6140 | * call __mem_cgroup_free, so return directly | 6174 | * call __mem_cgroup_free, so return directly |
6141 | */ | 6175 | */ |
6142 | mem_cgroup_put(memcg); | 6176 | mem_cgroup_put(memcg); |
6143 | return ERR_PTR(error); | 6177 | if (parent->use_hierarchy) |
6178 | mem_cgroup_put(parent); | ||
6144 | } | 6179 | } |
6145 | return &memcg->css; | 6180 | return error; |
6146 | free_out: | ||
6147 | __mem_cgroup_free(memcg); | ||
6148 | return ERR_PTR(error); | ||
6149 | } | 6181 | } |
6150 | 6182 | ||
6151 | static void mem_cgroup_css_offline(struct cgroup *cont) | 6183 | static void mem_cgroup_css_offline(struct cgroup *cont) |
@@ -6281,7 +6313,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | |||
6281 | * Because lookup_swap_cache() updates some statistics counter, | 6313 | * Because lookup_swap_cache() updates some statistics counter, |
6282 | * we call find_get_page() with swapper_space directly. | 6314 | * we call find_get_page() with swapper_space directly. |
6283 | */ | 6315 | */ |
6284 | page = find_get_page(&swapper_space, ent.val); | 6316 | page = find_get_page(swap_address_space(ent), ent.val); |
6285 | if (do_swap_account) | 6317 | if (do_swap_account) |
6286 | entry->val = ent.val; | 6318 | entry->val = ent.val; |
6287 | 6319 | ||
@@ -6322,7 +6354,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
6322 | swp_entry_t swap = radix_to_swp_entry(page); | 6354 | swp_entry_t swap = radix_to_swp_entry(page); |
6323 | if (do_swap_account) | 6355 | if (do_swap_account) |
6324 | *entry = swap; | 6356 | *entry = swap; |
6325 | page = find_get_page(&swapper_space, swap.val); | 6357 | page = find_get_page(swap_address_space(swap), swap.val); |
6326 | } | 6358 | } |
6327 | #endif | 6359 | #endif |
6328 | return page; | 6360 | return page; |
@@ -6532,8 +6564,15 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, | |||
6532 | struct task_struct *p = cgroup_taskset_first(tset); | 6564 | struct task_struct *p = cgroup_taskset_first(tset); |
6533 | int ret = 0; | 6565 | int ret = 0; |
6534 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); | 6566 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); |
6567 | unsigned long move_charge_at_immigrate; | ||
6535 | 6568 | ||
6536 | if (memcg->move_charge_at_immigrate) { | 6569 | /* |
6570 | * We are now committed to this value whatever it is. Changes in this | ||
6571 | * tunable will only affect upcoming migrations, not the current one. | ||
6572 | * So we need to save it, and keep it going. | ||
6573 | */ | ||
6574 | move_charge_at_immigrate = memcg->move_charge_at_immigrate; | ||
6575 | if (move_charge_at_immigrate) { | ||
6537 | struct mm_struct *mm; | 6576 | struct mm_struct *mm; |
6538 | struct mem_cgroup *from = mem_cgroup_from_task(p); | 6577 | struct mem_cgroup *from = mem_cgroup_from_task(p); |
6539 | 6578 | ||
@@ -6553,6 +6592,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, | |||
6553 | spin_lock(&mc.lock); | 6592 | spin_lock(&mc.lock); |
6554 | mc.from = from; | 6593 | mc.from = from; |
6555 | mc.to = memcg; | 6594 | mc.to = memcg; |
6595 | mc.immigrate_flags = move_charge_at_immigrate; | ||
6556 | spin_unlock(&mc.lock); | 6596 | spin_unlock(&mc.lock); |
6557 | /* We set mc.moving_task later */ | 6597 | /* We set mc.moving_task later */ |
6558 | 6598 | ||
@@ -6747,6 +6787,7 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
6747 | .name = "memory", | 6787 | .name = "memory", |
6748 | .subsys_id = mem_cgroup_subsys_id, | 6788 | .subsys_id = mem_cgroup_subsys_id, |
6749 | .css_alloc = mem_cgroup_css_alloc, | 6789 | .css_alloc = mem_cgroup_css_alloc, |
6790 | .css_online = mem_cgroup_css_online, | ||
6750 | .css_offline = mem_cgroup_css_offline, | 6791 | .css_offline = mem_cgroup_css_offline, |
6751 | .css_free = mem_cgroup_css_free, | 6792 | .css_free = mem_cgroup_css_free, |
6752 | .can_attach = mem_cgroup_can_attach, | 6793 | .can_attach = mem_cgroup_can_attach, |
@@ -6757,19 +6798,6 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
6757 | .use_id = 1, | 6798 | .use_id = 1, |
6758 | }; | 6799 | }; |
6759 | 6800 | ||
6760 | /* | ||
6761 | * The rest of init is performed during ->css_alloc() for root css which | ||
6762 | * happens before initcalls. hotcpu_notifier() can't be done together as | ||
6763 | * it would introduce circular locking by adding cgroup_lock -> cpu hotplug | ||
6764 | * dependency. Do it from a subsys_initcall(). | ||
6765 | */ | ||
6766 | static int __init mem_cgroup_init(void) | ||
6767 | { | ||
6768 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | ||
6769 | return 0; | ||
6770 | } | ||
6771 | subsys_initcall(mem_cgroup_init); | ||
6772 | |||
6773 | #ifdef CONFIG_MEMCG_SWAP | 6801 | #ifdef CONFIG_MEMCG_SWAP |
6774 | static int __init enable_swap_account(char *s) | 6802 | static int __init enable_swap_account(char *s) |
6775 | { | 6803 | { |
@@ -6782,4 +6810,39 @@ static int __init enable_swap_account(char *s) | |||
6782 | } | 6810 | } |
6783 | __setup("swapaccount=", enable_swap_account); | 6811 | __setup("swapaccount=", enable_swap_account); |
6784 | 6812 | ||
6813 | static void __init memsw_file_init(void) | ||
6814 | { | ||
6815 | WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files)); | ||
6816 | } | ||
6817 | |||
6818 | static void __init enable_swap_cgroup(void) | ||
6819 | { | ||
6820 | if (!mem_cgroup_disabled() && really_do_swap_account) { | ||
6821 | do_swap_account = 1; | ||
6822 | memsw_file_init(); | ||
6823 | } | ||
6824 | } | ||
6825 | |||
6826 | #else | ||
6827 | static void __init enable_swap_cgroup(void) | ||
6828 | { | ||
6829 | } | ||
6785 | #endif | 6830 | #endif |
6831 | |||
6832 | /* | ||
6833 | * subsys_initcall() for memory controller. | ||
6834 | * | ||
6835 | * Some parts like hotcpu_notifier() have to be initialized from this context | ||
6836 | * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically | ||
6837 | * everything that doesn't depend on a specific mem_cgroup structure should | ||
6838 | * be initialized from here. | ||
6839 | */ | ||
6840 | static int __init mem_cgroup_init(void) | ||
6841 | { | ||
6842 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | ||
6843 | enable_swap_cgroup(); | ||
6844 | mem_cgroup_soft_limit_tree_init(); | ||
6845 | memcg_stock_init(); | ||
6846 | return 0; | ||
6847 | } | ||
6848 | subsys_initcall(mem_cgroup_init); | ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c6e4dd3e1c08..df0694c6adef 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -61,7 +61,7 @@ int sysctl_memory_failure_early_kill __read_mostly = 0; | |||
61 | 61 | ||
62 | int sysctl_memory_failure_recovery __read_mostly = 1; | 62 | int sysctl_memory_failure_recovery __read_mostly = 1; |
63 | 63 | ||
64 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); | 64 | atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); |
65 | 65 | ||
66 | #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) | 66 | #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) |
67 | 67 | ||
@@ -784,12 +784,12 @@ static struct page_state { | |||
784 | { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, | 784 | { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, |
785 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, | 785 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, |
786 | 786 | ||
787 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, | ||
788 | { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, | ||
789 | |||
790 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, | 787 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, |
791 | { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, | 788 | { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, |
792 | 789 | ||
790 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, | ||
791 | { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, | ||
792 | |||
793 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, | 793 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, |
794 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 794 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, |
795 | 795 | ||
@@ -1021,6 +1021,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1021 | struct page *hpage; | 1021 | struct page *hpage; |
1022 | int res; | 1022 | int res; |
1023 | unsigned int nr_pages; | 1023 | unsigned int nr_pages; |
1024 | unsigned long page_flags; | ||
1024 | 1025 | ||
1025 | if (!sysctl_memory_failure_recovery) | 1026 | if (!sysctl_memory_failure_recovery) |
1026 | panic("Memory failure from trap %d on page %lx", trapno, pfn); | 1027 | panic("Memory failure from trap %d on page %lx", trapno, pfn); |
@@ -1039,8 +1040,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1039 | return 0; | 1040 | return 0; |
1040 | } | 1041 | } |
1041 | 1042 | ||
1042 | nr_pages = 1 << compound_trans_order(hpage); | 1043 | /* |
1043 | atomic_long_add(nr_pages, &mce_bad_pages); | 1044 | * Currently errors on hugetlbfs pages are measured in hugepage units, |
1045 | * so nr_pages should be 1 << compound_order. OTOH when errors are on | ||
1046 | * transparent hugepages, they are supposed to be split and error | ||
1047 | * measurement is done in normal page units. So nr_pages should be one | ||
1048 | * in this case. | ||
1049 | */ | ||
1050 | if (PageHuge(p)) | ||
1051 | nr_pages = 1 << compound_order(hpage); | ||
1052 | else /* normal page or thp */ | ||
1053 | nr_pages = 1; | ||
1054 | atomic_long_add(nr_pages, &num_poisoned_pages); | ||
1044 | 1055 | ||
1045 | /* | 1056 | /* |
1046 | * We need/can do nothing about count=0 pages. | 1057 | * We need/can do nothing about count=0 pages. |
@@ -1070,7 +1081,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1070 | if (!PageHWPoison(hpage) | 1081 | if (!PageHWPoison(hpage) |
1071 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) | 1082 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) |
1072 | || (p != hpage && TestSetPageHWPoison(hpage))) { | 1083 | || (p != hpage && TestSetPageHWPoison(hpage))) { |
1073 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1084 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
1074 | return 0; | 1085 | return 0; |
1075 | } | 1086 | } |
1076 | set_page_hwpoison_huge_page(hpage); | 1087 | set_page_hwpoison_huge_page(hpage); |
@@ -1119,6 +1130,15 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1119 | lock_page(hpage); | 1130 | lock_page(hpage); |
1120 | 1131 | ||
1121 | /* | 1132 | /* |
1133 | * We use page flags to determine what action should be taken, but | ||
1134 | * the flags can be modified by the error containment action. One | ||
1135 | * example is an mlocked page, where PG_mlocked is cleared by | ||
1136 | * page_remove_rmap() in try_to_unmap_one(). So to determine page status | ||
1137 | * correctly, we save a copy of the page flags at this time. | ||
1138 | */ | ||
1139 | page_flags = p->flags; | ||
1140 | |||
1141 | /* | ||
1122 | * unpoison always clear PG_hwpoison inside page lock | 1142 | * unpoison always clear PG_hwpoison inside page lock |
1123 | */ | 1143 | */ |
1124 | if (!PageHWPoison(p)) { | 1144 | if (!PageHWPoison(p)) { |
@@ -1128,7 +1148,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1128 | } | 1148 | } |
1129 | if (hwpoison_filter(p)) { | 1149 | if (hwpoison_filter(p)) { |
1130 | if (TestClearPageHWPoison(p)) | 1150 | if (TestClearPageHWPoison(p)) |
1131 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1151 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
1132 | unlock_page(hpage); | 1152 | unlock_page(hpage); |
1133 | put_page(hpage); | 1153 | put_page(hpage); |
1134 | return 0; | 1154 | return 0; |
@@ -1176,12 +1196,19 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1176 | } | 1196 | } |
1177 | 1197 | ||
1178 | res = -EBUSY; | 1198 | res = -EBUSY; |
1179 | for (ps = error_states;; ps++) { | 1199 | /* |
1180 | if ((p->flags & ps->mask) == ps->res) { | 1200 | * The first check uses the current page flags which may not have any |
1181 | res = page_action(ps, p, pfn); | 1201 | * relevant information. The second check with the saved page flagss is |
1202 | * carried out only if the first check can't determine the page status. | ||
1203 | */ | ||
1204 | for (ps = error_states;; ps++) | ||
1205 | if ((p->flags & ps->mask) == ps->res) | ||
1182 | break; | 1206 | break; |
1183 | } | 1207 | if (!ps->mask) |
1184 | } | 1208 | for (ps = error_states;; ps++) |
1209 | if ((page_flags & ps->mask) == ps->res) | ||
1210 | break; | ||
1211 | res = page_action(ps, p, pfn); | ||
1185 | out: | 1212 | out: |
1186 | unlock_page(hpage); | 1213 | unlock_page(hpage); |
1187 | return res; | 1214 | return res; |
@@ -1323,7 +1350,7 @@ int unpoison_memory(unsigned long pfn) | |||
1323 | return 0; | 1350 | return 0; |
1324 | } | 1351 | } |
1325 | if (TestClearPageHWPoison(p)) | 1352 | if (TestClearPageHWPoison(p)) |
1326 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1353 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
1327 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); | 1354 | pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); |
1328 | return 0; | 1355 | return 0; |
1329 | } | 1356 | } |
@@ -1337,7 +1364,7 @@ int unpoison_memory(unsigned long pfn) | |||
1337 | */ | 1364 | */ |
1338 | if (TestClearPageHWPoison(page)) { | 1365 | if (TestClearPageHWPoison(page)) { |
1339 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); | 1366 | pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); |
1340 | atomic_long_sub(nr_pages, &mce_bad_pages); | 1367 | atomic_long_sub(nr_pages, &num_poisoned_pages); |
1341 | freeit = 1; | 1368 | freeit = 1; |
1342 | if (PageHuge(page)) | 1369 | if (PageHuge(page)) |
1343 | clear_page_hwpoison_huge_page(page); | 1370 | clear_page_hwpoison_huge_page(page); |
@@ -1368,7 +1395,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x) | |||
1368 | * that is not free, and 1 for any other page type. | 1395 | * that is not free, and 1 for any other page type. |
1369 | * For 1 the page is returned with increased page count, otherwise not. | 1396 | * For 1 the page is returned with increased page count, otherwise not. |
1370 | */ | 1397 | */ |
1371 | static int get_any_page(struct page *p, unsigned long pfn, int flags) | 1398 | static int __get_any_page(struct page *p, unsigned long pfn, int flags) |
1372 | { | 1399 | { |
1373 | int ret; | 1400 | int ret; |
1374 | 1401 | ||
@@ -1393,11 +1420,9 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1393 | if (!get_page_unless_zero(compound_head(p))) { | 1420 | if (!get_page_unless_zero(compound_head(p))) { |
1394 | if (PageHuge(p)) { | 1421 | if (PageHuge(p)) { |
1395 | pr_info("%s: %#lx free huge page\n", __func__, pfn); | 1422 | pr_info("%s: %#lx free huge page\n", __func__, pfn); |
1396 | ret = dequeue_hwpoisoned_huge_page(compound_head(p)); | 1423 | ret = 0; |
1397 | } else if (is_free_buddy_page(p)) { | 1424 | } else if (is_free_buddy_page(p)) { |
1398 | pr_info("%s: %#lx free buddy page\n", __func__, pfn); | 1425 | pr_info("%s: %#lx free buddy page\n", __func__, pfn); |
1399 | /* Set hwpoison bit while page is still isolated */ | ||
1400 | SetPageHWPoison(p); | ||
1401 | ret = 0; | 1426 | ret = 0; |
1402 | } else { | 1427 | } else { |
1403 | pr_info("%s: %#lx: unknown zero refcount page type %lx\n", | 1428 | pr_info("%s: %#lx: unknown zero refcount page type %lx\n", |
@@ -1413,43 +1438,68 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1413 | return ret; | 1438 | return ret; |
1414 | } | 1439 | } |
1415 | 1440 | ||
1441 | static int get_any_page(struct page *page, unsigned long pfn, int flags) | ||
1442 | { | ||
1443 | int ret = __get_any_page(page, pfn, flags); | ||
1444 | |||
1445 | if (ret == 1 && !PageHuge(page) && !PageLRU(page)) { | ||
1446 | /* | ||
1447 | * Try to free it. | ||
1448 | */ | ||
1449 | put_page(page); | ||
1450 | shake_page(page, 1); | ||
1451 | |||
1452 | /* | ||
1453 | * Did it turn free? | ||
1454 | */ | ||
1455 | ret = __get_any_page(page, pfn, 0); | ||
1456 | if (!PageLRU(page)) { | ||
1457 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", | ||
1458 | pfn, page->flags); | ||
1459 | return -EIO; | ||
1460 | } | ||
1461 | } | ||
1462 | return ret; | ||
1463 | } | ||
1464 | |||
1416 | static int soft_offline_huge_page(struct page *page, int flags) | 1465 | static int soft_offline_huge_page(struct page *page, int flags) |
1417 | { | 1466 | { |
1418 | int ret; | 1467 | int ret; |
1419 | unsigned long pfn = page_to_pfn(page); | 1468 | unsigned long pfn = page_to_pfn(page); |
1420 | struct page *hpage = compound_head(page); | 1469 | struct page *hpage = compound_head(page); |
1421 | 1470 | ||
1422 | ret = get_any_page(page, pfn, flags); | 1471 | /* |
1423 | if (ret < 0) | 1472 | * This double-check of PageHWPoison is to avoid the race with |
1424 | return ret; | 1473 | * memory_failure(). See also comment in __soft_offline_page(). |
1425 | if (ret == 0) | 1474 | */ |
1426 | goto done; | 1475 | lock_page(hpage); |
1427 | |||
1428 | if (PageHWPoison(hpage)) { | 1476 | if (PageHWPoison(hpage)) { |
1477 | unlock_page(hpage); | ||
1429 | put_page(hpage); | 1478 | put_page(hpage); |
1430 | pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); | 1479 | pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); |
1431 | return -EBUSY; | 1480 | return -EBUSY; |
1432 | } | 1481 | } |
1482 | unlock_page(hpage); | ||
1433 | 1483 | ||
1434 | /* Keep page count to indicate a given hugepage is isolated. */ | 1484 | /* Keep page count to indicate a given hugepage is isolated. */ |
1435 | ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, | 1485 | ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, |
1436 | MIGRATE_SYNC); | 1486 | MIGRATE_SYNC); |
1437 | put_page(hpage); | 1487 | put_page(hpage); |
1438 | if (ret) { | 1488 | if (ret) { |
1439 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1489 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1440 | pfn, ret, page->flags); | 1490 | pfn, ret, page->flags); |
1441 | return ret; | 1491 | } else { |
1442 | } | 1492 | set_page_hwpoison_huge_page(hpage); |
1443 | done: | 1493 | dequeue_hwpoisoned_huge_page(hpage); |
1444 | if (!PageHWPoison(hpage)) | ||
1445 | atomic_long_add(1 << compound_trans_order(hpage), | 1494 | atomic_long_add(1 << compound_trans_order(hpage), |
1446 | &mce_bad_pages); | 1495 | &num_poisoned_pages); |
1447 | set_page_hwpoison_huge_page(hpage); | 1496 | } |
1448 | dequeue_hwpoisoned_huge_page(hpage); | ||
1449 | /* keep elevated page count for bad page */ | 1497 | /* keep elevated page count for bad page */ |
1450 | return ret; | 1498 | return ret; |
1451 | } | 1499 | } |
1452 | 1500 | ||
1501 | static int __soft_offline_page(struct page *page, int flags); | ||
1502 | |||
1453 | /** | 1503 | /** |
1454 | * soft_offline_page - Soft offline a page. | 1504 | * soft_offline_page - Soft offline a page. |
1455 | * @page: page to offline | 1505 | * @page: page to offline |
@@ -1478,9 +1528,11 @@ int soft_offline_page(struct page *page, int flags) | |||
1478 | unsigned long pfn = page_to_pfn(page); | 1528 | unsigned long pfn = page_to_pfn(page); |
1479 | struct page *hpage = compound_trans_head(page); | 1529 | struct page *hpage = compound_trans_head(page); |
1480 | 1530 | ||
1481 | if (PageHuge(page)) | 1531 | if (PageHWPoison(page)) { |
1482 | return soft_offline_huge_page(page, flags); | 1532 | pr_info("soft offline: %#lx page already poisoned\n", pfn); |
1483 | if (PageTransHuge(hpage)) { | 1533 | return -EBUSY; |
1534 | } | ||
1535 | if (!PageHuge(page) && PageTransHuge(hpage)) { | ||
1484 | if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { | 1536 | if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { |
1485 | pr_info("soft offline: %#lx: failed to split THP\n", | 1537 | pr_info("soft offline: %#lx: failed to split THP\n", |
1486 | pfn); | 1538 | pfn); |
@@ -1491,47 +1543,45 @@ int soft_offline_page(struct page *page, int flags) | |||
1491 | ret = get_any_page(page, pfn, flags); | 1543 | ret = get_any_page(page, pfn, flags); |
1492 | if (ret < 0) | 1544 | if (ret < 0) |
1493 | return ret; | 1545 | return ret; |
1494 | if (ret == 0) | 1546 | if (ret) { /* for in-use pages */ |
1495 | goto done; | 1547 | if (PageHuge(page)) |
1496 | 1548 | ret = soft_offline_huge_page(page, flags); | |
1497 | /* | 1549 | else |
1498 | * Page cache page we can handle? | 1550 | ret = __soft_offline_page(page, flags); |
1499 | */ | 1551 | } else { /* for free pages */ |
1500 | if (!PageLRU(page)) { | 1552 | if (PageHuge(page)) { |
1501 | /* | 1553 | set_page_hwpoison_huge_page(hpage); |
1502 | * Try to free it. | 1554 | dequeue_hwpoisoned_huge_page(hpage); |
1503 | */ | 1555 | atomic_long_add(1 << compound_trans_order(hpage), |
1504 | put_page(page); | 1556 | &num_poisoned_pages); |
1505 | shake_page(page, 1); | 1557 | } else { |
1506 | 1558 | SetPageHWPoison(page); | |
1507 | /* | 1559 | atomic_long_inc(&num_poisoned_pages); |
1508 | * Did it turn free? | 1560 | } |
1509 | */ | ||
1510 | ret = get_any_page(page, pfn, 0); | ||
1511 | if (ret < 0) | ||
1512 | return ret; | ||
1513 | if (ret == 0) | ||
1514 | goto done; | ||
1515 | } | ||
1516 | if (!PageLRU(page)) { | ||
1517 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", | ||
1518 | pfn, page->flags); | ||
1519 | return -EIO; | ||
1520 | } | 1561 | } |
1562 | /* keep elevated page count for bad page */ | ||
1563 | return ret; | ||
1564 | } | ||
1521 | 1565 | ||
1522 | lock_page(page); | 1566 | static int __soft_offline_page(struct page *page, int flags) |
1523 | wait_on_page_writeback(page); | 1567 | { |
1568 | int ret; | ||
1569 | unsigned long pfn = page_to_pfn(page); | ||
1524 | 1570 | ||
1525 | /* | 1571 | /* |
1526 | * Synchronized using the page lock with memory_failure() | 1572 | * Check PageHWPoison again inside page lock because PageHWPoison |
1573 | * is set by memory_failure() outside page lock. Note that | ||
1574 | * memory_failure() also double-checks PageHWPoison inside page lock, | ||
1575 | * so there's no race between soft_offline_page() and memory_failure(). | ||
1527 | */ | 1576 | */ |
1577 | lock_page(page); | ||
1578 | wait_on_page_writeback(page); | ||
1528 | if (PageHWPoison(page)) { | 1579 | if (PageHWPoison(page)) { |
1529 | unlock_page(page); | 1580 | unlock_page(page); |
1530 | put_page(page); | 1581 | put_page(page); |
1531 | pr_info("soft offline: %#lx page already poisoned\n", pfn); | 1582 | pr_info("soft offline: %#lx page already poisoned\n", pfn); |
1532 | return -EBUSY; | 1583 | return -EBUSY; |
1533 | } | 1584 | } |
1534 | |||
1535 | /* | 1585 | /* |
1536 | * Try to invalidate first. This should work for | 1586 | * Try to invalidate first. This should work for |
1537 | * non dirty unmapped page cache pages. | 1587 | * non dirty unmapped page cache pages. |
@@ -1544,9 +1594,10 @@ int soft_offline_page(struct page *page, int flags) | |||
1544 | */ | 1594 | */ |
1545 | if (ret == 1) { | 1595 | if (ret == 1) { |
1546 | put_page(page); | 1596 | put_page(page); |
1547 | ret = 0; | ||
1548 | pr_info("soft_offline: %#lx: invalidated\n", pfn); | 1597 | pr_info("soft_offline: %#lx: invalidated\n", pfn); |
1549 | goto done; | 1598 | SetPageHWPoison(page); |
1599 | atomic_long_inc(&num_poisoned_pages); | ||
1600 | return 0; | ||
1550 | } | 1601 | } |
1551 | 1602 | ||
1552 | /* | 1603 | /* |
@@ -1563,28 +1614,23 @@ int soft_offline_page(struct page *page, int flags) | |||
1563 | if (!ret) { | 1614 | if (!ret) { |
1564 | LIST_HEAD(pagelist); | 1615 | LIST_HEAD(pagelist); |
1565 | inc_zone_page_state(page, NR_ISOLATED_ANON + | 1616 | inc_zone_page_state(page, NR_ISOLATED_ANON + |
1566 | page_is_file_cache(page)); | 1617 | page_is_file_cache(page)); |
1567 | list_add(&page->lru, &pagelist); | 1618 | list_add(&page->lru, &pagelist); |
1568 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1619 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1569 | false, MIGRATE_SYNC, | 1620 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1570 | MR_MEMORY_FAILURE); | ||
1571 | if (ret) { | 1621 | if (ret) { |
1572 | putback_lru_pages(&pagelist); | 1622 | putback_lru_pages(&pagelist); |
1573 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1623 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1574 | pfn, ret, page->flags); | 1624 | pfn, ret, page->flags); |
1575 | if (ret > 0) | 1625 | if (ret > 0) |
1576 | ret = -EIO; | 1626 | ret = -EIO; |
1627 | } else { | ||
1628 | SetPageHWPoison(page); | ||
1629 | atomic_long_inc(&num_poisoned_pages); | ||
1577 | } | 1630 | } |
1578 | } else { | 1631 | } else { |
1579 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | 1632 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", |
1580 | pfn, ret, page_count(page), page->flags); | 1633 | pfn, ret, page_count(page), page->flags); |
1581 | } | 1634 | } |
1582 | if (ret) | ||
1583 | return ret; | ||
1584 | |||
1585 | done: | ||
1586 | atomic_long_add(1, &mce_bad_pages); | ||
1587 | SetPageHWPoison(page); | ||
1588 | /* keep elevated page count for bad page */ | ||
1589 | return ret; | 1635 | return ret; |
1590 | } | 1636 | } |
diff --git a/mm/memory.c b/mm/memory.c index bb1369f7b9b4..705473afc1f4 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -69,6 +69,10 @@ | |||
69 | 69 | ||
70 | #include "internal.h" | 70 | #include "internal.h" |
71 | 71 | ||
72 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS | ||
73 | #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. | ||
74 | #endif | ||
75 | |||
72 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 76 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
73 | /* use the per-pgdat data instead for discontigmem - mbligh */ | 77 | /* use the per-pgdat data instead for discontigmem - mbligh */ |
74 | unsigned long max_mapnr; | 78 | unsigned long max_mapnr; |
@@ -1458,10 +1462,11 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, | |||
1458 | EXPORT_SYMBOL_GPL(zap_vma_ptes); | 1462 | EXPORT_SYMBOL_GPL(zap_vma_ptes); |
1459 | 1463 | ||
1460 | /** | 1464 | /** |
1461 | * follow_page - look up a page descriptor from a user-virtual address | 1465 | * follow_page_mask - look up a page descriptor from a user-virtual address |
1462 | * @vma: vm_area_struct mapping @address | 1466 | * @vma: vm_area_struct mapping @address |
1463 | * @address: virtual address to look up | 1467 | * @address: virtual address to look up |
1464 | * @flags: flags modifying lookup behaviour | 1468 | * @flags: flags modifying lookup behaviour |
1469 | * @page_mask: on output, *page_mask is set according to the size of the page | ||
1465 | * | 1470 | * |
1466 | * @flags can have FOLL_ flags set, defined in <linux/mm.h> | 1471 | * @flags can have FOLL_ flags set, defined in <linux/mm.h> |
1467 | * | 1472 | * |
@@ -1469,8 +1474,9 @@ EXPORT_SYMBOL_GPL(zap_vma_ptes); | |||
1469 | * an error pointer if there is a mapping to something not represented | 1474 | * an error pointer if there is a mapping to something not represented |
1470 | * by a page descriptor (see also vm_normal_page()). | 1475 | * by a page descriptor (see also vm_normal_page()). |
1471 | */ | 1476 | */ |
1472 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | 1477 | struct page *follow_page_mask(struct vm_area_struct *vma, |
1473 | unsigned int flags) | 1478 | unsigned long address, unsigned int flags, |
1479 | unsigned int *page_mask) | ||
1474 | { | 1480 | { |
1475 | pgd_t *pgd; | 1481 | pgd_t *pgd; |
1476 | pud_t *pud; | 1482 | pud_t *pud; |
@@ -1480,6 +1486,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1480 | struct page *page; | 1486 | struct page *page; |
1481 | struct mm_struct *mm = vma->vm_mm; | 1487 | struct mm_struct *mm = vma->vm_mm; |
1482 | 1488 | ||
1489 | *page_mask = 0; | ||
1490 | |||
1483 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); | 1491 | page = follow_huge_addr(mm, address, flags & FOLL_WRITE); |
1484 | if (!IS_ERR(page)) { | 1492 | if (!IS_ERR(page)) { |
1485 | BUG_ON(flags & FOLL_GET); | 1493 | BUG_ON(flags & FOLL_GET); |
@@ -1526,6 +1534,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1526 | page = follow_trans_huge_pmd(vma, address, | 1534 | page = follow_trans_huge_pmd(vma, address, |
1527 | pmd, flags); | 1535 | pmd, flags); |
1528 | spin_unlock(&mm->page_table_lock); | 1536 | spin_unlock(&mm->page_table_lock); |
1537 | *page_mask = HPAGE_PMD_NR - 1; | ||
1529 | goto out; | 1538 | goto out; |
1530 | } | 1539 | } |
1531 | } else | 1540 | } else |
@@ -1539,8 +1548,24 @@ split_fallthrough: | |||
1539 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | 1548 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); |
1540 | 1549 | ||
1541 | pte = *ptep; | 1550 | pte = *ptep; |
1542 | if (!pte_present(pte)) | 1551 | if (!pte_present(pte)) { |
1543 | goto no_page; | 1552 | swp_entry_t entry; |
1553 | /* | ||
1554 | * KSM's break_ksm() relies upon recognizing a ksm page | ||
1555 | * even while it is being migrated, so for that case we | ||
1556 | * need migration_entry_wait(). | ||
1557 | */ | ||
1558 | if (likely(!(flags & FOLL_MIGRATION))) | ||
1559 | goto no_page; | ||
1560 | if (pte_none(pte) || pte_file(pte)) | ||
1561 | goto no_page; | ||
1562 | entry = pte_to_swp_entry(pte); | ||
1563 | if (!is_migration_entry(entry)) | ||
1564 | goto no_page; | ||
1565 | pte_unmap_unlock(ptep, ptl); | ||
1566 | migration_entry_wait(mm, pmd, address); | ||
1567 | goto split_fallthrough; | ||
1568 | } | ||
1544 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | 1569 | if ((flags & FOLL_NUMA) && pte_numa(pte)) |
1545 | goto no_page; | 1570 | goto no_page; |
1546 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 1571 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
@@ -1673,15 +1698,16 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add | |||
1673 | * instead of __get_user_pages. __get_user_pages should be used only if | 1698 | * instead of __get_user_pages. __get_user_pages should be used only if |
1674 | * you need some special @gup_flags. | 1699 | * you need some special @gup_flags. |
1675 | */ | 1700 | */ |
1676 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1701 | long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
1677 | unsigned long start, int nr_pages, unsigned int gup_flags, | 1702 | unsigned long start, unsigned long nr_pages, |
1678 | struct page **pages, struct vm_area_struct **vmas, | 1703 | unsigned int gup_flags, struct page **pages, |
1679 | int *nonblocking) | 1704 | struct vm_area_struct **vmas, int *nonblocking) |
1680 | { | 1705 | { |
1681 | int i; | 1706 | long i; |
1682 | unsigned long vm_flags; | 1707 | unsigned long vm_flags; |
1708 | unsigned int page_mask; | ||
1683 | 1709 | ||
1684 | if (nr_pages <= 0) | 1710 | if (!nr_pages) |
1685 | return 0; | 1711 | return 0; |
1686 | 1712 | ||
1687 | VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); | 1713 | VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); |
@@ -1757,6 +1783,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1757 | get_page(page); | 1783 | get_page(page); |
1758 | } | 1784 | } |
1759 | pte_unmap(pte); | 1785 | pte_unmap(pte); |
1786 | page_mask = 0; | ||
1760 | goto next_page; | 1787 | goto next_page; |
1761 | } | 1788 | } |
1762 | 1789 | ||
@@ -1774,6 +1801,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1774 | do { | 1801 | do { |
1775 | struct page *page; | 1802 | struct page *page; |
1776 | unsigned int foll_flags = gup_flags; | 1803 | unsigned int foll_flags = gup_flags; |
1804 | unsigned int page_increm; | ||
1777 | 1805 | ||
1778 | /* | 1806 | /* |
1779 | * If we have a pending SIGKILL, don't keep faulting | 1807 | * If we have a pending SIGKILL, don't keep faulting |
@@ -1783,7 +1811,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1783 | return i ? i : -ERESTARTSYS; | 1811 | return i ? i : -ERESTARTSYS; |
1784 | 1812 | ||
1785 | cond_resched(); | 1813 | cond_resched(); |
1786 | while (!(page = follow_page(vma, start, foll_flags))) { | 1814 | while (!(page = follow_page_mask(vma, start, |
1815 | foll_flags, &page_mask))) { | ||
1787 | int ret; | 1816 | int ret; |
1788 | unsigned int fault_flags = 0; | 1817 | unsigned int fault_flags = 0; |
1789 | 1818 | ||
@@ -1857,13 +1886,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1857 | 1886 | ||
1858 | flush_anon_page(vma, page, start); | 1887 | flush_anon_page(vma, page, start); |
1859 | flush_dcache_page(page); | 1888 | flush_dcache_page(page); |
1889 | page_mask = 0; | ||
1860 | } | 1890 | } |
1861 | next_page: | 1891 | next_page: |
1862 | if (vmas) | 1892 | if (vmas) { |
1863 | vmas[i] = vma; | 1893 | vmas[i] = vma; |
1864 | i++; | 1894 | page_mask = 0; |
1865 | start += PAGE_SIZE; | 1895 | } |
1866 | nr_pages--; | 1896 | page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); |
1897 | if (page_increm > nr_pages) | ||
1898 | page_increm = nr_pages; | ||
1899 | i += page_increm; | ||
1900 | start += page_increm * PAGE_SIZE; | ||
1901 | nr_pages -= page_increm; | ||
1867 | } while (nr_pages && start < vma->vm_end); | 1902 | } while (nr_pages && start < vma->vm_end); |
1868 | } while (nr_pages); | 1903 | } while (nr_pages); |
1869 | return i; | 1904 | return i; |
@@ -1977,9 +2012,9 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | |||
1977 | * | 2012 | * |
1978 | * See also get_user_pages_fast, for performance critical applications. | 2013 | * See also get_user_pages_fast, for performance critical applications. |
1979 | */ | 2014 | */ |
1980 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 2015 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
1981 | unsigned long start, int nr_pages, int write, int force, | 2016 | unsigned long start, unsigned long nr_pages, int write, |
1982 | struct page **pages, struct vm_area_struct **vmas) | 2017 | int force, struct page **pages, struct vm_area_struct **vmas) |
1983 | { | 2018 | { |
1984 | int flags = FOLL_TOUCH; | 2019 | int flags = FOLL_TOUCH; |
1985 | 2020 | ||
@@ -2919,7 +2954,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2919 | unsigned int flags, pte_t orig_pte) | 2954 | unsigned int flags, pte_t orig_pte) |
2920 | { | 2955 | { |
2921 | spinlock_t *ptl; | 2956 | spinlock_t *ptl; |
2922 | struct page *page, *swapcache = NULL; | 2957 | struct page *page, *swapcache; |
2923 | swp_entry_t entry; | 2958 | swp_entry_t entry; |
2924 | pte_t pte; | 2959 | pte_t pte; |
2925 | int locked; | 2960 | int locked; |
@@ -2970,9 +3005,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2970 | */ | 3005 | */ |
2971 | ret = VM_FAULT_HWPOISON; | 3006 | ret = VM_FAULT_HWPOISON; |
2972 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 3007 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
3008 | swapcache = page; | ||
2973 | goto out_release; | 3009 | goto out_release; |
2974 | } | 3010 | } |
2975 | 3011 | ||
3012 | swapcache = page; | ||
2976 | locked = lock_page_or_retry(page, mm, flags); | 3013 | locked = lock_page_or_retry(page, mm, flags); |
2977 | 3014 | ||
2978 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 3015 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
@@ -2990,16 +3027,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2990 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) | 3027 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) |
2991 | goto out_page; | 3028 | goto out_page; |
2992 | 3029 | ||
2993 | if (ksm_might_need_to_copy(page, vma, address)) { | 3030 | page = ksm_might_need_to_copy(page, vma, address); |
2994 | swapcache = page; | 3031 | if (unlikely(!page)) { |
2995 | page = ksm_does_need_to_copy(page, vma, address); | 3032 | ret = VM_FAULT_OOM; |
2996 | 3033 | page = swapcache; | |
2997 | if (unlikely(!page)) { | 3034 | goto out_page; |
2998 | ret = VM_FAULT_OOM; | ||
2999 | page = swapcache; | ||
3000 | swapcache = NULL; | ||
3001 | goto out_page; | ||
3002 | } | ||
3003 | } | 3035 | } |
3004 | 3036 | ||
3005 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { | 3037 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { |
@@ -3044,7 +3076,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3044 | } | 3076 | } |
3045 | flush_icache_page(vma, page); | 3077 | flush_icache_page(vma, page); |
3046 | set_pte_at(mm, address, page_table, pte); | 3078 | set_pte_at(mm, address, page_table, pte); |
3047 | do_page_add_anon_rmap(page, vma, address, exclusive); | 3079 | if (page == swapcache) |
3080 | do_page_add_anon_rmap(page, vma, address, exclusive); | ||
3081 | else /* ksm created a completely new copy */ | ||
3082 | page_add_new_anon_rmap(page, vma, address); | ||
3048 | /* It's better to call commit-charge after rmap is established */ | 3083 | /* It's better to call commit-charge after rmap is established */ |
3049 | mem_cgroup_commit_charge_swapin(page, ptr); | 3084 | mem_cgroup_commit_charge_swapin(page, ptr); |
3050 | 3085 | ||
@@ -3052,7 +3087,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3052 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) | 3087 | if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) |
3053 | try_to_free_swap(page); | 3088 | try_to_free_swap(page); |
3054 | unlock_page(page); | 3089 | unlock_page(page); |
3055 | if (swapcache) { | 3090 | if (page != swapcache) { |
3056 | /* | 3091 | /* |
3057 | * Hold the lock to avoid the swap entry to be reused | 3092 | * Hold the lock to avoid the swap entry to be reused |
3058 | * until we take the PT lock for the pte_same() check | 3093 | * until we take the PT lock for the pte_same() check |
@@ -3085,7 +3120,7 @@ out_page: | |||
3085 | unlock_page(page); | 3120 | unlock_page(page); |
3086 | out_release: | 3121 | out_release: |
3087 | page_cache_release(page); | 3122 | page_cache_release(page); |
3088 | if (swapcache) { | 3123 | if (page != swapcache) { |
3089 | unlock_page(swapcache); | 3124 | unlock_page(swapcache); |
3090 | page_cache_release(swapcache); | 3125 | page_cache_release(swapcache); |
3091 | } | 3126 | } |
@@ -3821,30 +3856,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | |||
3821 | } | 3856 | } |
3822 | #endif /* __PAGETABLE_PMD_FOLDED */ | 3857 | #endif /* __PAGETABLE_PMD_FOLDED */ |
3823 | 3858 | ||
3824 | int make_pages_present(unsigned long addr, unsigned long end) | ||
3825 | { | ||
3826 | int ret, len, write; | ||
3827 | struct vm_area_struct * vma; | ||
3828 | |||
3829 | vma = find_vma(current->mm, addr); | ||
3830 | if (!vma) | ||
3831 | return -ENOMEM; | ||
3832 | /* | ||
3833 | * We want to touch writable mappings with a write fault in order | ||
3834 | * to break COW, except for shared mappings because these don't COW | ||
3835 | * and we would not want to dirty them for nothing. | ||
3836 | */ | ||
3837 | write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; | ||
3838 | BUG_ON(addr >= end); | ||
3839 | BUG_ON(end > vma->vm_end); | ||
3840 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; | ||
3841 | ret = get_user_pages(current, current->mm, addr, | ||
3842 | len, write, 0, NULL, NULL); | ||
3843 | if (ret < 0) | ||
3844 | return ret; | ||
3845 | return ret == len ? 0 : -EFAULT; | ||
3846 | } | ||
3847 | |||
3848 | #if !defined(__HAVE_ARCH_GATE_AREA) | 3859 | #if !defined(__HAVE_ARCH_GATE_AREA) |
3849 | 3860 | ||
3850 | #if defined(AT_SYSINFO_EHDR) | 3861 | #if defined(AT_SYSINFO_EHDR) |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d04ed87bfacb..b81a367b9f39 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/suspend.h> | 29 | #include <linux/suspend.h> |
30 | #include <linux/mm_inline.h> | 30 | #include <linux/mm_inline.h> |
31 | #include <linux/firmware-map.h> | 31 | #include <linux/firmware-map.h> |
32 | #include <linux/stop_machine.h> | ||
32 | 33 | ||
33 | #include <asm/tlbflush.h> | 34 | #include <asm/tlbflush.h> |
34 | 35 | ||
@@ -91,9 +92,8 @@ static void release_memory_resource(struct resource *res) | |||
91 | } | 92 | } |
92 | 93 | ||
93 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE | 94 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE |
94 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 95 | void get_page_bootmem(unsigned long info, struct page *page, |
95 | static void get_page_bootmem(unsigned long info, struct page *page, | 96 | unsigned long type) |
96 | unsigned long type) | ||
97 | { | 97 | { |
98 | page->lru.next = (struct list_head *) type; | 98 | page->lru.next = (struct list_head *) type; |
99 | SetPagePrivate(page); | 99 | SetPagePrivate(page); |
@@ -124,10 +124,13 @@ void __ref put_page_bootmem(struct page *page) | |||
124 | mutex_lock(&ppb_lock); | 124 | mutex_lock(&ppb_lock); |
125 | __free_pages_bootmem(page, 0); | 125 | __free_pages_bootmem(page, 0); |
126 | mutex_unlock(&ppb_lock); | 126 | mutex_unlock(&ppb_lock); |
127 | totalram_pages++; | ||
127 | } | 128 | } |
128 | 129 | ||
129 | } | 130 | } |
130 | 131 | ||
132 | #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE | ||
133 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | ||
131 | static void register_page_bootmem_info_section(unsigned long start_pfn) | 134 | static void register_page_bootmem_info_section(unsigned long start_pfn) |
132 | { | 135 | { |
133 | unsigned long *usemap, mapsize, section_nr, i; | 136 | unsigned long *usemap, mapsize, section_nr, i; |
@@ -161,6 +164,32 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) | |||
161 | get_page_bootmem(section_nr, page, MIX_SECTION_INFO); | 164 | get_page_bootmem(section_nr, page, MIX_SECTION_INFO); |
162 | 165 | ||
163 | } | 166 | } |
167 | #else /* CONFIG_SPARSEMEM_VMEMMAP */ | ||
168 | static void register_page_bootmem_info_section(unsigned long start_pfn) | ||
169 | { | ||
170 | unsigned long *usemap, mapsize, section_nr, i; | ||
171 | struct mem_section *ms; | ||
172 | struct page *page, *memmap; | ||
173 | |||
174 | if (!pfn_valid(start_pfn)) | ||
175 | return; | ||
176 | |||
177 | section_nr = pfn_to_section_nr(start_pfn); | ||
178 | ms = __nr_to_section(section_nr); | ||
179 | |||
180 | memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); | ||
181 | |||
182 | register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); | ||
183 | |||
184 | usemap = __nr_to_section(section_nr)->pageblock_flags; | ||
185 | page = virt_to_page(usemap); | ||
186 | |||
187 | mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; | ||
188 | |||
189 | for (i = 0; i < mapsize; i++, page++) | ||
190 | get_page_bootmem(section_nr, page, MIX_SECTION_INFO); | ||
191 | } | ||
192 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | ||
164 | 193 | ||
165 | void register_page_bootmem_info_node(struct pglist_data *pgdat) | 194 | void register_page_bootmem_info_node(struct pglist_data *pgdat) |
166 | { | 195 | { |
@@ -189,7 +218,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) | |||
189 | } | 218 | } |
190 | 219 | ||
191 | pfn = pgdat->node_start_pfn; | 220 | pfn = pgdat->node_start_pfn; |
192 | end_pfn = pfn + pgdat->node_spanned_pages; | 221 | end_pfn = pgdat_end_pfn(pgdat); |
193 | 222 | ||
194 | /* register_section info */ | 223 | /* register_section info */ |
195 | for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | 224 | for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { |
@@ -203,7 +232,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) | |||
203 | register_page_bootmem_info_section(pfn); | 232 | register_page_bootmem_info_section(pfn); |
204 | } | 233 | } |
205 | } | 234 | } |
206 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | 235 | #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ |
207 | 236 | ||
208 | static void grow_zone_span(struct zone *zone, unsigned long start_pfn, | 237 | static void grow_zone_span(struct zone *zone, unsigned long start_pfn, |
209 | unsigned long end_pfn) | 238 | unsigned long end_pfn) |
@@ -253,6 +282,17 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, | |||
253 | set_page_links(pfn_to_page(pfn), zid, nid, pfn); | 282 | set_page_links(pfn_to_page(pfn), zid, nid, pfn); |
254 | } | 283 | } |
255 | 284 | ||
285 | /* Can fail with -ENOMEM from allocating a wait table with vmalloc() or | ||
286 | * alloc_bootmem_node_nopanic() */ | ||
287 | static int __ref ensure_zone_is_initialized(struct zone *zone, | ||
288 | unsigned long start_pfn, unsigned long num_pages) | ||
289 | { | ||
290 | if (!zone_is_initialized(zone)) | ||
291 | return init_currently_empty_zone(zone, start_pfn, num_pages, | ||
292 | MEMMAP_HOTPLUG); | ||
293 | return 0; | ||
294 | } | ||
295 | |||
256 | static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, | 296 | static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, |
257 | unsigned long start_pfn, unsigned long end_pfn) | 297 | unsigned long start_pfn, unsigned long end_pfn) |
258 | { | 298 | { |
@@ -260,17 +300,14 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, | |||
260 | unsigned long flags; | 300 | unsigned long flags; |
261 | unsigned long z1_start_pfn; | 301 | unsigned long z1_start_pfn; |
262 | 302 | ||
263 | if (!z1->wait_table) { | 303 | ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); |
264 | ret = init_currently_empty_zone(z1, start_pfn, | 304 | if (ret) |
265 | end_pfn - start_pfn, MEMMAP_HOTPLUG); | 305 | return ret; |
266 | if (ret) | ||
267 | return ret; | ||
268 | } | ||
269 | 306 | ||
270 | pgdat_resize_lock(z1->zone_pgdat, &flags); | 307 | pgdat_resize_lock(z1->zone_pgdat, &flags); |
271 | 308 | ||
272 | /* can't move pfns which are higher than @z2 */ | 309 | /* can't move pfns which are higher than @z2 */ |
273 | if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) | 310 | if (end_pfn > zone_end_pfn(z2)) |
274 | goto out_fail; | 311 | goto out_fail; |
275 | /* the move out part mast at the left most of @z2 */ | 312 | /* the move out part mast at the left most of @z2 */ |
276 | if (start_pfn > z2->zone_start_pfn) | 313 | if (start_pfn > z2->zone_start_pfn) |
@@ -286,7 +323,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, | |||
286 | z1_start_pfn = start_pfn; | 323 | z1_start_pfn = start_pfn; |
287 | 324 | ||
288 | resize_zone(z1, z1_start_pfn, end_pfn); | 325 | resize_zone(z1, z1_start_pfn, end_pfn); |
289 | resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); | 326 | resize_zone(z2, end_pfn, zone_end_pfn(z2)); |
290 | 327 | ||
291 | pgdat_resize_unlock(z1->zone_pgdat, &flags); | 328 | pgdat_resize_unlock(z1->zone_pgdat, &flags); |
292 | 329 | ||
@@ -305,12 +342,9 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, | |||
305 | unsigned long flags; | 342 | unsigned long flags; |
306 | unsigned long z2_end_pfn; | 343 | unsigned long z2_end_pfn; |
307 | 344 | ||
308 | if (!z2->wait_table) { | 345 | ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); |
309 | ret = init_currently_empty_zone(z2, start_pfn, | 346 | if (ret) |
310 | end_pfn - start_pfn, MEMMAP_HOTPLUG); | 347 | return ret; |
311 | if (ret) | ||
312 | return ret; | ||
313 | } | ||
314 | 348 | ||
315 | pgdat_resize_lock(z1->zone_pgdat, &flags); | 349 | pgdat_resize_lock(z1->zone_pgdat, &flags); |
316 | 350 | ||
@@ -318,15 +352,15 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, | |||
318 | if (z1->zone_start_pfn > start_pfn) | 352 | if (z1->zone_start_pfn > start_pfn) |
319 | goto out_fail; | 353 | goto out_fail; |
320 | /* the move out part mast at the right most of @z1 */ | 354 | /* the move out part mast at the right most of @z1 */ |
321 | if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) | 355 | if (zone_end_pfn(z1) > end_pfn) |
322 | goto out_fail; | 356 | goto out_fail; |
323 | /* must included/overlap */ | 357 | /* must included/overlap */ |
324 | if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) | 358 | if (start_pfn >= zone_end_pfn(z1)) |
325 | goto out_fail; | 359 | goto out_fail; |
326 | 360 | ||
327 | /* use end_pfn for z2's end_pfn if z2 is empty */ | 361 | /* use end_pfn for z2's end_pfn if z2 is empty */ |
328 | if (z2->spanned_pages) | 362 | if (z2->spanned_pages) |
329 | z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; | 363 | z2_end_pfn = zone_end_pfn(z2); |
330 | else | 364 | else |
331 | z2_end_pfn = end_pfn; | 365 | z2_end_pfn = end_pfn; |
332 | 366 | ||
@@ -363,16 +397,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
363 | int nid = pgdat->node_id; | 397 | int nid = pgdat->node_id; |
364 | int zone_type; | 398 | int zone_type; |
365 | unsigned long flags; | 399 | unsigned long flags; |
400 | int ret; | ||
366 | 401 | ||
367 | zone_type = zone - pgdat->node_zones; | 402 | zone_type = zone - pgdat->node_zones; |
368 | if (!zone->wait_table) { | 403 | ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); |
369 | int ret; | 404 | if (ret) |
405 | return ret; | ||
370 | 406 | ||
371 | ret = init_currently_empty_zone(zone, phys_start_pfn, | ||
372 | nr_pages, MEMMAP_HOTPLUG); | ||
373 | if (ret) | ||
374 | return ret; | ||
375 | } | ||
376 | pgdat_resize_lock(zone->zone_pgdat, &flags); | 407 | pgdat_resize_lock(zone->zone_pgdat, &flags); |
377 | grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); | 408 | grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); |
378 | grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, | 409 | grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, |
@@ -405,20 +436,211 @@ static int __meminit __add_section(int nid, struct zone *zone, | |||
405 | return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); | 436 | return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); |
406 | } | 437 | } |
407 | 438 | ||
408 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | 439 | /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ |
409 | static int __remove_section(struct zone *zone, struct mem_section *ms) | 440 | static int find_smallest_section_pfn(int nid, struct zone *zone, |
441 | unsigned long start_pfn, | ||
442 | unsigned long end_pfn) | ||
443 | { | ||
444 | struct mem_section *ms; | ||
445 | |||
446 | for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { | ||
447 | ms = __pfn_to_section(start_pfn); | ||
448 | |||
449 | if (unlikely(!valid_section(ms))) | ||
450 | continue; | ||
451 | |||
452 | if (unlikely(pfn_to_nid(start_pfn) != nid)) | ||
453 | continue; | ||
454 | |||
455 | if (zone && zone != page_zone(pfn_to_page(start_pfn))) | ||
456 | continue; | ||
457 | |||
458 | return start_pfn; | ||
459 | } | ||
460 | |||
461 | return 0; | ||
462 | } | ||
463 | |||
464 | /* find the biggest valid pfn in the range [start_pfn, end_pfn). */ | ||
465 | static int find_biggest_section_pfn(int nid, struct zone *zone, | ||
466 | unsigned long start_pfn, | ||
467 | unsigned long end_pfn) | ||
468 | { | ||
469 | struct mem_section *ms; | ||
470 | unsigned long pfn; | ||
471 | |||
472 | /* pfn is the end pfn of a memory section. */ | ||
473 | pfn = end_pfn - 1; | ||
474 | for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { | ||
475 | ms = __pfn_to_section(pfn); | ||
476 | |||
477 | if (unlikely(!valid_section(ms))) | ||
478 | continue; | ||
479 | |||
480 | if (unlikely(pfn_to_nid(pfn) != nid)) | ||
481 | continue; | ||
482 | |||
483 | if (zone && zone != page_zone(pfn_to_page(pfn))) | ||
484 | continue; | ||
485 | |||
486 | return pfn; | ||
487 | } | ||
488 | |||
489 | return 0; | ||
490 | } | ||
491 | |||
492 | static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, | ||
493 | unsigned long end_pfn) | ||
410 | { | 494 | { |
495 | unsigned long zone_start_pfn = zone->zone_start_pfn; | ||
496 | unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
497 | unsigned long pfn; | ||
498 | struct mem_section *ms; | ||
499 | int nid = zone_to_nid(zone); | ||
500 | |||
501 | zone_span_writelock(zone); | ||
502 | if (zone_start_pfn == start_pfn) { | ||
503 | /* | ||
504 | * If the section is smallest section in the zone, it need | ||
505 | * shrink zone->zone_start_pfn and zone->zone_spanned_pages. | ||
506 | * In this case, we find second smallest valid mem_section | ||
507 | * for shrinking zone. | ||
508 | */ | ||
509 | pfn = find_smallest_section_pfn(nid, zone, end_pfn, | ||
510 | zone_end_pfn); | ||
511 | if (pfn) { | ||
512 | zone->zone_start_pfn = pfn; | ||
513 | zone->spanned_pages = zone_end_pfn - pfn; | ||
514 | } | ||
515 | } else if (zone_end_pfn == end_pfn) { | ||
516 | /* | ||
517 | * If the section is biggest section in the zone, it need | ||
518 | * shrink zone->spanned_pages. | ||
519 | * In this case, we find second biggest valid mem_section for | ||
520 | * shrinking zone. | ||
521 | */ | ||
522 | pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn, | ||
523 | start_pfn); | ||
524 | if (pfn) | ||
525 | zone->spanned_pages = pfn - zone_start_pfn + 1; | ||
526 | } | ||
527 | |||
411 | /* | 528 | /* |
412 | * XXX: Freeing memmap with vmemmap is not implement yet. | 529 | * The section is not biggest or smallest mem_section in the zone, it |
413 | * This should be removed later. | 530 | * only creates a hole in the zone. So in this case, we need not |
531 | * change the zone. But perhaps, the zone has only hole data. Thus | ||
532 | * it check the zone has only hole or not. | ||
414 | */ | 533 | */ |
415 | return -EBUSY; | 534 | pfn = zone_start_pfn; |
535 | for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { | ||
536 | ms = __pfn_to_section(pfn); | ||
537 | |||
538 | if (unlikely(!valid_section(ms))) | ||
539 | continue; | ||
540 | |||
541 | if (page_zone(pfn_to_page(pfn)) != zone) | ||
542 | continue; | ||
543 | |||
544 | /* If the section is current section, it continues the loop */ | ||
545 | if (start_pfn == pfn) | ||
546 | continue; | ||
547 | |||
548 | /* If we find valid section, we have nothing to do */ | ||
549 | zone_span_writeunlock(zone); | ||
550 | return; | ||
551 | } | ||
552 | |||
553 | /* The zone has no valid section */ | ||
554 | zone->zone_start_pfn = 0; | ||
555 | zone->spanned_pages = 0; | ||
556 | zone_span_writeunlock(zone); | ||
416 | } | 557 | } |
417 | #else | 558 | |
418 | static int __remove_section(struct zone *zone, struct mem_section *ms) | 559 | static void shrink_pgdat_span(struct pglist_data *pgdat, |
560 | unsigned long start_pfn, unsigned long end_pfn) | ||
561 | { | ||
562 | unsigned long pgdat_start_pfn = pgdat->node_start_pfn; | ||
563 | unsigned long pgdat_end_pfn = | ||
564 | pgdat->node_start_pfn + pgdat->node_spanned_pages; | ||
565 | unsigned long pfn; | ||
566 | struct mem_section *ms; | ||
567 | int nid = pgdat->node_id; | ||
568 | |||
569 | if (pgdat_start_pfn == start_pfn) { | ||
570 | /* | ||
571 | * If the section is smallest section in the pgdat, it need | ||
572 | * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages. | ||
573 | * In this case, we find second smallest valid mem_section | ||
574 | * for shrinking zone. | ||
575 | */ | ||
576 | pfn = find_smallest_section_pfn(nid, NULL, end_pfn, | ||
577 | pgdat_end_pfn); | ||
578 | if (pfn) { | ||
579 | pgdat->node_start_pfn = pfn; | ||
580 | pgdat->node_spanned_pages = pgdat_end_pfn - pfn; | ||
581 | } | ||
582 | } else if (pgdat_end_pfn == end_pfn) { | ||
583 | /* | ||
584 | * If the section is biggest section in the pgdat, it need | ||
585 | * shrink pgdat->node_spanned_pages. | ||
586 | * In this case, we find second biggest valid mem_section for | ||
587 | * shrinking zone. | ||
588 | */ | ||
589 | pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn, | ||
590 | start_pfn); | ||
591 | if (pfn) | ||
592 | pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1; | ||
593 | } | ||
594 | |||
595 | /* | ||
596 | * If the section is not biggest or smallest mem_section in the pgdat, | ||
597 | * it only creates a hole in the pgdat. So in this case, we need not | ||
598 | * change the pgdat. | ||
599 | * But perhaps, the pgdat has only hole data. Thus it check the pgdat | ||
600 | * has only hole or not. | ||
601 | */ | ||
602 | pfn = pgdat_start_pfn; | ||
603 | for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { | ||
604 | ms = __pfn_to_section(pfn); | ||
605 | |||
606 | if (unlikely(!valid_section(ms))) | ||
607 | continue; | ||
608 | |||
609 | if (pfn_to_nid(pfn) != nid) | ||
610 | continue; | ||
611 | |||
612 | /* If the section is current section, it continues the loop */ | ||
613 | if (start_pfn == pfn) | ||
614 | continue; | ||
615 | |||
616 | /* If we find valid section, we have nothing to do */ | ||
617 | return; | ||
618 | } | ||
619 | |||
620 | /* The pgdat has no valid section */ | ||
621 | pgdat->node_start_pfn = 0; | ||
622 | pgdat->node_spanned_pages = 0; | ||
623 | } | ||
624 | |||
625 | static void __remove_zone(struct zone *zone, unsigned long start_pfn) | ||
419 | { | 626 | { |
420 | unsigned long flags; | ||
421 | struct pglist_data *pgdat = zone->zone_pgdat; | 627 | struct pglist_data *pgdat = zone->zone_pgdat; |
628 | int nr_pages = PAGES_PER_SECTION; | ||
629 | int zone_type; | ||
630 | unsigned long flags; | ||
631 | |||
632 | zone_type = zone - pgdat->node_zones; | ||
633 | |||
634 | pgdat_resize_lock(zone->zone_pgdat, &flags); | ||
635 | shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); | ||
636 | shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); | ||
637 | pgdat_resize_unlock(zone->zone_pgdat, &flags); | ||
638 | } | ||
639 | |||
640 | static int __remove_section(struct zone *zone, struct mem_section *ms) | ||
641 | { | ||
642 | unsigned long start_pfn; | ||
643 | int scn_nr; | ||
422 | int ret = -EINVAL; | 644 | int ret = -EINVAL; |
423 | 645 | ||
424 | if (!valid_section(ms)) | 646 | if (!valid_section(ms)) |
@@ -428,12 +650,13 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) | |||
428 | if (ret) | 650 | if (ret) |
429 | return ret; | 651 | return ret; |
430 | 652 | ||
431 | pgdat_resize_lock(pgdat, &flags); | 653 | scn_nr = __section_nr(ms); |
654 | start_pfn = section_nr_to_pfn(scn_nr); | ||
655 | __remove_zone(zone, start_pfn); | ||
656 | |||
432 | sparse_remove_one_section(zone, ms); | 657 | sparse_remove_one_section(zone, ms); |
433 | pgdat_resize_unlock(pgdat, &flags); | ||
434 | return 0; | 658 | return 0; |
435 | } | 659 | } |
436 | #endif | ||
437 | 660 | ||
438 | /* | 661 | /* |
439 | * Reasonably generic function for adding memory. It is | 662 | * Reasonably generic function for adding memory. It is |
@@ -797,11 +1020,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
797 | unsigned long zholes_size[MAX_NR_ZONES] = {0}; | 1020 | unsigned long zholes_size[MAX_NR_ZONES] = {0}; |
798 | unsigned long start_pfn = start >> PAGE_SHIFT; | 1021 | unsigned long start_pfn = start >> PAGE_SHIFT; |
799 | 1022 | ||
800 | pgdat = arch_alloc_nodedata(nid); | 1023 | pgdat = NODE_DATA(nid); |
801 | if (!pgdat) | 1024 | if (!pgdat) { |
802 | return NULL; | 1025 | pgdat = arch_alloc_nodedata(nid); |
1026 | if (!pgdat) | ||
1027 | return NULL; | ||
803 | 1028 | ||
804 | arch_refresh_nodedata(nid, pgdat); | 1029 | arch_refresh_nodedata(nid, pgdat); |
1030 | } | ||
805 | 1031 | ||
806 | /* we can use NODE_DATA(nid) from here */ | 1032 | /* we can use NODE_DATA(nid) from here */ |
807 | 1033 | ||
@@ -854,7 +1080,8 @@ out: | |||
854 | int __ref add_memory(int nid, u64 start, u64 size) | 1080 | int __ref add_memory(int nid, u64 start, u64 size) |
855 | { | 1081 | { |
856 | pg_data_t *pgdat = NULL; | 1082 | pg_data_t *pgdat = NULL; |
857 | int new_pgdat = 0; | 1083 | bool new_pgdat; |
1084 | bool new_node; | ||
858 | struct resource *res; | 1085 | struct resource *res; |
859 | int ret; | 1086 | int ret; |
860 | 1087 | ||
@@ -865,12 +1092,16 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
865 | if (!res) | 1092 | if (!res) |
866 | goto out; | 1093 | goto out; |
867 | 1094 | ||
868 | if (!node_online(nid)) { | 1095 | { /* Stupid hack to suppress address-never-null warning */ |
1096 | void *p = NODE_DATA(nid); | ||
1097 | new_pgdat = !p; | ||
1098 | } | ||
1099 | new_node = !node_online(nid); | ||
1100 | if (new_node) { | ||
869 | pgdat = hotadd_new_pgdat(nid, start); | 1101 | pgdat = hotadd_new_pgdat(nid, start); |
870 | ret = -ENOMEM; | 1102 | ret = -ENOMEM; |
871 | if (!pgdat) | 1103 | if (!pgdat) |
872 | goto error; | 1104 | goto error; |
873 | new_pgdat = 1; | ||
874 | } | 1105 | } |
875 | 1106 | ||
876 | /* call arch's memory hotadd */ | 1107 | /* call arch's memory hotadd */ |
@@ -882,7 +1113,7 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
882 | /* we online node here. we can't roll back from here. */ | 1113 | /* we online node here. we can't roll back from here. */ |
883 | node_set_online(nid); | 1114 | node_set_online(nid); |
884 | 1115 | ||
885 | if (new_pgdat) { | 1116 | if (new_node) { |
886 | ret = register_one_node(nid); | 1117 | ret = register_one_node(nid); |
887 | /* | 1118 | /* |
888 | * If sysfs file of new node can't create, cpu on the node | 1119 | * If sysfs file of new node can't create, cpu on the node |
@@ -901,8 +1132,7 @@ error: | |||
901 | /* rollback pgdat allocation and others */ | 1132 | /* rollback pgdat allocation and others */ |
902 | if (new_pgdat) | 1133 | if (new_pgdat) |
903 | rollback_node_hotadd(nid, pgdat); | 1134 | rollback_node_hotadd(nid, pgdat); |
904 | if (res) | 1135 | release_memory_resource(res); |
905 | release_memory_resource(res); | ||
906 | 1136 | ||
907 | out: | 1137 | out: |
908 | unlock_memory_hotplug(); | 1138 | unlock_memory_hotplug(); |
@@ -1058,8 +1288,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1058 | * migrate_pages returns # of failed pages. | 1288 | * migrate_pages returns # of failed pages. |
1059 | */ | 1289 | */ |
1060 | ret = migrate_pages(&source, alloc_migrate_target, 0, | 1290 | ret = migrate_pages(&source, alloc_migrate_target, 0, |
1061 | true, MIGRATE_SYNC, | 1291 | MIGRATE_SYNC, MR_MEMORY_HOTPLUG); |
1062 | MR_MEMORY_HOTPLUG); | ||
1063 | if (ret) | 1292 | if (ret) |
1064 | putback_lru_pages(&source); | 1293 | putback_lru_pages(&source); |
1065 | } | 1294 | } |
@@ -1381,17 +1610,26 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | |||
1381 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); | 1610 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); |
1382 | } | 1611 | } |
1383 | 1612 | ||
1384 | int remove_memory(u64 start, u64 size) | 1613 | /** |
1614 | * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) | ||
1615 | * @start_pfn: start pfn of the memory range | ||
1616 | * @end_pfn: end pfn of the memory range | ||
1617 | * @arg: argument passed to func | ||
1618 | * @func: callback for each memory section walked | ||
1619 | * | ||
1620 | * This function walks through all present mem sections in range | ||
1621 | * [start_pfn, end_pfn) and calls func on each mem section. | ||
1622 | * | ||
1623 | * Returns the return value of func. | ||
1624 | */ | ||
1625 | static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, | ||
1626 | void *arg, int (*func)(struct memory_block *, void *)) | ||
1385 | { | 1627 | { |
1386 | struct memory_block *mem = NULL; | 1628 | struct memory_block *mem = NULL; |
1387 | struct mem_section *section; | 1629 | struct mem_section *section; |
1388 | unsigned long start_pfn, end_pfn; | ||
1389 | unsigned long pfn, section_nr; | 1630 | unsigned long pfn, section_nr; |
1390 | int ret; | 1631 | int ret; |
1391 | 1632 | ||
1392 | start_pfn = PFN_DOWN(start); | ||
1393 | end_pfn = start_pfn + PFN_DOWN(size); | ||
1394 | |||
1395 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | 1633 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { |
1396 | section_nr = pfn_to_section_nr(pfn); | 1634 | section_nr = pfn_to_section_nr(pfn); |
1397 | if (!present_section_nr(section_nr)) | 1635 | if (!present_section_nr(section_nr)) |
@@ -1408,7 +1646,7 @@ int remove_memory(u64 start, u64 size) | |||
1408 | if (!mem) | 1646 | if (!mem) |
1409 | continue; | 1647 | continue; |
1410 | 1648 | ||
1411 | ret = offline_memory_block(mem); | 1649 | ret = func(mem, arg); |
1412 | if (ret) { | 1650 | if (ret) { |
1413 | kobject_put(&mem->dev.kobj); | 1651 | kobject_put(&mem->dev.kobj); |
1414 | return ret; | 1652 | return ret; |
@@ -1420,12 +1658,209 @@ int remove_memory(u64 start, u64 size) | |||
1420 | 1658 | ||
1421 | return 0; | 1659 | return 0; |
1422 | } | 1660 | } |
1661 | |||
1662 | /** | ||
1663 | * offline_memory_block_cb - callback function for offlining memory block | ||
1664 | * @mem: the memory block to be offlined | ||
1665 | * @arg: pointer to an int that records the first error code, if any | ||
1666 | * | ||
1667 | * Always returns 0; an error is stored through @arg only while it still holds 0. | ||
1668 | */ | ||
1669 | static int offline_memory_block_cb(struct memory_block *mem, void *arg) | ||
1670 | { | ||
1671 | int *ret = arg; | ||
1672 | int error = offline_memory_block(mem); | ||
1673 | |||
1674 | if (error != 0 && *ret == 0) | ||
1675 | *ret = error; | ||
1676 | |||
1677 | return 0; | ||
1678 | } | ||
1679 | |||
1680 | static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) | ||
1681 | { | ||
1682 | int ret = !is_memblock_offlined(mem); | ||
1683 | |||
1684 | if (unlikely(ret)) | ||
1685 | pr_warn("removing memory fails, because memory " | ||
1686 | "[%#010llx-%#010llx] is onlined\n", | ||
1687 | PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)), | ||
1688 | PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1); | ||
1689 | |||
1690 | return ret; | ||
1691 | } | ||
1692 | |||
1693 | static int check_cpu_on_node(void *data) | ||
1694 | { | ||
1695 | struct pglist_data *pgdat = data; | ||
1696 | int cpu; | ||
1697 | |||
1698 | for_each_present_cpu(cpu) { | ||
1699 | if (cpu_to_node(cpu) == pgdat->node_id) | ||
1700 | /* | ||
1701 | * the cpu on this node isn't removed, and we can't | ||
1702 | * offline this node. | ||
1703 | */ | ||
1704 | return -EBUSY; | ||
1705 | } | ||
1706 | |||
1707 | return 0; | ||
1708 | } | ||
1709 | |||
1710 | static void unmap_cpu_on_node(void *data) | ||
1711 | { | ||
1712 | #ifdef CONFIG_ACPI_NUMA | ||
1713 | struct pglist_data *pgdat = data; | ||
1714 | int cpu; | ||
1715 | |||
1716 | for_each_possible_cpu(cpu) | ||
1717 | if (cpu_to_node(cpu) == pgdat->node_id) | ||
1718 | numa_clear_node(cpu); | ||
1719 | #endif | ||
1720 | } | ||
1721 | |||
1722 | static int check_and_unmap_cpu_on_node(void *data) | ||
1723 | { | ||
1724 | int ret = check_cpu_on_node(data); | ||
1725 | |||
1726 | if (ret) | ||
1727 | return ret; | ||
1728 | |||
1729 | /* | ||
1730 | * the node will be offlined when we come here, so we can clear | ||
1731 | * the cpu_to_node() now. | ||
1732 | */ | ||
1733 | |||
1734 | unmap_cpu_on_node(data); | ||
1735 | return 0; | ||
1736 | } | ||
1737 | |||
1738 | /* offline the node if all memory sections of this node are removed */ | ||
1739 | void try_offline_node(int nid) | ||
1740 | { | ||
1741 | pg_data_t *pgdat = NODE_DATA(nid); | ||
1742 | unsigned long start_pfn = pgdat->node_start_pfn; | ||
1743 | unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; | ||
1744 | unsigned long pfn; | ||
1745 | struct page *pgdat_page = virt_to_page(pgdat); | ||
1746 | int i; | ||
1747 | |||
1748 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | ||
1749 | unsigned long section_nr = pfn_to_section_nr(pfn); | ||
1750 | |||
1751 | if (!present_section_nr(section_nr)) | ||
1752 | continue; | ||
1753 | |||
1754 | if (pfn_to_nid(pfn) != nid) | ||
1755 | continue; | ||
1756 | |||
1757 | /* | ||
1758 | * some memory sections of this node are not removed, and we | ||
1759 | * can't offline node now. | ||
1760 | */ | ||
1761 | return; | ||
1762 | } | ||
1763 | |||
1764 | if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL)) | ||
1765 | return; | ||
1766 | |||
1767 | /* | ||
1768 | * all memory/cpu of this node are removed, we can offline this | ||
1769 | * node now. | ||
1770 | */ | ||
1771 | node_set_offline(nid); | ||
1772 | unregister_one_node(nid); | ||
1773 | |||
1774 | if (!PageSlab(pgdat_page) && !PageCompound(pgdat_page)) | ||
1775 | /* node data is allocated from boot memory */ | ||
1776 | return; | ||
1777 | |||
1778 | /* free the wait table of each zone */ | ||
1779 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
1780 | struct zone *zone = pgdat->node_zones + i; | ||
1781 | |||
1782 | if (zone->wait_table) | ||
1783 | vfree(zone->wait_table); | ||
1784 | } | ||
1785 | |||
1786 | /* | ||
1787 | * Since there is no way to guarantee the address of pgdat/zone is not | ||
1788 | * on stack of any kernel threads or used by other kernel objects | ||
1789 | * without reference counting or other synchronizing method, do not | ||
1790 | * reset node_data and free pgdat here. Just reset it to 0 and reuse | ||
1791 | * the memory when the node is online again. | ||
1792 | */ | ||
1793 | memset(pgdat, 0, sizeof(*pgdat)); | ||
1794 | } | ||
1795 | EXPORT_SYMBOL(try_offline_node); | ||
1796 | |||
1797 | int __ref remove_memory(int nid, u64 start, u64 size) | ||
1798 | { | ||
1799 | unsigned long start_pfn, end_pfn; | ||
1800 | int ret = 0; | ||
1801 | int retry = 1; | ||
1802 | |||
1803 | start_pfn = PFN_DOWN(start); | ||
1804 | end_pfn = start_pfn + PFN_DOWN(size); | ||
1805 | |||
1806 | /* | ||
1807 | * When CONFIG_MEMCG is on, one memory block may be used by other | ||
1808 | * blocks to store page cgroup when onlining pages. But we don't know | ||
1809 | * in what order pages are onlined. So we iterate twice to offline | ||
1810 | * memory: | ||
1811 | * 1st iteration: offline every non primary memory block. | ||
1812 | * 2nd iteration: offline primary (i.e. first added) memory block. | ||
1813 | */ | ||
1814 | repeat: | ||
1815 | walk_memory_range(start_pfn, end_pfn, &ret, | ||
1816 | offline_memory_block_cb); | ||
1817 | if (ret) { | ||
1818 | if (!retry) | ||
1819 | return ret; | ||
1820 | |||
1821 | retry = 0; | ||
1822 | ret = 0; | ||
1823 | goto repeat; | ||
1824 | } | ||
1825 | |||
1826 | lock_memory_hotplug(); | ||
1827 | |||
1828 | /* | ||
1829 | * we have offlined all memory blocks like this: | ||
1830 | * 1. lock memory hotplug | ||
1831 | * 2. offline a memory block | ||
1832 | * 3. unlock memory hotplug | ||
1833 | * | ||
1834 | * repeat step1-3 to offline the memory block. All memory blocks | ||
1835 | * must be offlined before removing memory. But we don't hold the | ||
1836 | * lock in the whole operation. So we should check whether all | ||
1837 | * memory blocks are offlined. | ||
1838 | */ | ||
1839 | |||
1840 | ret = walk_memory_range(start_pfn, end_pfn, NULL, | ||
1841 | is_memblock_offlined_cb); | ||
1842 | if (ret) { | ||
1843 | unlock_memory_hotplug(); | ||
1844 | return ret; | ||
1845 | } | ||
1846 | |||
1847 | /* remove memmap entry */ | ||
1848 | firmware_map_remove(start, start + size, "System RAM"); | ||
1849 | |||
1850 | arch_remove_memory(start, size); | ||
1851 | |||
1852 | try_offline_node(nid); | ||
1853 | |||
1854 | unlock_memory_hotplug(); | ||
1855 | |||
1856 | return 0; | ||
1857 | } | ||
1423 | #else | 1858 | #else |
1424 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | 1859 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) |
1425 | { | 1860 | { |
1426 | return -EINVAL; | 1861 | return -EINVAL; |
1427 | } | 1862 | } |
1428 | int remove_memory(u64 start, u64 size) | 1863 | int remove_memory(int nid, u64 start, u64 size) |
1429 | { | 1864 | { |
1430 | return -EINVAL; | 1865 | return -EINVAL; |
1431 | } | 1866 | } |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e2df1c1fb41f..31d26637b658 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -26,7 +26,7 @@ | |||
26 | * the allocation to memory nodes instead | 26 | * the allocation to memory nodes instead |
27 | * | 27 | * |
28 | * preferred Try a specific node first before normal fallback. | 28 | * preferred Try a specific node first before normal fallback. |
29 | * As a special case node -1 here means do the allocation | 29 | * As a special case NUMA_NO_NODE here means do the allocation |
30 | * on the local CPU. This is normally identical to default, | 30 | * on the local CPU. This is normally identical to default, |
31 | * but useful to set in a VMA when you have a non default | 31 | * but useful to set in a VMA when you have a non default |
32 | * process policy. | 32 | * process policy. |
@@ -127,7 +127,7 @@ static struct mempolicy *get_task_policy(struct task_struct *p) | |||
127 | 127 | ||
128 | if (!pol) { | 128 | if (!pol) { |
129 | node = numa_node_id(); | 129 | node = numa_node_id(); |
130 | if (node != -1) | 130 | if (node != NUMA_NO_NODE) |
131 | pol = &preferred_node_policy[node]; | 131 | pol = &preferred_node_policy[node]; |
132 | 132 | ||
133 | /* preferred_node_policy is not initialised early in boot */ | 133 | /* preferred_node_policy is not initialised early in boot */ |
@@ -161,19 +161,7 @@ static const struct mempolicy_operations { | |||
161 | /* Check that the nodemask contains at least one populated zone */ | 161 | /* Check that the nodemask contains at least one populated zone */ |
162 | static int is_valid_nodemask(const nodemask_t *nodemask) | 162 | static int is_valid_nodemask(const nodemask_t *nodemask) |
163 | { | 163 | { |
164 | int nd, k; | 164 | return nodes_intersects(*nodemask, node_states[N_MEMORY]); |
165 | |||
166 | for_each_node_mask(nd, *nodemask) { | ||
167 | struct zone *z; | ||
168 | |||
169 | for (k = 0; k <= policy_zone; k++) { | ||
170 | z = &NODE_DATA(nd)->node_zones[k]; | ||
171 | if (z->present_pages > 0) | ||
172 | return 1; | ||
173 | } | ||
174 | } | ||
175 | |||
176 | return 0; | ||
177 | } | 165 | } |
178 | 166 | ||
179 | static inline int mpol_store_user_nodemask(const struct mempolicy *pol) | 167 | static inline int mpol_store_user_nodemask(const struct mempolicy *pol) |
@@ -270,7 +258,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
270 | struct mempolicy *policy; | 258 | struct mempolicy *policy; |
271 | 259 | ||
272 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", | 260 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", |
273 | mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); | 261 | mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); |
274 | 262 | ||
275 | if (mode == MPOL_DEFAULT) { | 263 | if (mode == MPOL_DEFAULT) { |
276 | if (nodes && !nodes_empty(*nodes)) | 264 | if (nodes && !nodes_empty(*nodes)) |
@@ -508,9 +496,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
508 | /* | 496 | /* |
509 | * vm_normal_page() filters out zero pages, but there might | 497 | * vm_normal_page() filters out zero pages, but there might |
510 | * still be PageReserved pages to skip, perhaps in a VDSO. | 498 | * still be PageReserved pages to skip, perhaps in a VDSO. |
511 | * And we cannot move PageKsm pages sensibly or safely yet. | ||
512 | */ | 499 | */ |
513 | if (PageReserved(page) || PageKsm(page)) | 500 | if (PageReserved(page)) |
514 | continue; | 501 | continue; |
515 | nid = page_to_nid(page); | 502 | nid = page_to_nid(page); |
516 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 503 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) |
@@ -1027,8 +1014,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
1027 | 1014 | ||
1028 | if (!list_empty(&pagelist)) { | 1015 | if (!list_empty(&pagelist)) { |
1029 | err = migrate_pages(&pagelist, new_node_page, dest, | 1016 | err = migrate_pages(&pagelist, new_node_page, dest, |
1030 | false, MIGRATE_SYNC, | 1017 | MIGRATE_SYNC, MR_SYSCALL); |
1031 | MR_SYSCALL); | ||
1032 | if (err) | 1018 | if (err) |
1033 | putback_lru_pages(&pagelist); | 1019 | putback_lru_pages(&pagelist); |
1034 | } | 1020 | } |
@@ -1235,7 +1221,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1235 | 1221 | ||
1236 | pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", | 1222 | pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", |
1237 | start, start + len, mode, mode_flags, | 1223 | start, start + len, mode, mode_flags, |
1238 | nmask ? nodes_addr(*nmask)[0] : -1); | 1224 | nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); |
1239 | 1225 | ||
1240 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { | 1226 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { |
1241 | 1227 | ||
@@ -1272,9 +1258,8 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1272 | if (!list_empty(&pagelist)) { | 1258 | if (!list_empty(&pagelist)) { |
1273 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | 1259 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); |
1274 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1260 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1275 | (unsigned long)vma, | 1261 | (unsigned long)vma, |
1276 | false, MIGRATE_SYNC, | 1262 | MIGRATE_SYNC, MR_MEMPOLICY_MBIND); |
1277 | MR_MEMPOLICY_MBIND); | ||
1278 | if (nr_failed) | 1263 | if (nr_failed) |
1279 | putback_lru_pages(&pagelist); | 1264 | putback_lru_pages(&pagelist); |
1280 | } | 1265 | } |
@@ -1644,6 +1629,26 @@ struct mempolicy *get_vma_policy(struct task_struct *task, | |||
1644 | return pol; | 1629 | return pol; |
1645 | } | 1630 | } |
1646 | 1631 | ||
1632 | static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) | ||
1633 | { | ||
1634 | enum zone_type dynamic_policy_zone = policy_zone; | ||
1635 | |||
1636 | BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); | ||
1637 | |||
1638 | /* | ||
1639 | * if policy->v.nodes has movable memory only, | ||
1640 | * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. | ||
1641 | * | ||
1642 | * policy->v.nodes is intersected with node_states[N_MEMORY], | ||
1643 | * so if the following test fails, it implies | ||
1644 | * policy->v.nodes has movable memory only. | ||
1645 | */ | ||
1646 | if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) | ||
1647 | dynamic_policy_zone = ZONE_MOVABLE; | ||
1648 | |||
1649 | return zone >= dynamic_policy_zone; | ||
1650 | } | ||
1651 | |||
1647 | /* | 1652 | /* |
1648 | * Return a nodemask representing a mempolicy for filtering nodes for | 1653 | * Return a nodemask representing a mempolicy for filtering nodes for |
1649 | * page allocation | 1654 | * page allocation |
@@ -1652,7 +1657,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) | |||
1652 | { | 1657 | { |
1653 | /* Lower zones don't get a nodemask applied for MPOL_BIND */ | 1658 | /* Lower zones don't get a nodemask applied for MPOL_BIND */ |
1654 | if (unlikely(policy->mode == MPOL_BIND) && | 1659 | if (unlikely(policy->mode == MPOL_BIND) && |
1655 | gfp_zone(gfp) >= policy_zone && | 1660 | apply_policy_zone(policy, gfp_zone(gfp)) && |
1656 | cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) | 1661 | cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) |
1657 | return &policy->v.nodes; | 1662 | return &policy->v.nodes; |
1658 | 1663 | ||
@@ -2308,7 +2313,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long | |||
2308 | * it less likely we act on an unlikely task<->page | 2313 | * it less likely we act on an unlikely task<->page |
2309 | * relation. | 2314 | * relation. |
2310 | */ | 2315 | */ |
2311 | last_nid = page_xchg_last_nid(page, polnid); | 2316 | last_nid = page_nid_xchg_last(page, polnid); |
2312 | if (last_nid != polnid) | 2317 | if (last_nid != polnid) |
2313 | goto out; | 2318 | goto out; |
2314 | } | 2319 | } |
@@ -2483,7 +2488,7 @@ int mpol_set_shared_policy(struct shared_policy *info, | |||
2483 | vma->vm_pgoff, | 2488 | vma->vm_pgoff, |
2484 | sz, npol ? npol->mode : -1, | 2489 | sz, npol ? npol->mode : -1, |
2485 | npol ? npol->flags : -1, | 2490 | npol ? npol->flags : -1, |
2486 | npol ? nodes_addr(npol->v.nodes)[0] : -1); | 2491 | npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE); |
2487 | 2492 | ||
2488 | if (npol) { | 2493 | if (npol) { |
2489 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); | 2494 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); |
diff --git a/mm/migrate.c b/mm/migrate.c index 2fd8b4af4744..3bbaf5d230b0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -464,7 +464,10 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
464 | 464 | ||
465 | mlock_migrate_page(newpage, page); | 465 | mlock_migrate_page(newpage, page); |
466 | ksm_migrate_page(newpage, page); | 466 | ksm_migrate_page(newpage, page); |
467 | 467 | /* | |
468 | * Please do not reorder this without considering how mm/ksm.c's | ||
469 | * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). | ||
470 | */ | ||
468 | ClearPageSwapCache(page); | 471 | ClearPageSwapCache(page); |
469 | ClearPagePrivate(page); | 472 | ClearPagePrivate(page); |
470 | set_page_private(page, 0); | 473 | set_page_private(page, 0); |
@@ -698,7 +701,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
698 | } | 701 | } |
699 | 702 | ||
700 | static int __unmap_and_move(struct page *page, struct page *newpage, | 703 | static int __unmap_and_move(struct page *page, struct page *newpage, |
701 | int force, bool offlining, enum migrate_mode mode) | 704 | int force, enum migrate_mode mode) |
702 | { | 705 | { |
703 | int rc = -EAGAIN; | 706 | int rc = -EAGAIN; |
704 | int remap_swapcache = 1; | 707 | int remap_swapcache = 1; |
@@ -728,20 +731,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
728 | lock_page(page); | 731 | lock_page(page); |
729 | } | 732 | } |
730 | 733 | ||
731 | /* | ||
732 | * Only memory hotplug's offline_pages() caller has locked out KSM, | ||
733 | * and can safely migrate a KSM page. The other cases have skipped | ||
734 | * PageKsm along with PageReserved - but it is only now when we have | ||
735 | * the page lock that we can be certain it will not go KSM beneath us | ||
736 | * (KSM will not upgrade a page from PageAnon to PageKsm when it sees | ||
737 | * its pagecount raised, but only here do we take the page lock which | ||
738 | * serializes that). | ||
739 | */ | ||
740 | if (PageKsm(page) && !offlining) { | ||
741 | rc = -EBUSY; | ||
742 | goto unlock; | ||
743 | } | ||
744 | |||
745 | /* charge against new page */ | 734 | /* charge against new page */ |
746 | mem_cgroup_prepare_migration(page, newpage, &mem); | 735 | mem_cgroup_prepare_migration(page, newpage, &mem); |
747 | 736 | ||
@@ -768,7 +757,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
768 | * File Caches may use write_page() or lock_page() in migration, then, | 757 | * File Caches may use write_page() or lock_page() in migration, then, |
769 | * just care Anon page here. | 758 | * just care Anon page here. |
770 | */ | 759 | */ |
771 | if (PageAnon(page)) { | 760 | if (PageAnon(page) && !PageKsm(page)) { |
772 | /* | 761 | /* |
773 | * Only page_lock_anon_vma_read() understands the subtleties of | 762 | * Only page_lock_anon_vma_read() understands the subtleties of |
774 | * getting a hold on an anon_vma from outside one of its mms. | 763 | * getting a hold on an anon_vma from outside one of its mms. |
@@ -848,7 +837,6 @@ uncharge: | |||
848 | mem_cgroup_end_migration(mem, page, newpage, | 837 | mem_cgroup_end_migration(mem, page, newpage, |
849 | (rc == MIGRATEPAGE_SUCCESS || | 838 | (rc == MIGRATEPAGE_SUCCESS || |
850 | rc == MIGRATEPAGE_BALLOON_SUCCESS)); | 839 | rc == MIGRATEPAGE_BALLOON_SUCCESS)); |
851 | unlock: | ||
852 | unlock_page(page); | 840 | unlock_page(page); |
853 | out: | 841 | out: |
854 | return rc; | 842 | return rc; |
@@ -859,8 +847,7 @@ out: | |||
859 | * to the newly allocated page in newpage. | 847 | * to the newly allocated page in newpage. |
860 | */ | 848 | */ |
861 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | 849 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, |
862 | struct page *page, int force, bool offlining, | 850 | struct page *page, int force, enum migrate_mode mode) |
863 | enum migrate_mode mode) | ||
864 | { | 851 | { |
865 | int rc = 0; | 852 | int rc = 0; |
866 | int *result = NULL; | 853 | int *result = NULL; |
@@ -878,7 +865,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
878 | if (unlikely(split_huge_page(page))) | 865 | if (unlikely(split_huge_page(page))) |
879 | goto out; | 866 | goto out; |
880 | 867 | ||
881 | rc = __unmap_and_move(page, newpage, force, offlining, mode); | 868 | rc = __unmap_and_move(page, newpage, force, mode); |
882 | 869 | ||
883 | if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { | 870 | if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { |
884 | /* | 871 | /* |
@@ -938,8 +925,7 @@ out: | |||
938 | */ | 925 | */ |
939 | static int unmap_and_move_huge_page(new_page_t get_new_page, | 926 | static int unmap_and_move_huge_page(new_page_t get_new_page, |
940 | unsigned long private, struct page *hpage, | 927 | unsigned long private, struct page *hpage, |
941 | int force, bool offlining, | 928 | int force, enum migrate_mode mode) |
942 | enum migrate_mode mode) | ||
943 | { | 929 | { |
944 | int rc = 0; | 930 | int rc = 0; |
945 | int *result = NULL; | 931 | int *result = NULL; |
@@ -1001,9 +987,8 @@ out: | |||
1001 | * | 987 | * |
1002 | * Return: Number of pages not migrated or error code. | 988 | * Return: Number of pages not migrated or error code. |
1003 | */ | 989 | */ |
1004 | int migrate_pages(struct list_head *from, | 990 | int migrate_pages(struct list_head *from, new_page_t get_new_page, |
1005 | new_page_t get_new_page, unsigned long private, bool offlining, | 991 | unsigned long private, enum migrate_mode mode, int reason) |
1006 | enum migrate_mode mode, int reason) | ||
1007 | { | 992 | { |
1008 | int retry = 1; | 993 | int retry = 1; |
1009 | int nr_failed = 0; | 994 | int nr_failed = 0; |
@@ -1024,8 +1009,7 @@ int migrate_pages(struct list_head *from, | |||
1024 | cond_resched(); | 1009 | cond_resched(); |
1025 | 1010 | ||
1026 | rc = unmap_and_move(get_new_page, private, | 1011 | rc = unmap_and_move(get_new_page, private, |
1027 | page, pass > 2, offlining, | 1012 | page, pass > 2, mode); |
1028 | mode); | ||
1029 | 1013 | ||
1030 | switch(rc) { | 1014 | switch(rc) { |
1031 | case -ENOMEM: | 1015 | case -ENOMEM: |
@@ -1058,15 +1042,13 @@ out: | |||
1058 | } | 1042 | } |
1059 | 1043 | ||
1060 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, | 1044 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, |
1061 | unsigned long private, bool offlining, | 1045 | unsigned long private, enum migrate_mode mode) |
1062 | enum migrate_mode mode) | ||
1063 | { | 1046 | { |
1064 | int pass, rc; | 1047 | int pass, rc; |
1065 | 1048 | ||
1066 | for (pass = 0; pass < 10; pass++) { | 1049 | for (pass = 0; pass < 10; pass++) { |
1067 | rc = unmap_and_move_huge_page(get_new_page, | 1050 | rc = unmap_and_move_huge_page(get_new_page, private, |
1068 | private, hpage, pass > 2, offlining, | 1051 | hpage, pass > 2, mode); |
1069 | mode); | ||
1070 | switch (rc) { | 1052 | switch (rc) { |
1071 | case -ENOMEM: | 1053 | case -ENOMEM: |
1072 | goto out; | 1054 | goto out; |
@@ -1152,7 +1134,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
1152 | goto set_status; | 1134 | goto set_status; |
1153 | 1135 | ||
1154 | /* Use PageReserved to check for zero page */ | 1136 | /* Use PageReserved to check for zero page */ |
1155 | if (PageReserved(page) || PageKsm(page)) | 1137 | if (PageReserved(page)) |
1156 | goto put_and_set; | 1138 | goto put_and_set; |
1157 | 1139 | ||
1158 | pp->page = page; | 1140 | pp->page = page; |
@@ -1189,8 +1171,7 @@ set_status: | |||
1189 | err = 0; | 1171 | err = 0; |
1190 | if (!list_empty(&pagelist)) { | 1172 | if (!list_empty(&pagelist)) { |
1191 | err = migrate_pages(&pagelist, new_page_node, | 1173 | err = migrate_pages(&pagelist, new_page_node, |
1192 | (unsigned long)pm, 0, MIGRATE_SYNC, | 1174 | (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); |
1193 | MR_SYSCALL); | ||
1194 | if (err) | 1175 | if (err) |
1195 | putback_lru_pages(&pagelist); | 1176 | putback_lru_pages(&pagelist); |
1196 | } | 1177 | } |
@@ -1314,7 +1295,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, | |||
1314 | 1295 | ||
1315 | err = -ENOENT; | 1296 | err = -ENOENT; |
1316 | /* Use PageReserved to check for zero page */ | 1297 | /* Use PageReserved to check for zero page */ |
1317 | if (!page || PageReserved(page) || PageKsm(page)) | 1298 | if (!page || PageReserved(page)) |
1318 | goto set_status; | 1299 | goto set_status; |
1319 | 1300 | ||
1320 | err = page_to_nid(page); | 1301 | err = page_to_nid(page); |
@@ -1461,7 +1442,7 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, | |||
1461 | * pages. Currently it only checks the watermarks which crude | 1442 | * pages. Currently it only checks the watermarks which crude |
1462 | */ | 1443 | */ |
1463 | static bool migrate_balanced_pgdat(struct pglist_data *pgdat, | 1444 | static bool migrate_balanced_pgdat(struct pglist_data *pgdat, |
1464 | int nr_migrate_pages) | 1445 | unsigned long nr_migrate_pages) |
1465 | { | 1446 | { |
1466 | int z; | 1447 | int z; |
1467 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { | 1448 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { |
@@ -1497,7 +1478,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, | |||
1497 | __GFP_NOWARN) & | 1478 | __GFP_NOWARN) & |
1498 | ~GFP_IOFS, 0); | 1479 | ~GFP_IOFS, 0); |
1499 | if (newpage) | 1480 | if (newpage) |
1500 | page_xchg_last_nid(newpage, page_last_nid(page)); | 1481 | page_nid_xchg_last(newpage, page_nid_last(page)); |
1501 | 1482 | ||
1502 | return newpage; | 1483 | return newpage; |
1503 | } | 1484 | } |
@@ -1557,39 +1538,40 @@ bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) | |||
1557 | 1538 | ||
1558 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | 1539 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) |
1559 | { | 1540 | { |
1560 | int ret = 0; | 1541 | int page_lru; |
1542 | |||
1543 | VM_BUG_ON(compound_order(page) && !PageTransHuge(page)); | ||
1561 | 1544 | ||
1562 | /* Avoid migrating to a node that is nearly full */ | 1545 | /* Avoid migrating to a node that is nearly full */ |
1563 | if (migrate_balanced_pgdat(pgdat, 1)) { | 1546 | if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) |
1564 | int page_lru; | 1547 | return 0; |
1565 | 1548 | ||
1566 | if (isolate_lru_page(page)) { | 1549 | if (isolate_lru_page(page)) |
1567 | put_page(page); | 1550 | return 0; |
1568 | return 0; | ||
1569 | } | ||
1570 | 1551 | ||
1571 | /* Page is isolated */ | 1552 | /* |
1572 | ret = 1; | 1553 | * migrate_misplaced_transhuge_page() skips page migration's usual |
1573 | page_lru = page_is_file_cache(page); | 1554 | * check on page_count(), so we must do it here, now that the page |
1574 | if (!PageTransHuge(page)) | 1555 | * has been isolated: a GUP pin, or any other pin, prevents migration. |
1575 | inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); | 1556 | * The expected page count is 3: 1 for page's mapcount and 1 for the |
1576 | else | 1557 | * caller's pin and 1 for the reference taken by isolate_lru_page(). |
1577 | mod_zone_page_state(page_zone(page), | 1558 | */ |
1578 | NR_ISOLATED_ANON + page_lru, | 1559 | if (PageTransHuge(page) && page_count(page) != 3) { |
1579 | HPAGE_PMD_NR); | 1560 | putback_lru_page(page); |
1561 | return 0; | ||
1580 | } | 1562 | } |
1581 | 1563 | ||
1564 | page_lru = page_is_file_cache(page); | ||
1565 | mod_zone_page_state(page_zone(page), NR_ISOLATED_ANON + page_lru, | ||
1566 | hpage_nr_pages(page)); | ||
1567 | |||
1582 | /* | 1568 | /* |
1583 | * Page is either isolated or there is not enough space on the target | 1569 | * Isolating the page has taken another reference, so the |
1584 | * node. If isolated, then it has taken a reference count and the | 1570 | * caller's reference can be safely dropped without the page |
1585 | * callers reference can be safely dropped without the page | 1571 | * disappearing underneath us during migration. |
1586 | * disappearing underneath us during migration. Otherwise the page is | ||
1587 | * not to be migrated but the callers reference should still be | ||
1588 | * dropped so it does not leak. | ||
1589 | */ | 1572 | */ |
1590 | put_page(page); | 1573 | put_page(page); |
1591 | 1574 | return 1; | |
1592 | return ret; | ||
1593 | } | 1575 | } |
1594 | 1576 | ||
1595 | /* | 1577 | /* |
@@ -1600,7 +1582,7 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | |||
1600 | int migrate_misplaced_page(struct page *page, int node) | 1582 | int migrate_misplaced_page(struct page *page, int node) |
1601 | { | 1583 | { |
1602 | pg_data_t *pgdat = NODE_DATA(node); | 1584 | pg_data_t *pgdat = NODE_DATA(node); |
1603 | int isolated = 0; | 1585 | int isolated; |
1604 | int nr_remaining; | 1586 | int nr_remaining; |
1605 | LIST_HEAD(migratepages); | 1587 | LIST_HEAD(migratepages); |
1606 | 1588 | ||
@@ -1608,42 +1590,43 @@ int migrate_misplaced_page(struct page *page, int node) | |||
1608 | * Don't migrate pages that are mapped in multiple processes. | 1590 | * Don't migrate pages that are mapped in multiple processes. |
1609 | * TODO: Handle false sharing detection instead of this hammer | 1591 | * TODO: Handle false sharing detection instead of this hammer |
1610 | */ | 1592 | */ |
1611 | if (page_mapcount(page) != 1) { | 1593 | if (page_mapcount(page) != 1) |
1612 | put_page(page); | ||
1613 | goto out; | 1594 | goto out; |
1614 | } | ||
1615 | 1595 | ||
1616 | /* | 1596 | /* |
1617 | * Rate-limit the amount of data that is being migrated to a node. | 1597 | * Rate-limit the amount of data that is being migrated to a node. |
1618 | * Optimal placement is no good if the memory bus is saturated and | 1598 | * Optimal placement is no good if the memory bus is saturated and |
1619 | * all the time is being spent migrating! | 1599 | * all the time is being spent migrating! |
1620 | */ | 1600 | */ |
1621 | if (numamigrate_update_ratelimit(pgdat, 1)) { | 1601 | if (numamigrate_update_ratelimit(pgdat, 1)) |
1622 | put_page(page); | ||
1623 | goto out; | 1602 | goto out; |
1624 | } | ||
1625 | 1603 | ||
1626 | isolated = numamigrate_isolate_page(pgdat, page); | 1604 | isolated = numamigrate_isolate_page(pgdat, page); |
1627 | if (!isolated) | 1605 | if (!isolated) |
1628 | goto out; | 1606 | goto out; |
1629 | 1607 | ||
1630 | list_add(&page->lru, &migratepages); | 1608 | list_add(&page->lru, &migratepages); |
1631 | nr_remaining = migrate_pages(&migratepages, | 1609 | nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, |
1632 | alloc_misplaced_dst_page, | 1610 | node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); |
1633 | node, false, MIGRATE_ASYNC, | ||
1634 | MR_NUMA_MISPLACED); | ||
1635 | if (nr_remaining) { | 1611 | if (nr_remaining) { |
1636 | putback_lru_pages(&migratepages); | 1612 | putback_lru_pages(&migratepages); |
1637 | isolated = 0; | 1613 | isolated = 0; |
1638 | } else | 1614 | } else |
1639 | count_vm_numa_event(NUMA_PAGE_MIGRATE); | 1615 | count_vm_numa_event(NUMA_PAGE_MIGRATE); |
1640 | BUG_ON(!list_empty(&migratepages)); | 1616 | BUG_ON(!list_empty(&migratepages)); |
1641 | out: | ||
1642 | return isolated; | 1617 | return isolated; |
1618 | |||
1619 | out: | ||
1620 | put_page(page); | ||
1621 | return 0; | ||
1643 | } | 1622 | } |
1644 | #endif /* CONFIG_NUMA_BALANCING */ | 1623 | #endif /* CONFIG_NUMA_BALANCING */ |
1645 | 1624 | ||
1646 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) | 1625 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) |
1626 | /* | ||
1627 | * Migrates a THP to a given target node. page must be locked and is unlocked | ||
1628 | * before returning. | ||
1629 | */ | ||
1647 | int migrate_misplaced_transhuge_page(struct mm_struct *mm, | 1630 | int migrate_misplaced_transhuge_page(struct mm_struct *mm, |
1648 | struct vm_area_struct *vma, | 1631 | struct vm_area_struct *vma, |
1649 | pmd_t *pmd, pmd_t entry, | 1632 | pmd_t *pmd, pmd_t entry, |
@@ -1674,29 +1657,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1674 | 1657 | ||
1675 | new_page = alloc_pages_node(node, | 1658 | new_page = alloc_pages_node(node, |
1676 | (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); | 1659 | (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); |
1677 | if (!new_page) { | 1660 | if (!new_page) |
1678 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | 1661 | goto out_fail; |
1679 | goto out_dropref; | ||
1680 | } | ||
1681 | page_xchg_last_nid(new_page, page_last_nid(page)); | ||
1682 | 1662 | ||
1683 | isolated = numamigrate_isolate_page(pgdat, page); | 1663 | page_nid_xchg_last(new_page, page_nid_last(page)); |
1684 | 1664 | ||
1685 | /* | 1665 | isolated = numamigrate_isolate_page(pgdat, page); |
1686 | * Failing to isolate or a GUP pin prevents migration. The expected | 1666 | if (!isolated) { |
1687 | * page count is 2. 1 for anonymous pages without a mapping and 1 | ||
1688 | * for the callers pin. If the page was isolated, the page will | ||
1689 | * need to be put back on the LRU. | ||
1690 | */ | ||
1691 | if (!isolated || page_count(page) != 2) { | ||
1692 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1693 | put_page(new_page); | 1667 | put_page(new_page); |
1694 | if (isolated) { | 1668 | goto out_fail; |
1695 | putback_lru_page(page); | ||
1696 | isolated = 0; | ||
1697 | goto out; | ||
1698 | } | ||
1699 | goto out_keep_locked; | ||
1700 | } | 1669 | } |
1701 | 1670 | ||
1702 | /* Prepare a page as a migration target */ | 1671 | /* Prepare a page as a migration target */ |
@@ -1728,6 +1697,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1728 | putback_lru_page(page); | 1697 | putback_lru_page(page); |
1729 | 1698 | ||
1730 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | 1699 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); |
1700 | isolated = 0; | ||
1731 | goto out; | 1701 | goto out; |
1732 | } | 1702 | } |
1733 | 1703 | ||
@@ -1772,9 +1742,11 @@ out: | |||
1772 | -HPAGE_PMD_NR); | 1742 | -HPAGE_PMD_NR); |
1773 | return isolated; | 1743 | return isolated; |
1774 | 1744 | ||
1745 | out_fail: | ||
1746 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1775 | out_dropref: | 1747 | out_dropref: |
1748 | unlock_page(page); | ||
1776 | put_page(page); | 1749 | put_page(page); |
1777 | out_keep_locked: | ||
1778 | return 0; | 1750 | return 0; |
1779 | } | 1751 | } |
1780 | #endif /* CONFIG_NUMA_BALANCING */ | 1752 | #endif /* CONFIG_NUMA_BALANCING */ |
diff --git a/mm/mincore.c b/mm/mincore.c index 936b4cee8cb1..da2be56a7b8f 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) | |||
75 | /* shmem/tmpfs may return swap: account for swapcache page too. */ | 75 | /* shmem/tmpfs may return swap: account for swapcache page too. */ |
76 | if (radix_tree_exceptional_entry(page)) { | 76 | if (radix_tree_exceptional_entry(page)) { |
77 | swp_entry_t swap = radix_to_swp_entry(page); | 77 | swp_entry_t swap = radix_to_swp_entry(page); |
78 | page = find_get_page(&swapper_space, swap.val); | 78 | page = find_get_page(swap_address_space(swap), swap.val); |
79 | } | 79 | } |
80 | #endif | 80 | #endif |
81 | if (page) { | 81 | if (page) { |
@@ -135,7 +135,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
135 | } else { | 135 | } else { |
136 | #ifdef CONFIG_SWAP | 136 | #ifdef CONFIG_SWAP |
137 | pgoff = entry.val; | 137 | pgoff = entry.val; |
138 | *vec = mincore_page(&swapper_space, pgoff); | 138 | *vec = mincore_page(swap_address_space(entry), |
139 | pgoff); | ||
139 | #else | 140 | #else |
140 | WARN_ON(1); | 141 | WARN_ON(1); |
141 | *vec = 1; | 142 | *vec = 1; |
diff --git a/mm/mlock.c b/mm/mlock.c index c9bd528b01d2..e6638f565d42 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -155,13 +155,12 @@ void munlock_vma_page(struct page *page) | |||
155 | * | 155 | * |
156 | * vma->vm_mm->mmap_sem must be held for at least read. | 156 | * vma->vm_mm->mmap_sem must be held for at least read. |
157 | */ | 157 | */ |
158 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | 158 | long __mlock_vma_pages_range(struct vm_area_struct *vma, |
159 | unsigned long start, unsigned long end, | 159 | unsigned long start, unsigned long end, int *nonblocking) |
160 | int *nonblocking) | ||
161 | { | 160 | { |
162 | struct mm_struct *mm = vma->vm_mm; | 161 | struct mm_struct *mm = vma->vm_mm; |
163 | unsigned long addr = start; | 162 | unsigned long addr = start; |
164 | int nr_pages = (end - start) / PAGE_SIZE; | 163 | unsigned long nr_pages = (end - start) / PAGE_SIZE; |
165 | int gup_flags; | 164 | int gup_flags; |
166 | 165 | ||
167 | VM_BUG_ON(start & ~PAGE_MASK); | 166 | VM_BUG_ON(start & ~PAGE_MASK); |
@@ -186,6 +185,10 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, | |||
186 | if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) | 185 | if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) |
187 | gup_flags |= FOLL_FORCE; | 186 | gup_flags |= FOLL_FORCE; |
188 | 187 | ||
188 | /* | ||
189 | * We made sure addr is within a VMA, so the following will | ||
190 | * not result in a stack expansion that recurses back here. | ||
191 | */ | ||
189 | return __get_user_pages(current, mm, addr, nr_pages, gup_flags, | 192 | return __get_user_pages(current, mm, addr, nr_pages, gup_flags, |
190 | NULL, NULL, nonblocking); | 193 | NULL, NULL, nonblocking); |
191 | } | 194 | } |
@@ -202,56 +205,6 @@ static int __mlock_posix_error_return(long retval) | |||
202 | return retval; | 205 | return retval; |
203 | } | 206 | } |
204 | 207 | ||
205 | /** | ||
206 | * mlock_vma_pages_range() - mlock pages in specified vma range. | ||
207 | * @vma - the vma containing the specfied address range | ||
208 | * @start - starting address in @vma to mlock | ||
209 | * @end - end address [+1] in @vma to mlock | ||
210 | * | ||
211 | * For mmap()/mremap()/expansion of mlocked vma. | ||
212 | * | ||
213 | * return 0 on success for "normal" vmas. | ||
214 | * | ||
215 | * return number of pages [> 0] to be removed from locked_vm on success | ||
216 | * of "special" vmas. | ||
217 | */ | ||
218 | long mlock_vma_pages_range(struct vm_area_struct *vma, | ||
219 | unsigned long start, unsigned long end) | ||
220 | { | ||
221 | int nr_pages = (end - start) / PAGE_SIZE; | ||
222 | BUG_ON(!(vma->vm_flags & VM_LOCKED)); | ||
223 | |||
224 | /* | ||
225 | * filter unlockable vmas | ||
226 | */ | ||
227 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | ||
228 | goto no_mlock; | ||
229 | |||
230 | if (!((vma->vm_flags & VM_DONTEXPAND) || | ||
231 | is_vm_hugetlb_page(vma) || | ||
232 | vma == get_gate_vma(current->mm))) { | ||
233 | |||
234 | __mlock_vma_pages_range(vma, start, end, NULL); | ||
235 | |||
236 | /* Hide errors from mmap() and other callers */ | ||
237 | return 0; | ||
238 | } | ||
239 | |||
240 | /* | ||
241 | * User mapped kernel pages or huge pages: | ||
242 | * make these pages present to populate the ptes, but | ||
243 | * fall thru' to reset VM_LOCKED--no need to unlock, and | ||
244 | * return nr_pages so these don't get counted against task's | ||
245 | * locked limit. huge pages are already counted against | ||
246 | * locked vm limit. | ||
247 | */ | ||
248 | make_pages_present(start, end); | ||
249 | |||
250 | no_mlock: | ||
251 | vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */ | ||
252 | return nr_pages; /* error or pages NOT mlocked */ | ||
253 | } | ||
254 | |||
255 | /* | 208 | /* |
256 | * munlock_vma_pages_range() - munlock all pages in the vma range.' | 209 | * munlock_vma_pages_range() - munlock all pages in the vma range.' |
257 | * @vma - vma containing range to be munlock()ed. | 210 | * @vma - vma containing range to be munlock()ed. |
@@ -303,7 +256,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, | |||
303 | * | 256 | * |
304 | * Filters out "special" vmas -- VM_LOCKED never gets set for these, and | 257 | * Filters out "special" vmas -- VM_LOCKED never gets set for these, and |
305 | * munlock is a no-op. However, for some special vmas, we go ahead and | 258 | * munlock is a no-op. However, for some special vmas, we go ahead and |
306 | * populate the ptes via make_pages_present(). | 259 | * populate the ptes. |
307 | * | 260 | * |
308 | * For vmas that pass the filters, merge/split as appropriate. | 261 | * For vmas that pass the filters, merge/split as appropriate. |
309 | */ | 262 | */ |
@@ -391,9 +344,9 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
391 | 344 | ||
392 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ | 345 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ |
393 | 346 | ||
394 | newflags = vma->vm_flags | VM_LOCKED; | 347 | newflags = vma->vm_flags & ~VM_LOCKED; |
395 | if (!on) | 348 | if (on) |
396 | newflags &= ~VM_LOCKED; | 349 | newflags |= VM_LOCKED | VM_POPULATE; |
397 | 350 | ||
398 | tmp = vma->vm_end; | 351 | tmp = vma->vm_end; |
399 | if (tmp > end) | 352 | if (tmp > end) |
@@ -416,13 +369,20 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
416 | return error; | 369 | return error; |
417 | } | 370 | } |
418 | 371 | ||
419 | static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) | 372 | /* |
373 | * __mm_populate - populate and/or mlock pages within a range of address space. | ||
374 | * | ||
375 | * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap | ||
376 | * flags. VMAs must be already marked with the desired vm_flags, and | ||
377 | * mmap_sem must not be held. | ||
378 | */ | ||
379 | int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) | ||
420 | { | 380 | { |
421 | struct mm_struct *mm = current->mm; | 381 | struct mm_struct *mm = current->mm; |
422 | unsigned long end, nstart, nend; | 382 | unsigned long end, nstart, nend; |
423 | struct vm_area_struct *vma = NULL; | 383 | struct vm_area_struct *vma = NULL; |
424 | int locked = 0; | 384 | int locked = 0; |
425 | int ret = 0; | 385 | long ret = 0; |
426 | 386 | ||
427 | VM_BUG_ON(start & ~PAGE_MASK); | 387 | VM_BUG_ON(start & ~PAGE_MASK); |
428 | VM_BUG_ON(len != PAGE_ALIGN(len)); | 388 | VM_BUG_ON(len != PAGE_ALIGN(len)); |
@@ -446,7 +406,8 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) | |||
446 | * range with the first VMA. Also, skip undesirable VMA types. | 406 | * range with the first VMA. Also, skip undesirable VMA types. |
447 | */ | 407 | */ |
448 | nend = min(end, vma->vm_end); | 408 | nend = min(end, vma->vm_end); |
449 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | 409 | if ((vma->vm_flags & (VM_IO | VM_PFNMAP | VM_POPULATE)) != |
410 | VM_POPULATE) | ||
450 | continue; | 411 | continue; |
451 | if (nstart < vma->vm_start) | 412 | if (nstart < vma->vm_start) |
452 | nstart = vma->vm_start; | 413 | nstart = vma->vm_start; |
@@ -498,7 +459,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
498 | error = do_mlock(start, len, 1); | 459 | error = do_mlock(start, len, 1); |
499 | up_write(¤t->mm->mmap_sem); | 460 | up_write(¤t->mm->mmap_sem); |
500 | if (!error) | 461 | if (!error) |
501 | error = do_mlock_pages(start, len, 0); | 462 | error = __mm_populate(start, len, 0); |
502 | return error; | 463 | return error; |
503 | } | 464 | } |
504 | 465 | ||
@@ -519,18 +480,18 @@ static int do_mlockall(int flags) | |||
519 | struct vm_area_struct * vma, * prev = NULL; | 480 | struct vm_area_struct * vma, * prev = NULL; |
520 | 481 | ||
521 | if (flags & MCL_FUTURE) | 482 | if (flags & MCL_FUTURE) |
522 | current->mm->def_flags |= VM_LOCKED; | 483 | current->mm->def_flags |= VM_LOCKED | VM_POPULATE; |
523 | else | 484 | else |
524 | current->mm->def_flags &= ~VM_LOCKED; | 485 | current->mm->def_flags &= ~(VM_LOCKED | VM_POPULATE); |
525 | if (flags == MCL_FUTURE) | 486 | if (flags == MCL_FUTURE) |
526 | goto out; | 487 | goto out; |
527 | 488 | ||
528 | for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { | 489 | for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { |
529 | vm_flags_t newflags; | 490 | vm_flags_t newflags; |
530 | 491 | ||
531 | newflags = vma->vm_flags | VM_LOCKED; | 492 | newflags = vma->vm_flags & ~VM_LOCKED; |
532 | if (!(flags & MCL_CURRENT)) | 493 | if (flags & MCL_CURRENT) |
533 | newflags &= ~VM_LOCKED; | 494 | newflags |= VM_LOCKED | VM_POPULATE; |
534 | 495 | ||
535 | /* Ignore errors */ | 496 | /* Ignore errors */ |
536 | mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); | 497 | mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); |
@@ -564,10 +525,8 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
564 | capable(CAP_IPC_LOCK)) | 525 | capable(CAP_IPC_LOCK)) |
565 | ret = do_mlockall(flags); | 526 | ret = do_mlockall(flags); |
566 | up_write(¤t->mm->mmap_sem); | 527 | up_write(¤t->mm->mmap_sem); |
567 | if (!ret && (flags & MCL_CURRENT)) { | 528 | if (!ret && (flags & MCL_CURRENT)) |
568 | /* Ignore errors */ | 529 | mm_populate(0, TASK_SIZE); |
569 | do_mlock_pages(0, TASK_SIZE, 1); | ||
570 | } | ||
571 | out: | 530 | out: |
572 | return ret; | 531 | return ret; |
573 | } | 532 | } |
diff --git a/mm/mm_init.c b/mm/mm_init.c index 1ffd97ae26d7..c280a02ea11e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c | |||
@@ -69,34 +69,41 @@ void __init mminit_verify_pageflags_layout(void) | |||
69 | unsigned long or_mask, add_mask; | 69 | unsigned long or_mask, add_mask; |
70 | 70 | ||
71 | shift = 8 * sizeof(unsigned long); | 71 | shift = 8 * sizeof(unsigned long); |
72 | width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; | 72 | width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT; |
73 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", | 73 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", |
74 | "Section %d Node %d Zone %d Flags %d\n", | 74 | "Section %d Node %d Zone %d Lastnid %d Flags %d\n", |
75 | SECTIONS_WIDTH, | 75 | SECTIONS_WIDTH, |
76 | NODES_WIDTH, | 76 | NODES_WIDTH, |
77 | ZONES_WIDTH, | 77 | ZONES_WIDTH, |
78 | LAST_NID_WIDTH, | ||
78 | NR_PAGEFLAGS); | 79 | NR_PAGEFLAGS); |
79 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", | 80 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", |
80 | "Section %d Node %d Zone %d\n", | 81 | "Section %d Node %d Zone %d Lastnid %d\n", |
81 | SECTIONS_SHIFT, | 82 | SECTIONS_SHIFT, |
82 | NODES_SHIFT, | 83 | NODES_SHIFT, |
83 | ZONES_SHIFT); | 84 | ZONES_SHIFT, |
84 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", | 85 | LAST_NID_SHIFT); |
85 | "Section %lu Node %lu Zone %lu\n", | 86 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", |
87 | "Section %lu Node %lu Zone %lu Lastnid %lu\n", | ||
86 | (unsigned long)SECTIONS_PGSHIFT, | 88 | (unsigned long)SECTIONS_PGSHIFT, |
87 | (unsigned long)NODES_PGSHIFT, | 89 | (unsigned long)NODES_PGSHIFT, |
88 | (unsigned long)ZONES_PGSHIFT); | 90 | (unsigned long)ZONES_PGSHIFT, |
89 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", | 91 | (unsigned long)LAST_NID_PGSHIFT); |
90 | "Zone ID: %lu -> %lu\n", | 92 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", |
91 | (unsigned long)ZONEID_PGOFF, | 93 | "Node/Zone ID: %lu -> %lu\n", |
92 | (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); | 94 | (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), |
95 | (unsigned long)ZONEID_PGOFF); | ||
93 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", | 96 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", |
94 | "location: %d -> %d unused %d -> %d flags %d -> %d\n", | 97 | "location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n", |
95 | shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); | 98 | shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); |
96 | #ifdef NODE_NOT_IN_PAGE_FLAGS | 99 | #ifdef NODE_NOT_IN_PAGE_FLAGS |
97 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", | 100 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", |
98 | "Node not in page flags"); | 101 | "Node not in page flags"); |
99 | #endif | 102 | #endif |
103 | #ifdef LAST_NID_NOT_IN_PAGE_FLAGS | ||
104 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", | ||
105 | "Last nid not in page flags"); | ||
106 | #endif | ||
100 | 107 | ||
101 | if (SECTIONS_WIDTH) { | 108 | if (SECTIONS_WIDTH) { |
102 | shift -= SECTIONS_WIDTH; | 109 | shift -= SECTIONS_WIDTH; |
@@ -144,7 +144,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
144 | */ | 144 | */ |
145 | free -= global_page_state(NR_SHMEM); | 145 | free -= global_page_state(NR_SHMEM); |
146 | 146 | ||
147 | free += nr_swap_pages; | 147 | free += get_nr_swap_pages(); |
148 | 148 | ||
149 | /* | 149 | /* |
150 | * Any slabs which are created with the | 150 | * Any slabs which are created with the |
@@ -256,6 +256,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
256 | unsigned long newbrk, oldbrk; | 256 | unsigned long newbrk, oldbrk; |
257 | struct mm_struct *mm = current->mm; | 257 | struct mm_struct *mm = current->mm; |
258 | unsigned long min_brk; | 258 | unsigned long min_brk; |
259 | bool populate; | ||
259 | 260 | ||
260 | down_write(&mm->mmap_sem); | 261 | down_write(&mm->mmap_sem); |
261 | 262 | ||
@@ -305,8 +306,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
305 | /* Ok, looks good - let it rip. */ | 306 | /* Ok, looks good - let it rip. */ |
306 | if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) | 307 | if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) |
307 | goto out; | 308 | goto out; |
309 | |||
308 | set_brk: | 310 | set_brk: |
309 | mm->brk = brk; | 311 | mm->brk = brk; |
312 | populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0; | ||
313 | up_write(&mm->mmap_sem); | ||
314 | if (populate) | ||
315 | mm_populate(oldbrk, newbrk - oldbrk); | ||
316 | return brk; | ||
317 | |||
310 | out: | 318 | out: |
311 | retval = mm->brk; | 319 | retval = mm->brk; |
312 | up_write(&mm->mmap_sem); | 320 | up_write(&mm->mmap_sem); |
@@ -801,7 +809,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
801 | anon_vma_interval_tree_post_update_vma(vma); | 809 | anon_vma_interval_tree_post_update_vma(vma); |
802 | if (adjust_next) | 810 | if (adjust_next) |
803 | anon_vma_interval_tree_post_update_vma(next); | 811 | anon_vma_interval_tree_post_update_vma(next); |
804 | anon_vma_unlock(anon_vma); | 812 | anon_vma_unlock_write(anon_vma); |
805 | } | 813 | } |
806 | if (mapping) | 814 | if (mapping) |
807 | mutex_unlock(&mapping->i_mmap_mutex); | 815 | mutex_unlock(&mapping->i_mmap_mutex); |
@@ -1154,12 +1162,15 @@ static inline unsigned long round_hint_to_min(unsigned long hint) | |||
1154 | 1162 | ||
1155 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | 1163 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, |
1156 | unsigned long len, unsigned long prot, | 1164 | unsigned long len, unsigned long prot, |
1157 | unsigned long flags, unsigned long pgoff) | 1165 | unsigned long flags, unsigned long pgoff, |
1166 | unsigned long *populate) | ||
1158 | { | 1167 | { |
1159 | struct mm_struct * mm = current->mm; | 1168 | struct mm_struct * mm = current->mm; |
1160 | struct inode *inode; | 1169 | struct inode *inode; |
1161 | vm_flags_t vm_flags; | 1170 | vm_flags_t vm_flags; |
1162 | 1171 | ||
1172 | *populate = 0; | ||
1173 | |||
1163 | /* | 1174 | /* |
1164 | * Does the application expect PROT_READ to imply PROT_EXEC? | 1175 | * Does the application expect PROT_READ to imply PROT_EXEC? |
1165 | * | 1176 | * |
@@ -1280,7 +1291,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1280 | } | 1291 | } |
1281 | } | 1292 | } |
1282 | 1293 | ||
1283 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); | 1294 | /* |
1295 | * Set 'VM_NORESERVE' if we should not account for the | ||
1296 | * memory use of this mapping. | ||
1297 | */ | ||
1298 | if (flags & MAP_NORESERVE) { | ||
1299 | /* We honor MAP_NORESERVE if allowed to overcommit */ | ||
1300 | if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) | ||
1301 | vm_flags |= VM_NORESERVE; | ||
1302 | |||
1303 | /* hugetlb applies strict overcommit unless MAP_NORESERVE */ | ||
1304 | if (file && is_file_hugepages(file)) | ||
1305 | vm_flags |= VM_NORESERVE; | ||
1306 | } | ||
1307 | |||
1308 | addr = mmap_region(file, addr, len, vm_flags, pgoff); | ||
1309 | if (!IS_ERR_VALUE(addr) && (vm_flags & VM_POPULATE)) | ||
1310 | *populate = len; | ||
1311 | return addr; | ||
1284 | } | 1312 | } |
1285 | 1313 | ||
1286 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | 1314 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, |
@@ -1395,8 +1423,7 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) | |||
1395 | } | 1423 | } |
1396 | 1424 | ||
1397 | unsigned long mmap_region(struct file *file, unsigned long addr, | 1425 | unsigned long mmap_region(struct file *file, unsigned long addr, |
1398 | unsigned long len, unsigned long flags, | 1426 | unsigned long len, vm_flags_t vm_flags, unsigned long pgoff) |
1399 | vm_flags_t vm_flags, unsigned long pgoff) | ||
1400 | { | 1427 | { |
1401 | struct mm_struct *mm = current->mm; | 1428 | struct mm_struct *mm = current->mm; |
1402 | struct vm_area_struct *vma, *prev; | 1429 | struct vm_area_struct *vma, *prev; |
@@ -1420,20 +1447,6 @@ munmap_back: | |||
1420 | return -ENOMEM; | 1447 | return -ENOMEM; |
1421 | 1448 | ||
1422 | /* | 1449 | /* |
1423 | * Set 'VM_NORESERVE' if we should not account for the | ||
1424 | * memory use of this mapping. | ||
1425 | */ | ||
1426 | if ((flags & MAP_NORESERVE)) { | ||
1427 | /* We honor MAP_NORESERVE if allowed to overcommit */ | ||
1428 | if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) | ||
1429 | vm_flags |= VM_NORESERVE; | ||
1430 | |||
1431 | /* hugetlb applies strict overcommit unless MAP_NORESERVE */ | ||
1432 | if (file && is_file_hugepages(file)) | ||
1433 | vm_flags |= VM_NORESERVE; | ||
1434 | } | ||
1435 | |||
1436 | /* | ||
1437 | * Private writable mapping: check memory availability | 1450 | * Private writable mapping: check memory availability |
1438 | */ | 1451 | */ |
1439 | if (accountable_mapping(file, vm_flags)) { | 1452 | if (accountable_mapping(file, vm_flags)) { |
@@ -1531,10 +1544,12 @@ out: | |||
1531 | 1544 | ||
1532 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1545 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1533 | if (vm_flags & VM_LOCKED) { | 1546 | if (vm_flags & VM_LOCKED) { |
1534 | if (!mlock_vma_pages_range(vma, addr, addr + len)) | 1547 | if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || |
1548 | vma == get_gate_vma(current->mm))) | ||
1535 | mm->locked_vm += (len >> PAGE_SHIFT); | 1549 | mm->locked_vm += (len >> PAGE_SHIFT); |
1536 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | 1550 | else |
1537 | make_pages_present(addr, addr + len); | 1551 | vma->vm_flags &= ~VM_LOCKED; |
1552 | } | ||
1538 | 1553 | ||
1539 | if (file) | 1554 | if (file) |
1540 | uprobe_mmap(vma); | 1555 | uprobe_mmap(vma); |
@@ -2187,9 +2202,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
2187 | return vma; | 2202 | return vma; |
2188 | if (!prev || expand_stack(prev, addr)) | 2203 | if (!prev || expand_stack(prev, addr)) |
2189 | return NULL; | 2204 | return NULL; |
2190 | if (prev->vm_flags & VM_LOCKED) { | 2205 | if (prev->vm_flags & VM_LOCKED) |
2191 | mlock_vma_pages_range(prev, addr, prev->vm_end); | 2206 | __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); |
2192 | } | ||
2193 | return prev; | 2207 | return prev; |
2194 | } | 2208 | } |
2195 | #else | 2209 | #else |
@@ -2215,9 +2229,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
2215 | start = vma->vm_start; | 2229 | start = vma->vm_start; |
2216 | if (expand_stack(vma, addr)) | 2230 | if (expand_stack(vma, addr)) |
2217 | return NULL; | 2231 | return NULL; |
2218 | if (vma->vm_flags & VM_LOCKED) { | 2232 | if (vma->vm_flags & VM_LOCKED) |
2219 | mlock_vma_pages_range(vma, addr, start); | 2233 | __mlock_vma_pages_range(vma, addr, start, NULL); |
2220 | } | ||
2221 | return vma; | 2234 | return vma; |
2222 | } | 2235 | } |
2223 | #endif | 2236 | #endif |
@@ -2590,10 +2603,8 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2590 | out: | 2603 | out: |
2591 | perf_event_mmap(vma); | 2604 | perf_event_mmap(vma); |
2592 | mm->total_vm += len >> PAGE_SHIFT; | 2605 | mm->total_vm += len >> PAGE_SHIFT; |
2593 | if (flags & VM_LOCKED) { | 2606 | if (flags & VM_LOCKED) |
2594 | if (!mlock_vma_pages_range(vma, addr, addr + len)) | 2607 | mm->locked_vm += (len >> PAGE_SHIFT); |
2595 | mm->locked_vm += (len >> PAGE_SHIFT); | ||
2596 | } | ||
2597 | return addr; | 2608 | return addr; |
2598 | } | 2609 | } |
2599 | 2610 | ||
@@ -2601,10 +2612,14 @@ unsigned long vm_brk(unsigned long addr, unsigned long len) | |||
2601 | { | 2612 | { |
2602 | struct mm_struct *mm = current->mm; | 2613 | struct mm_struct *mm = current->mm; |
2603 | unsigned long ret; | 2614 | unsigned long ret; |
2615 | bool populate; | ||
2604 | 2616 | ||
2605 | down_write(&mm->mmap_sem); | 2617 | down_write(&mm->mmap_sem); |
2606 | ret = do_brk(addr, len); | 2618 | ret = do_brk(addr, len); |
2619 | populate = ((mm->def_flags & VM_LOCKED) != 0); | ||
2607 | up_write(&mm->mmap_sem); | 2620 | up_write(&mm->mmap_sem); |
2621 | if (populate) | ||
2622 | mm_populate(addr, len); | ||
2608 | return ret; | 2623 | return ret; |
2609 | } | 2624 | } |
2610 | EXPORT_SYMBOL(vm_brk); | 2625 | EXPORT_SYMBOL(vm_brk); |
@@ -3002,7 +3017,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | |||
3002 | if (!__test_and_clear_bit(0, (unsigned long *) | 3017 | if (!__test_and_clear_bit(0, (unsigned long *) |
3003 | &anon_vma->root->rb_root.rb_node)) | 3018 | &anon_vma->root->rb_root.rb_node)) |
3004 | BUG(); | 3019 | BUG(); |
3005 | anon_vma_unlock(anon_vma); | 3020 | anon_vma_unlock_write(anon_vma); |
3006 | } | 3021 | } |
3007 | } | 3022 | } |
3008 | 3023 | ||
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 8a5ac8c686b0..2175fb0d501c 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -37,49 +37,51 @@ static struct srcu_struct srcu; | |||
37 | void __mmu_notifier_release(struct mm_struct *mm) | 37 | void __mmu_notifier_release(struct mm_struct *mm) |
38 | { | 38 | { |
39 | struct mmu_notifier *mn; | 39 | struct mmu_notifier *mn; |
40 | struct hlist_node *n; | ||
41 | int id; | 40 | int id; |
42 | 41 | ||
43 | /* | 42 | /* |
44 | * SRCU here will block mmu_notifier_unregister until | 43 | * srcu_read_lock() here will block synchronize_srcu() in |
45 | * ->release returns. | 44 | * mmu_notifier_unregister() until all registered |
45 | * ->release() callouts this function makes have | ||
46 | * returned. | ||
46 | */ | 47 | */ |
47 | id = srcu_read_lock(&srcu); | 48 | id = srcu_read_lock(&srcu); |
48 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) | ||
49 | /* | ||
50 | * if ->release runs before mmu_notifier_unregister it | ||
51 | * must be handled as it's the only way for the driver | ||
52 | * to flush all existing sptes and stop the driver | ||
53 | * from establishing any more sptes before all the | ||
54 | * pages in the mm are freed. | ||
55 | */ | ||
56 | if (mn->ops->release) | ||
57 | mn->ops->release(mn, mm); | ||
58 | srcu_read_unlock(&srcu, id); | ||
59 | |||
60 | spin_lock(&mm->mmu_notifier_mm->lock); | 49 | spin_lock(&mm->mmu_notifier_mm->lock); |
61 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 50 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
62 | mn = hlist_entry(mm->mmu_notifier_mm->list.first, | 51 | mn = hlist_entry(mm->mmu_notifier_mm->list.first, |
63 | struct mmu_notifier, | 52 | struct mmu_notifier, |
64 | hlist); | 53 | hlist); |
54 | |||
65 | /* | 55 | /* |
66 | * We arrived before mmu_notifier_unregister so | 56 | * Unlink. This will prevent mmu_notifier_unregister() |
67 | * mmu_notifier_unregister will do nothing other than | 57 | * from also making the ->release() callout. |
68 | * to wait ->release to finish and | ||
69 | * mmu_notifier_unregister to return. | ||
70 | */ | 58 | */ |
71 | hlist_del_init_rcu(&mn->hlist); | 59 | hlist_del_init_rcu(&mn->hlist); |
60 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
61 | |||
62 | /* | ||
63 | * Clear sptes. (see 'release' description in mmu_notifier.h) | ||
64 | */ | ||
65 | if (mn->ops->release) | ||
66 | mn->ops->release(mn, mm); | ||
67 | |||
68 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
72 | } | 69 | } |
73 | spin_unlock(&mm->mmu_notifier_mm->lock); | 70 | spin_unlock(&mm->mmu_notifier_mm->lock); |
74 | 71 | ||
75 | /* | 72 | /* |
76 | * synchronize_srcu here prevents mmu_notifier_release to | 73 | * All callouts to ->release() which we have done are complete. |
77 | * return to exit_mmap (which would proceed freeing all pages | 74 | * Allow synchronize_srcu() in mmu_notifier_unregister() to complete |
78 | * in the mm) until the ->release method returns, if it was | 75 | */ |
79 | * invoked by mmu_notifier_unregister. | 76 | srcu_read_unlock(&srcu, id); |
80 | * | 77 | |
81 | * The mmu_notifier_mm can't go away from under us because one | 78 | /* |
82 | * mm_count is hold by exit_mmap. | 79 | * mmu_notifier_unregister() may have unlinked a notifier and may |
80 | * still be calling out to it. Additionally, other notifiers | ||
81 | * may have been active via vmtruncate() et al. Block here | ||
82 | * to ensure that all notifier callouts for this mm have been | ||
83 | * completed and the sptes are really cleaned up before returning | ||
84 | * to exit_mmap(). | ||
83 | */ | 85 | */ |
84 | synchronize_srcu(&srcu); | 86 | synchronize_srcu(&srcu); |
85 | } | 87 | } |
@@ -170,6 +172,7 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | |||
170 | } | 172 | } |
171 | srcu_read_unlock(&srcu, id); | 173 | srcu_read_unlock(&srcu, id); |
172 | } | 174 | } |
175 | EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); | ||
173 | 176 | ||
174 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | 177 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, |
175 | unsigned long start, unsigned long end) | 178 | unsigned long start, unsigned long end) |
@@ -185,6 +188,7 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | |||
185 | } | 188 | } |
186 | srcu_read_unlock(&srcu, id); | 189 | srcu_read_unlock(&srcu, id); |
187 | } | 190 | } |
191 | EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end); | ||
188 | 192 | ||
189 | static int do_mmu_notifier_register(struct mmu_notifier *mn, | 193 | static int do_mmu_notifier_register(struct mmu_notifier *mn, |
190 | struct mm_struct *mm, | 194 | struct mm_struct *mm, |
@@ -294,31 +298,31 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
294 | { | 298 | { |
295 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 299 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
296 | 300 | ||
301 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
297 | if (!hlist_unhashed(&mn->hlist)) { | 302 | if (!hlist_unhashed(&mn->hlist)) { |
298 | /* | ||
299 | * SRCU here will force exit_mmap to wait ->release to finish | ||
300 | * before freeing the pages. | ||
301 | */ | ||
302 | int id; | 303 | int id; |
303 | 304 | ||
304 | id = srcu_read_lock(&srcu); | ||
305 | /* | 305 | /* |
306 | * exit_mmap will block in mmu_notifier_release to | 306 | * Ensure we synchronize up with __mmu_notifier_release(). |
307 | * guarantee ->release is called before freeing the | ||
308 | * pages. | ||
309 | */ | 307 | */ |
308 | id = srcu_read_lock(&srcu); | ||
309 | |||
310 | hlist_del_rcu(&mn->hlist); | ||
311 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
312 | |||
310 | if (mn->ops->release) | 313 | if (mn->ops->release) |
311 | mn->ops->release(mn, mm); | 314 | mn->ops->release(mn, mm); |
312 | srcu_read_unlock(&srcu, id); | ||
313 | 315 | ||
314 | spin_lock(&mm->mmu_notifier_mm->lock); | 316 | /* |
315 | hlist_del_rcu(&mn->hlist); | 317 | * Allow __mmu_notifier_release() to complete. |
318 | */ | ||
319 | srcu_read_unlock(&srcu, id); | ||
320 | } else | ||
316 | spin_unlock(&mm->mmu_notifier_mm->lock); | 321 | spin_unlock(&mm->mmu_notifier_mm->lock); |
317 | } | ||
318 | 322 | ||
319 | /* | 323 | /* |
320 | * Wait any running method to finish, of course including | 324 | * Wait for any running method to finish, including ->release() if it |
321 | * ->release if it was run by mmu_notifier_relase instead of us. | 325 | * was run by __mmu_notifier_release() instead of us. |
322 | */ | 326 | */ |
323 | synchronize_srcu(&srcu); | 327 | synchronize_srcu(&srcu); |
324 | 328 | ||
diff --git a/mm/mmzone.c b/mm/mmzone.c index 4596d81b89b1..2ac0afbd68f3 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * linux/mm/mmzone.c | 2 | * linux/mm/mmzone.c |
3 | * | 3 | * |
4 | * management codes for pgdats and zones. | 4 | * management codes for pgdats, zones and page flags |
5 | */ | 5 | */ |
6 | 6 | ||
7 | 7 | ||
@@ -96,3 +96,21 @@ void lruvec_init(struct lruvec *lruvec) | |||
96 | for_each_lru(lru) | 96 | for_each_lru(lru) |
97 | INIT_LIST_HEAD(&lruvec->lists[lru]); | 97 | INIT_LIST_HEAD(&lruvec->lists[lru]); |
98 | } | 98 | } |
99 | |||
100 | #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) | ||
101 | int page_nid_xchg_last(struct page *page, int nid) | ||
102 | { | ||
103 | unsigned long old_flags, flags; | ||
104 | int last_nid; | ||
105 | |||
106 | do { | ||
107 | old_flags = flags = page->flags; | ||
108 | last_nid = page_nid_last(page); | ||
109 | |||
110 | flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); | ||
111 | flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; | ||
112 | } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); | ||
113 | |||
114 | return last_nid; | ||
115 | } | ||
116 | #endif | ||
diff --git a/mm/mremap.c b/mm/mremap.c index f9766f460299..463a25705ac6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -135,7 +135,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
135 | pte_unmap(new_pte - 1); | 135 | pte_unmap(new_pte - 1); |
136 | pte_unmap_unlock(old_pte - 1, old_ptl); | 136 | pte_unmap_unlock(old_pte - 1, old_ptl); |
137 | if (anon_vma) | 137 | if (anon_vma) |
138 | anon_vma_unlock(anon_vma); | 138 | anon_vma_unlock_write(anon_vma); |
139 | if (mapping) | 139 | if (mapping) |
140 | mutex_unlock(&mapping->i_mmap_mutex); | 140 | mutex_unlock(&mapping->i_mmap_mutex); |
141 | } | 141 | } |
@@ -209,7 +209,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
209 | 209 | ||
210 | static unsigned long move_vma(struct vm_area_struct *vma, | 210 | static unsigned long move_vma(struct vm_area_struct *vma, |
211 | unsigned long old_addr, unsigned long old_len, | 211 | unsigned long old_addr, unsigned long old_len, |
212 | unsigned long new_len, unsigned long new_addr) | 212 | unsigned long new_len, unsigned long new_addr, bool *locked) |
213 | { | 213 | { |
214 | struct mm_struct *mm = vma->vm_mm; | 214 | struct mm_struct *mm = vma->vm_mm; |
215 | struct vm_area_struct *new_vma; | 215 | struct vm_area_struct *new_vma; |
@@ -300,9 +300,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
300 | 300 | ||
301 | if (vm_flags & VM_LOCKED) { | 301 | if (vm_flags & VM_LOCKED) { |
302 | mm->locked_vm += new_len >> PAGE_SHIFT; | 302 | mm->locked_vm += new_len >> PAGE_SHIFT; |
303 | if (new_len > old_len) | 303 | *locked = true; |
304 | mlock_vma_pages_range(new_vma, new_addr + old_len, | ||
305 | new_addr + new_len); | ||
306 | } | 304 | } |
307 | 305 | ||
308 | return new_addr; | 306 | return new_addr; |
@@ -367,9 +365,8 @@ Eagain: | |||
367 | return ERR_PTR(-EAGAIN); | 365 | return ERR_PTR(-EAGAIN); |
368 | } | 366 | } |
369 | 367 | ||
370 | static unsigned long mremap_to(unsigned long addr, | 368 | static unsigned long mremap_to(unsigned long addr, unsigned long old_len, |
371 | unsigned long old_len, unsigned long new_addr, | 369 | unsigned long new_addr, unsigned long new_len, bool *locked) |
372 | unsigned long new_len) | ||
373 | { | 370 | { |
374 | struct mm_struct *mm = current->mm; | 371 | struct mm_struct *mm = current->mm; |
375 | struct vm_area_struct *vma; | 372 | struct vm_area_struct *vma; |
@@ -419,7 +416,7 @@ static unsigned long mremap_to(unsigned long addr, | |||
419 | if (ret & ~PAGE_MASK) | 416 | if (ret & ~PAGE_MASK) |
420 | goto out1; | 417 | goto out1; |
421 | 418 | ||
422 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | 419 | ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); |
423 | if (!(ret & ~PAGE_MASK)) | 420 | if (!(ret & ~PAGE_MASK)) |
424 | goto out; | 421 | goto out; |
425 | out1: | 422 | out1: |
@@ -457,6 +454,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
457 | struct vm_area_struct *vma; | 454 | struct vm_area_struct *vma; |
458 | unsigned long ret = -EINVAL; | 455 | unsigned long ret = -EINVAL; |
459 | unsigned long charged = 0; | 456 | unsigned long charged = 0; |
457 | bool locked = false; | ||
460 | 458 | ||
461 | down_write(&current->mm->mmap_sem); | 459 | down_write(&current->mm->mmap_sem); |
462 | 460 | ||
@@ -479,7 +477,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
479 | 477 | ||
480 | if (flags & MREMAP_FIXED) { | 478 | if (flags & MREMAP_FIXED) { |
481 | if (flags & MREMAP_MAYMOVE) | 479 | if (flags & MREMAP_MAYMOVE) |
482 | ret = mremap_to(addr, old_len, new_addr, new_len); | 480 | ret = mremap_to(addr, old_len, new_addr, new_len, |
481 | &locked); | ||
483 | goto out; | 482 | goto out; |
484 | } | 483 | } |
485 | 484 | ||
@@ -521,8 +520,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
521 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); | 520 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
522 | if (vma->vm_flags & VM_LOCKED) { | 521 | if (vma->vm_flags & VM_LOCKED) { |
523 | mm->locked_vm += pages; | 522 | mm->locked_vm += pages; |
524 | mlock_vma_pages_range(vma, addr + old_len, | 523 | locked = true; |
525 | addr + new_len); | 524 | new_addr = addr; |
526 | } | 525 | } |
527 | ret = addr; | 526 | ret = addr; |
528 | goto out; | 527 | goto out; |
@@ -548,11 +547,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
548 | goto out; | 547 | goto out; |
549 | } | 548 | } |
550 | 549 | ||
551 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | 550 | ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); |
552 | } | 551 | } |
553 | out: | 552 | out: |
554 | if (ret & ~PAGE_MASK) | 553 | if (ret & ~PAGE_MASK) |
555 | vm_unacct_memory(charged); | 554 | vm_unacct_memory(charged); |
556 | up_write(&current->mm->mmap_sem); | 555 | up_write(&current->mm->mmap_sem); |
556 | if (locked && new_len > old_len) | ||
557 | mm_populate(new_addr + old_len, new_len - old_len); | ||
557 | return ret; | 558 | return ret; |
558 | } | 559 | } |
diff --git a/mm/nommu.c b/mm/nommu.c index b20db4e22263..da0d210fd403 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -140,10 +140,10 @@ unsigned int kobjsize(const void *objp) | |||
140 | return PAGE_SIZE << compound_order(page); | 140 | return PAGE_SIZE << compound_order(page); |
141 | } | 141 | } |
142 | 142 | ||
143 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 143 | long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
144 | unsigned long start, int nr_pages, unsigned int foll_flags, | 144 | unsigned long start, unsigned long nr_pages, |
145 | struct page **pages, struct vm_area_struct **vmas, | 145 | unsigned int foll_flags, struct page **pages, |
146 | int *retry) | 146 | struct vm_area_struct **vmas, int *nonblocking) |
147 | { | 147 | { |
148 | struct vm_area_struct *vma; | 148 | struct vm_area_struct *vma; |
149 | unsigned long vm_flags; | 149 | unsigned long vm_flags; |
@@ -190,9 +190,10 @@ finish_or_fault: | |||
190 | * slab page or a secondary page from a compound page | 190 | * slab page or a secondary page from a compound page |
191 | * - don't permit access to VMAs that don't support it, such as I/O mappings | 191 | * - don't permit access to VMAs that don't support it, such as I/O mappings |
192 | */ | 192 | */ |
193 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 193 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
194 | unsigned long start, int nr_pages, int write, int force, | 194 | unsigned long start, unsigned long nr_pages, |
195 | struct page **pages, struct vm_area_struct **vmas) | 195 | int write, int force, struct page **pages, |
196 | struct vm_area_struct **vmas) | ||
196 | { | 197 | { |
197 | int flags = 0; | 198 | int flags = 0; |
198 | 199 | ||
@@ -1250,7 +1251,8 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1250 | unsigned long len, | 1251 | unsigned long len, |
1251 | unsigned long prot, | 1252 | unsigned long prot, |
1252 | unsigned long flags, | 1253 | unsigned long flags, |
1253 | unsigned long pgoff) | 1254 | unsigned long pgoff, |
1255 | unsigned long *populate) | ||
1254 | { | 1256 | { |
1255 | struct vm_area_struct *vma; | 1257 | struct vm_area_struct *vma; |
1256 | struct vm_region *region; | 1258 | struct vm_region *region; |
@@ -1260,6 +1262,8 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1260 | 1262 | ||
1261 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); | 1263 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); |
1262 | 1264 | ||
1265 | *populate = 0; | ||
1266 | |||
1263 | /* decide whether we should attempt the mapping, and if so what sort of | 1267 | /* decide whether we should attempt the mapping, and if so what sort of |
1264 | * mapping */ | 1268 | * mapping */ |
1265 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, | 1269 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, |
@@ -1815,9 +1819,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
1815 | return ret; | 1819 | return ret; |
1816 | } | 1820 | } |
1817 | 1821 | ||
1818 | struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | 1822 | struct page *follow_page_mask(struct vm_area_struct *vma, |
1819 | unsigned int foll_flags) | 1823 | unsigned long address, unsigned int flags, |
1824 | unsigned int *page_mask) | ||
1820 | { | 1825 | { |
1826 | *page_mask = 0; | ||
1821 | return NULL; | 1827 | return NULL; |
1822 | } | 1828 | } |
1823 | 1829 | ||
@@ -1904,7 +1910,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1904 | */ | 1910 | */ |
1905 | free -= global_page_state(NR_SHMEM); | 1911 | free -= global_page_state(NR_SHMEM); |
1906 | 1912 | ||
1907 | free += nr_swap_pages; | 1913 | free += get_nr_swap_pages(); |
1908 | 1914 | ||
1909 | /* | 1915 | /* |
1910 | * Any slabs which are created with the | 1916 | * Any slabs which are created with the |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0399f146ae49..79e451a78c9e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -386,8 +386,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
386 | cpuset_print_task_mems_allowed(current); | 386 | cpuset_print_task_mems_allowed(current); |
387 | task_unlock(current); | 387 | task_unlock(current); |
388 | dump_stack(); | 388 | dump_stack(); |
389 | mem_cgroup_print_oom_info(memcg, p); | 389 | if (memcg) |
390 | show_mem(SHOW_MEM_FILTER_NODES); | 390 | mem_cgroup_print_oom_info(memcg, p); |
391 | else | ||
392 | show_mem(SHOW_MEM_FILTER_NODES); | ||
391 | if (sysctl_oom_dump_tasks) | 393 | if (sysctl_oom_dump_tasks) |
392 | dump_tasks(memcg, nodemask); | 394 | dump_tasks(memcg, nodemask); |
393 | } | 395 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7300c9d5e1d9..cdc377c456c0 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -241,6 +241,9 @@ static unsigned long global_dirtyable_memory(void) | |||
241 | if (!vm_highmem_is_dirtyable) | 241 | if (!vm_highmem_is_dirtyable) |
242 | x -= highmem_dirtyable_memory(x); | 242 | x -= highmem_dirtyable_memory(x); |
243 | 243 | ||
244 | /* Subtract min_free_kbytes */ | ||
245 | x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10)); | ||
246 | |||
244 | return x + 1; /* Ensure that we never return 0 */ | 247 | return x + 1; /* Ensure that we never return 0 */ |
245 | } | 248 | } |
246 | 249 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d1107adf174a..e9075fdef695 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -202,11 +202,18 @@ static unsigned long __meminitdata nr_all_pages; | |||
202 | static unsigned long __meminitdata dma_reserve; | 202 | static unsigned long __meminitdata dma_reserve; |
203 | 203 | ||
204 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 204 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
205 | /* Movable memory ranges, will also be used by memblock subsystem. */ | ||
206 | struct movablemem_map movablemem_map = { | ||
207 | .acpi = false, | ||
208 | .nr_map = 0, | ||
209 | }; | ||
210 | |||
205 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | 211 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; |
206 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | 212 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; |
207 | static unsigned long __initdata required_kernelcore; | 213 | static unsigned long __initdata required_kernelcore; |
208 | static unsigned long __initdata required_movablecore; | 214 | static unsigned long __initdata required_movablecore; |
209 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 215 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
216 | static unsigned long __meminitdata zone_movable_limit[MAX_NUMNODES]; | ||
210 | 217 | ||
211 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 218 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
212 | int movable_zone; | 219 | int movable_zone; |
@@ -240,15 +247,20 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | |||
240 | int ret = 0; | 247 | int ret = 0; |
241 | unsigned seq; | 248 | unsigned seq; |
242 | unsigned long pfn = page_to_pfn(page); | 249 | unsigned long pfn = page_to_pfn(page); |
250 | unsigned long sp, start_pfn; | ||
243 | 251 | ||
244 | do { | 252 | do { |
245 | seq = zone_span_seqbegin(zone); | 253 | seq = zone_span_seqbegin(zone); |
246 | if (pfn >= zone->zone_start_pfn + zone->spanned_pages) | 254 | start_pfn = zone->zone_start_pfn; |
247 | ret = 1; | 255 | sp = zone->spanned_pages; |
248 | else if (pfn < zone->zone_start_pfn) | 256 | if (!zone_spans_pfn(zone, pfn)) |
249 | ret = 1; | 257 | ret = 1; |
250 | } while (zone_span_seqretry(zone, seq)); | 258 | } while (zone_span_seqretry(zone, seq)); |
251 | 259 | ||
260 | if (ret) | ||
261 | pr_err("page %lu outside zone [ %lu - %lu ]\n", | ||
262 | pfn, start_pfn, start_pfn + sp); | ||
263 | |||
252 | return ret; | 264 | return ret; |
253 | } | 265 | } |
254 | 266 | ||
@@ -288,7 +300,7 @@ static void bad_page(struct page *page) | |||
288 | 300 | ||
289 | /* Don't complain about poisoned pages */ | 301 | /* Don't complain about poisoned pages */ |
290 | if (PageHWPoison(page)) { | 302 | if (PageHWPoison(page)) { |
291 | reset_page_mapcount(page); /* remove PageBuddy */ | 303 | page_mapcount_reset(page); /* remove PageBuddy */ |
292 | return; | 304 | return; |
293 | } | 305 | } |
294 | 306 | ||
@@ -320,7 +332,7 @@ static void bad_page(struct page *page) | |||
320 | dump_stack(); | 332 | dump_stack(); |
321 | out: | 333 | out: |
322 | /* Leave bad fields for debug, except PageBuddy could make trouble */ | 334 | /* Leave bad fields for debug, except PageBuddy could make trouble */ |
323 | reset_page_mapcount(page); /* remove PageBuddy */ | 335 | page_mapcount_reset(page); /* remove PageBuddy */ |
324 | add_taint(TAINT_BAD_PAGE); | 336 | add_taint(TAINT_BAD_PAGE); |
325 | } | 337 | } |
326 | 338 | ||
@@ -533,6 +545,8 @@ static inline void __free_one_page(struct page *page, | |||
533 | unsigned long uninitialized_var(buddy_idx); | 545 | unsigned long uninitialized_var(buddy_idx); |
534 | struct page *buddy; | 546 | struct page *buddy; |
535 | 547 | ||
548 | VM_BUG_ON(!zone_is_initialized(zone)); | ||
549 | |||
536 | if (unlikely(PageCompound(page))) | 550 | if (unlikely(PageCompound(page))) |
537 | if (unlikely(destroy_compound_page(page, order))) | 551 | if (unlikely(destroy_compound_page(page, order))) |
538 | return; | 552 | return; |
@@ -606,7 +620,7 @@ static inline int free_pages_check(struct page *page) | |||
606 | bad_page(page); | 620 | bad_page(page); |
607 | return 1; | 621 | return 1; |
608 | } | 622 | } |
609 | reset_page_last_nid(page); | 623 | page_nid_reset_last(page); |
610 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 624 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
611 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 625 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
612 | return 0; | 626 | return 0; |
@@ -666,7 +680,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
666 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 680 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
667 | __free_one_page(page, zone, 0, mt); | 681 | __free_one_page(page, zone, 0, mt); |
668 | trace_mm_page_pcpu_drain(page, 0, mt); | 682 | trace_mm_page_pcpu_drain(page, 0, mt); |
669 | if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { | 683 | if (likely(!is_migrate_isolate_page(page))) { |
670 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); | 684 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1); |
671 | if (is_migrate_cma(mt)) | 685 | if (is_migrate_cma(mt)) |
672 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); | 686 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); |
@@ -684,7 +698,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order, | |||
684 | zone->pages_scanned = 0; | 698 | zone->pages_scanned = 0; |
685 | 699 | ||
686 | __free_one_page(page, zone, order, migratetype); | 700 | __free_one_page(page, zone, order, migratetype); |
687 | if (unlikely(migratetype != MIGRATE_ISOLATE)) | 701 | if (unlikely(!is_migrate_isolate(migratetype))) |
688 | __mod_zone_freepage_state(zone, 1 << order, migratetype); | 702 | __mod_zone_freepage_state(zone, 1 << order, migratetype); |
689 | spin_unlock(&zone->lock); | 703 | spin_unlock(&zone->lock); |
690 | } | 704 | } |
@@ -916,7 +930,9 @@ static int fallbacks[MIGRATE_TYPES][4] = { | |||
916 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | 930 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, |
917 | #endif | 931 | #endif |
918 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ | 932 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ |
933 | #ifdef CONFIG_MEMORY_ISOLATION | ||
919 | [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ | 934 | [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ |
935 | #endif | ||
920 | }; | 936 | }; |
921 | 937 | ||
922 | /* | 938 | /* |
@@ -981,9 +997,9 @@ int move_freepages_block(struct zone *zone, struct page *page, | |||
981 | end_pfn = start_pfn + pageblock_nr_pages - 1; | 997 | end_pfn = start_pfn + pageblock_nr_pages - 1; |
982 | 998 | ||
983 | /* Do not cross zone boundaries */ | 999 | /* Do not cross zone boundaries */ |
984 | if (start_pfn < zone->zone_start_pfn) | 1000 | if (!zone_spans_pfn(zone, start_pfn)) |
985 | start_page = page; | 1001 | start_page = page; |
986 | if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) | 1002 | if (!zone_spans_pfn(zone, end_pfn)) |
987 | return 0; | 1003 | return 0; |
988 | 1004 | ||
989 | return move_freepages(zone, start_page, end_page, migratetype); | 1005 | return move_freepages(zone, start_page, end_page, migratetype); |
@@ -1142,7 +1158,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
1142 | list_add_tail(&page->lru, list); | 1158 | list_add_tail(&page->lru, list); |
1143 | if (IS_ENABLED(CONFIG_CMA)) { | 1159 | if (IS_ENABLED(CONFIG_CMA)) { |
1144 | mt = get_pageblock_migratetype(page); | 1160 | mt = get_pageblock_migratetype(page); |
1145 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) | 1161 | if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) |
1146 | mt = migratetype; | 1162 | mt = migratetype; |
1147 | } | 1163 | } |
1148 | set_freepage_migratetype(page, mt); | 1164 | set_freepage_migratetype(page, mt); |
@@ -1277,7 +1293,7 @@ void mark_free_pages(struct zone *zone) | |||
1277 | 1293 | ||
1278 | spin_lock_irqsave(&zone->lock, flags); | 1294 | spin_lock_irqsave(&zone->lock, flags); |
1279 | 1295 | ||
1280 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 1296 | max_zone_pfn = zone_end_pfn(zone); |
1281 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1297 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
1282 | if (pfn_valid(pfn)) { | 1298 | if (pfn_valid(pfn)) { |
1283 | struct page *page = pfn_to_page(pfn); | 1299 | struct page *page = pfn_to_page(pfn); |
@@ -1326,7 +1342,7 @@ void free_hot_cold_page(struct page *page, int cold) | |||
1326 | * excessively into the page allocator | 1342 | * excessively into the page allocator |
1327 | */ | 1343 | */ |
1328 | if (migratetype >= MIGRATE_PCPTYPES) { | 1344 | if (migratetype >= MIGRATE_PCPTYPES) { |
1329 | if (unlikely(migratetype == MIGRATE_ISOLATE)) { | 1345 | if (unlikely(is_migrate_isolate(migratetype))) { |
1330 | free_one_page(zone, page, 0, migratetype); | 1346 | free_one_page(zone, page, 0, migratetype); |
1331 | goto out; | 1347 | goto out; |
1332 | } | 1348 | } |
@@ -1400,7 +1416,7 @@ static int __isolate_free_page(struct page *page, unsigned int order) | |||
1400 | zone = page_zone(page); | 1416 | zone = page_zone(page); |
1401 | mt = get_pageblock_migratetype(page); | 1417 | mt = get_pageblock_migratetype(page); |
1402 | 1418 | ||
1403 | if (mt != MIGRATE_ISOLATE) { | 1419 | if (!is_migrate_isolate(mt)) { |
1404 | /* Obey watermarks as if the page was being allocated */ | 1420 | /* Obey watermarks as if the page was being allocated */ |
1405 | watermark = low_wmark_pages(zone) + (1 << order); | 1421 | watermark = low_wmark_pages(zone) + (1 << order); |
1406 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | 1422 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) |
@@ -1419,7 +1435,7 @@ static int __isolate_free_page(struct page *page, unsigned int order) | |||
1419 | struct page *endpage = page + (1 << order) - 1; | 1435 | struct page *endpage = page + (1 << order) - 1; |
1420 | for (; page < endpage; page += pageblock_nr_pages) { | 1436 | for (; page < endpage; page += pageblock_nr_pages) { |
1421 | int mt = get_pageblock_migratetype(page); | 1437 | int mt = get_pageblock_migratetype(page); |
1422 | if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) | 1438 | if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) |
1423 | set_pageblock_migratetype(page, | 1439 | set_pageblock_migratetype(page, |
1424 | MIGRATE_MOVABLE); | 1440 | MIGRATE_MOVABLE); |
1425 | } | 1441 | } |
@@ -2615,10 +2631,17 @@ retry_cpuset: | |||
2615 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2631 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2616 | zonelist, high_zoneidx, alloc_flags, | 2632 | zonelist, high_zoneidx, alloc_flags, |
2617 | preferred_zone, migratetype); | 2633 | preferred_zone, migratetype); |
2618 | if (unlikely(!page)) | 2634 | if (unlikely(!page)) { |
2635 | /* | ||
2636 | * Runtime PM, block IO and its error handling path | ||
2637 | * can deadlock because I/O on the device might not | ||
2638 | * complete. | ||
2639 | */ | ||
2640 | gfp_mask = memalloc_noio_flags(gfp_mask); | ||
2619 | page = __alloc_pages_slowpath(gfp_mask, order, | 2641 | page = __alloc_pages_slowpath(gfp_mask, order, |
2620 | zonelist, high_zoneidx, nodemask, | 2642 | zonelist, high_zoneidx, nodemask, |
2621 | preferred_zone, migratetype); | 2643 | preferred_zone, migratetype); |
2644 | } | ||
2622 | 2645 | ||
2623 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2646 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
2624 | 2647 | ||
@@ -2790,18 +2813,27 @@ void free_pages_exact(void *virt, size_t size) | |||
2790 | } | 2813 | } |
2791 | EXPORT_SYMBOL(free_pages_exact); | 2814 | EXPORT_SYMBOL(free_pages_exact); |
2792 | 2815 | ||
2793 | static unsigned int nr_free_zone_pages(int offset) | 2816 | /** |
2817 | * nr_free_zone_pages - count number of pages beyond high watermark | ||
2818 | * @offset: The zone index of the highest zone | ||
2819 | * | ||
2820 | * nr_free_zone_pages() counts the number of pages which are beyond the | ||
2821 | * high watermark within all zones at or below a given zone index. For each | ||
2822 | * zone, the number of pages is calculated as: | ||
2823 | * managed_pages - high_pages | ||
2824 | */ | ||
2825 | static unsigned long nr_free_zone_pages(int offset) | ||
2794 | { | 2826 | { |
2795 | struct zoneref *z; | 2827 | struct zoneref *z; |
2796 | struct zone *zone; | 2828 | struct zone *zone; |
2797 | 2829 | ||
2798 | /* Just pick one node, since fallback list is circular */ | 2830 | /* Just pick one node, since fallback list is circular */ |
2799 | unsigned int sum = 0; | 2831 | unsigned long sum = 0; |
2800 | 2832 | ||
2801 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); | 2833 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); |
2802 | 2834 | ||
2803 | for_each_zone_zonelist(zone, z, zonelist, offset) { | 2835 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
2804 | unsigned long size = zone->present_pages; | 2836 | unsigned long size = zone->managed_pages; |
2805 | unsigned long high = high_wmark_pages(zone); | 2837 | unsigned long high = high_wmark_pages(zone); |
2806 | if (size > high) | 2838 | if (size > high) |
2807 | sum += size - high; | 2839 | sum += size - high; |
@@ -2810,19 +2842,25 @@ static unsigned int nr_free_zone_pages(int offset) | |||
2810 | return sum; | 2842 | return sum; |
2811 | } | 2843 | } |
2812 | 2844 | ||
2813 | /* | 2845 | /** |
2814 | * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL | 2846 | * nr_free_buffer_pages - count number of pages beyond high watermark |
2847 | * | ||
2848 | * nr_free_buffer_pages() counts the number of pages which are beyond the high | ||
2849 | * watermark within ZONE_DMA and ZONE_NORMAL. | ||
2815 | */ | 2850 | */ |
2816 | unsigned int nr_free_buffer_pages(void) | 2851 | unsigned long nr_free_buffer_pages(void) |
2817 | { | 2852 | { |
2818 | return nr_free_zone_pages(gfp_zone(GFP_USER)); | 2853 | return nr_free_zone_pages(gfp_zone(GFP_USER)); |
2819 | } | 2854 | } |
2820 | EXPORT_SYMBOL_GPL(nr_free_buffer_pages); | 2855 | EXPORT_SYMBOL_GPL(nr_free_buffer_pages); |
2821 | 2856 | ||
2822 | /* | 2857 | /** |
2823 | * Amount of free RAM allocatable within all zones | 2858 | * nr_free_pagecache_pages - count number of pages beyond high watermark |
2859 | * | ||
2860 | * nr_free_pagecache_pages() counts the number of pages which are beyond the | ||
2861 | * high watermark within all zones. | ||
2824 | */ | 2862 | */ |
2825 | unsigned int nr_free_pagecache_pages(void) | 2863 | unsigned long nr_free_pagecache_pages(void) |
2826 | { | 2864 | { |
2827 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); | 2865 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); |
2828 | } | 2866 | } |
@@ -2854,7 +2892,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
2854 | val->totalram = pgdat->node_present_pages; | 2892 | val->totalram = pgdat->node_present_pages; |
2855 | val->freeram = node_page_state(nid, NR_FREE_PAGES); | 2893 | val->freeram = node_page_state(nid, NR_FREE_PAGES); |
2856 | #ifdef CONFIG_HIGHMEM | 2894 | #ifdef CONFIG_HIGHMEM |
2857 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; | 2895 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; |
2858 | val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], | 2896 | val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], |
2859 | NR_FREE_PAGES); | 2897 | NR_FREE_PAGES); |
2860 | #else | 2898 | #else |
@@ -2897,7 +2935,9 @@ static void show_migration_types(unsigned char type) | |||
2897 | #ifdef CONFIG_CMA | 2935 | #ifdef CONFIG_CMA |
2898 | [MIGRATE_CMA] = 'C', | 2936 | [MIGRATE_CMA] = 'C', |
2899 | #endif | 2937 | #endif |
2938 | #ifdef CONFIG_MEMORY_ISOLATION | ||
2900 | [MIGRATE_ISOLATE] = 'I', | 2939 | [MIGRATE_ISOLATE] = 'I', |
2940 | #endif | ||
2901 | }; | 2941 | }; |
2902 | char tmp[MIGRATE_TYPES + 1]; | 2942 | char tmp[MIGRATE_TYPES + 1]; |
2903 | char *p = tmp; | 2943 | char *p = tmp; |
@@ -3236,7 +3276,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
3236 | { | 3276 | { |
3237 | int n, val; | 3277 | int n, val; |
3238 | int min_val = INT_MAX; | 3278 | int min_val = INT_MAX; |
3239 | int best_node = -1; | 3279 | int best_node = NUMA_NO_NODE; |
3240 | const struct cpumask *tmp = cpumask_of_node(0); | 3280 | const struct cpumask *tmp = cpumask_of_node(0); |
3241 | 3281 | ||
3242 | /* Use the local node if we haven't already */ | 3282 | /* Use the local node if we haven't already */ |
@@ -3780,7 +3820,7 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3780 | * the block. | 3820 | * the block. |
3781 | */ | 3821 | */ |
3782 | start_pfn = zone->zone_start_pfn; | 3822 | start_pfn = zone->zone_start_pfn; |
3783 | end_pfn = start_pfn + zone->spanned_pages; | 3823 | end_pfn = zone_end_pfn(zone); |
3784 | start_pfn = roundup(start_pfn, pageblock_nr_pages); | 3824 | start_pfn = roundup(start_pfn, pageblock_nr_pages); |
3785 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> | 3825 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> |
3786 | pageblock_order; | 3826 | pageblock_order; |
@@ -3876,8 +3916,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
3876 | set_page_links(page, zone, nid, pfn); | 3916 | set_page_links(page, zone, nid, pfn); |
3877 | mminit_verify_page_links(page, zone, nid, pfn); | 3917 | mminit_verify_page_links(page, zone, nid, pfn); |
3878 | init_page_count(page); | 3918 | init_page_count(page); |
3879 | reset_page_mapcount(page); | 3919 | page_mapcount_reset(page); |
3880 | reset_page_last_nid(page); | 3920 | page_nid_reset_last(page); |
3881 | SetPageReserved(page); | 3921 | SetPageReserved(page); |
3882 | /* | 3922 | /* |
3883 | * Mark the block movable so that blocks are reserved for | 3923 | * Mark the block movable so that blocks are reserved for |
@@ -3894,7 +3934,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
3894 | * pfn out of zone. | 3934 | * pfn out of zone. |
3895 | */ | 3935 | */ |
3896 | if ((z->zone_start_pfn <= pfn) | 3936 | if ((z->zone_start_pfn <= pfn) |
3897 | && (pfn < z->zone_start_pfn + z->spanned_pages) | 3937 | && (pfn < zone_end_pfn(z)) |
3898 | && !(pfn & (pageblock_nr_pages - 1))) | 3938 | && !(pfn & (pageblock_nr_pages - 1))) |
3899 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 3939 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
3900 | 3940 | ||
@@ -3932,7 +3972,7 @@ static int __meminit zone_batchsize(struct zone *zone) | |||
3932 | * | 3972 | * |
3933 | * OK, so we don't know how big the cache is. So guess. | 3973 | * OK, so we don't know how big the cache is. So guess. |
3934 | */ | 3974 | */ |
3935 | batch = zone->present_pages / 1024; | 3975 | batch = zone->managed_pages / 1024; |
3936 | if (batch * PAGE_SIZE > 512 * 1024) | 3976 | if (batch * PAGE_SIZE > 512 * 1024) |
3937 | batch = (512 * 1024) / PAGE_SIZE; | 3977 | batch = (512 * 1024) / PAGE_SIZE; |
3938 | batch /= 4; /* We effectively *= 4 below */ | 3978 | batch /= 4; /* We effectively *= 4 below */ |
@@ -4016,7 +4056,7 @@ static void __meminit setup_zone_pageset(struct zone *zone) | |||
4016 | 4056 | ||
4017 | if (percpu_pagelist_fraction) | 4057 | if (percpu_pagelist_fraction) |
4018 | setup_pagelist_highmark(pcp, | 4058 | setup_pagelist_highmark(pcp, |
4019 | (zone->present_pages / | 4059 | (zone->managed_pages / |
4020 | percpu_pagelist_fraction)); | 4060 | percpu_pagelist_fraction)); |
4021 | } | 4061 | } |
4022 | } | 4062 | } |
@@ -4372,6 +4412,77 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
4372 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); | 4412 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); |
4373 | } | 4413 | } |
4374 | 4414 | ||
4415 | /** | ||
4416 | * sanitize_zone_movable_limit - Sanitize the zone_movable_limit array. | ||
4417 | * | ||
4418 | * zone_movable_limit is initialized as 0. This function will try to get | ||
4419 | * the first ZONE_MOVABLE pfn of each node from movablemem_map, and | ||
4420 | * assign them to zone_movable_limit. | ||
4421 | * zone_movable_limit[nid] == 0 means no limit for the node. | ||
4422 | * | ||
4423 | * Note: Each range is represented as [start_pfn, end_pfn) | ||
4424 | */ | ||
4425 | static void __meminit sanitize_zone_movable_limit(void) | ||
4426 | { | ||
4427 | int map_pos = 0, i, nid; | ||
4428 | unsigned long start_pfn, end_pfn; | ||
4429 | |||
4430 | if (!movablemem_map.nr_map) | ||
4431 | return; | ||
4432 | |||
4433 | /* Iterate all ranges from minimum to maximum */ | ||
4434 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { | ||
4435 | /* | ||
4436 | * If we have found lowest pfn of ZONE_MOVABLE of the node | ||
4437 | * specified by user, just go on to check next range. | ||
4438 | */ | ||
4439 | if (zone_movable_limit[nid]) | ||
4440 | continue; | ||
4441 | |||
4442 | #ifdef CONFIG_ZONE_DMA | ||
4443 | /* Skip DMA memory. */ | ||
4444 | if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA]) | ||
4445 | start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA]; | ||
4446 | #endif | ||
4447 | |||
4448 | #ifdef CONFIG_ZONE_DMA32 | ||
4449 | /* Skip DMA32 memory. */ | ||
4450 | if (start_pfn < arch_zone_highest_possible_pfn[ZONE_DMA32]) | ||
4451 | start_pfn = arch_zone_highest_possible_pfn[ZONE_DMA32]; | ||
4452 | #endif | ||
4453 | |||
4454 | #ifdef CONFIG_HIGHMEM | ||
4455 | /* Skip lowmem if ZONE_MOVABLE is highmem. */ | ||
4456 | if (zone_movable_is_highmem() && | ||
4457 | start_pfn < arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]) | ||
4458 | start_pfn = arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]; | ||
4459 | #endif | ||
4460 | |||
4461 | if (start_pfn >= end_pfn) | ||
4462 | continue; | ||
4463 | |||
4464 | while (map_pos < movablemem_map.nr_map) { | ||
4465 | if (end_pfn <= movablemem_map.map[map_pos].start_pfn) | ||
4466 | break; | ||
4467 | |||
4468 | if (start_pfn >= movablemem_map.map[map_pos].end_pfn) { | ||
4469 | map_pos++; | ||
4470 | continue; | ||
4471 | } | ||
4472 | |||
4473 | /* | ||
4474 | * The start_pfn of ZONE_MOVABLE is either the minimum | ||
4475 | * pfn specified by movablemem_map, or 0, which means | ||
4476 | * the node has no ZONE_MOVABLE. | ||
4477 | */ | ||
4478 | zone_movable_limit[nid] = max(start_pfn, | ||
4479 | movablemem_map.map[map_pos].start_pfn); | ||
4480 | |||
4481 | break; | ||
4482 | } | ||
4483 | } | ||
4484 | } | ||
4485 | |||
4375 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4486 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
4376 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, | 4487 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, |
4377 | unsigned long zone_type, | 4488 | unsigned long zone_type, |
@@ -4389,7 +4500,6 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
4389 | 4500 | ||
4390 | return zholes_size[zone_type]; | 4501 | return zholes_size[zone_type]; |
4391 | } | 4502 | } |
4392 | |||
4393 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4503 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
4394 | 4504 | ||
4395 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | 4505 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, |
@@ -4573,7 +4683,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4573 | nr_all_pages += freesize; | 4683 | nr_all_pages += freesize; |
4574 | 4684 | ||
4575 | zone->spanned_pages = size; | 4685 | zone->spanned_pages = size; |
4576 | zone->present_pages = freesize; | 4686 | zone->present_pages = realsize; |
4577 | /* | 4687 | /* |
4578 | * Set an approximate value for lowmem here, it will be adjusted | 4688 | * Set an approximate value for lowmem here, it will be adjusted |
4579 | * when the bootmem allocator frees pages into the buddy system. | 4689 | * when the bootmem allocator frees pages into the buddy system. |
@@ -4625,7 +4735,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | |||
4625 | * for the buddy allocator to function correctly. | 4735 | * for the buddy allocator to function correctly. |
4626 | */ | 4736 | */ |
4627 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); | 4737 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); |
4628 | end = pgdat->node_start_pfn + pgdat->node_spanned_pages; | 4738 | end = pgdat_end_pfn(pgdat); |
4629 | end = ALIGN(end, MAX_ORDER_NR_PAGES); | 4739 | end = ALIGN(end, MAX_ORDER_NR_PAGES); |
4630 | size = (end - start) * sizeof(struct page); | 4740 | size = (end - start) * sizeof(struct page); |
4631 | map = alloc_remap(pgdat->node_id, size); | 4741 | map = alloc_remap(pgdat->node_id, size); |
@@ -4831,12 +4941,19 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
4831 | required_kernelcore = max(required_kernelcore, corepages); | 4941 | required_kernelcore = max(required_kernelcore, corepages); |
4832 | } | 4942 | } |
4833 | 4943 | ||
4834 | /* If kernelcore was not specified, there is no ZONE_MOVABLE */ | 4944 | /* |
4835 | if (!required_kernelcore) | 4945 | * If neither kernelcore/movablecore nor movablemem_map is specified, |
4946 | * there is no ZONE_MOVABLE. But if movablemem_map is specified, the | ||
4947 | * start pfn of ZONE_MOVABLE has been stored in zone_movable_limit[]. | ||
4948 | */ | ||
4949 | if (!required_kernelcore) { | ||
4950 | if (movablemem_map.nr_map) | ||
4951 | memcpy(zone_movable_pfn, zone_movable_limit, | ||
4952 | sizeof(zone_movable_pfn)); | ||
4836 | goto out; | 4953 | goto out; |
4954 | } | ||
4837 | 4955 | ||
4838 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ | 4956 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ |
4839 | find_usable_zone_for_movable(); | ||
4840 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; | 4957 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; |
4841 | 4958 | ||
4842 | restart: | 4959 | restart: |
@@ -4864,10 +4981,24 @@ restart: | |||
4864 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { | 4981 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { |
4865 | unsigned long size_pages; | 4982 | unsigned long size_pages; |
4866 | 4983 | ||
4984 | /* | ||
4985 | * Find more memory for kernelcore in | ||
4986 | * [zone_movable_pfn[nid], zone_movable_limit[nid]). | ||
4987 | */ | ||
4867 | start_pfn = max(start_pfn, zone_movable_pfn[nid]); | 4988 | start_pfn = max(start_pfn, zone_movable_pfn[nid]); |
4868 | if (start_pfn >= end_pfn) | 4989 | if (start_pfn >= end_pfn) |
4869 | continue; | 4990 | continue; |
4870 | 4991 | ||
4992 | if (zone_movable_limit[nid]) { | ||
4993 | end_pfn = min(end_pfn, zone_movable_limit[nid]); | ||
4994 | /* No range left for kernelcore in this node */ | ||
4995 | if (start_pfn >= end_pfn) { | ||
4996 | zone_movable_pfn[nid] = | ||
4997 | zone_movable_limit[nid]; | ||
4998 | break; | ||
4999 | } | ||
5000 | } | ||
5001 | |||
4871 | /* Account for what is only usable for kernelcore */ | 5002 | /* Account for what is only usable for kernelcore */ |
4872 | if (start_pfn < usable_startpfn) { | 5003 | if (start_pfn < usable_startpfn) { |
4873 | unsigned long kernel_pages; | 5004 | unsigned long kernel_pages; |
@@ -4927,12 +5058,12 @@ restart: | |||
4927 | if (usable_nodes && required_kernelcore > usable_nodes) | 5058 | if (usable_nodes && required_kernelcore > usable_nodes) |
4928 | goto restart; | 5059 | goto restart; |
4929 | 5060 | ||
5061 | out: | ||
4930 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ | 5062 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ |
4931 | for (nid = 0; nid < MAX_NUMNODES; nid++) | 5063 | for (nid = 0; nid < MAX_NUMNODES; nid++) |
4932 | zone_movable_pfn[nid] = | 5064 | zone_movable_pfn[nid] = |
4933 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); | 5065 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); |
4934 | 5066 | ||
4935 | out: | ||
4936 | /* restore the node_state */ | 5067 | /* restore the node_state */ |
4937 | node_states[N_MEMORY] = saved_node_state; | 5068 | node_states[N_MEMORY] = saved_node_state; |
4938 | } | 5069 | } |
@@ -4995,6 +5126,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4995 | 5126 | ||
4996 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ | 5127 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ |
4997 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); | 5128 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); |
5129 | find_usable_zone_for_movable(); | ||
5130 | sanitize_zone_movable_limit(); | ||
4998 | find_zone_movable_pfns_for_nodes(); | 5131 | find_zone_movable_pfns_for_nodes(); |
4999 | 5132 | ||
5000 | /* Print out the zone ranges */ | 5133 | /* Print out the zone ranges */ |
@@ -5078,6 +5211,181 @@ static int __init cmdline_parse_movablecore(char *p) | |||
5078 | early_param("kernelcore", cmdline_parse_kernelcore); | 5211 | early_param("kernelcore", cmdline_parse_kernelcore); |
5079 | early_param("movablecore", cmdline_parse_movablecore); | 5212 | early_param("movablecore", cmdline_parse_movablecore); |
5080 | 5213 | ||
5214 | /** | ||
5215 | * movablemem_map_overlap() - Check if a range overlaps movablemem_map.map[]. | ||
5216 | * @start_pfn: start pfn of the range to be checked | ||
5217 | * @end_pfn: end pfn of the range to be checked (exclusive) | ||
5218 | * | ||
5219 | * This function checks if a given memory range [start_pfn, end_pfn) overlaps | ||
5220 | * the movablemem_map.map[] array. | ||
5221 | * | ||
5222 | * Return: index of the first overlapped element in movablemem_map.map[] | ||
5223 | * or -1 if they don't overlap each other. | ||
5224 | */ | ||
5225 | int __init movablemem_map_overlap(unsigned long start_pfn, | ||
5226 | unsigned long end_pfn) | ||
5227 | { | ||
5228 | int overlap; | ||
5229 | |||
5230 | if (!movablemem_map.nr_map) | ||
5231 | return -1; | ||
5232 | |||
5233 | for (overlap = 0; overlap < movablemem_map.nr_map; overlap++) | ||
5234 | if (start_pfn < movablemem_map.map[overlap].end_pfn) | ||
5235 | break; | ||
5236 | |||
5237 | if (overlap == movablemem_map.nr_map || | ||
5238 | end_pfn <= movablemem_map.map[overlap].start_pfn) | ||
5239 | return -1; | ||
5240 | |||
5241 | return overlap; | ||
5242 | } | ||
5243 | |||
5244 | /** | ||
5245 | * insert_movablemem_map - Insert a memory range into movablemem_map.map. | ||
5246 | * @start_pfn: start pfn of the range | ||
5247 | * @end_pfn: end pfn of the range | ||
5248 | * | ||
5249 | * This function will also merge the overlapped ranges, and sort the array | ||
5250 | * by start_pfn in monotonic increasing order. | ||
5251 | */ | ||
5252 | void __init insert_movablemem_map(unsigned long start_pfn, | ||
5253 | unsigned long end_pfn) | ||
5254 | { | ||
5255 | int pos, overlap; | ||
5256 | |||
5257 | /* | ||
5258 | * pos will be at the 1st overlapped range, or the position | ||
5259 | * where the element should be inserted. | ||
5260 | */ | ||
5261 | for (pos = 0; pos < movablemem_map.nr_map; pos++) | ||
5262 | if (start_pfn <= movablemem_map.map[pos].end_pfn) | ||
5263 | break; | ||
5264 | |||
5265 | /* If there is no overlapped range, just insert the element. */ | ||
5266 | if (pos == movablemem_map.nr_map || | ||
5267 | end_pfn < movablemem_map.map[pos].start_pfn) { | ||
5268 | /* | ||
5269 | * If pos is not the end of array, we need to move all | ||
5270 | * the rest elements backward. | ||
5271 | */ | ||
5272 | if (pos < movablemem_map.nr_map) | ||
5273 | memmove(&movablemem_map.map[pos+1], | ||
5274 | &movablemem_map.map[pos], | ||
5275 | sizeof(struct movablemem_entry) * | ||
5276 | (movablemem_map.nr_map - pos)); | ||
5277 | movablemem_map.map[pos].start_pfn = start_pfn; | ||
5278 | movablemem_map.map[pos].end_pfn = end_pfn; | ||
5279 | movablemem_map.nr_map++; | ||
5280 | return; | ||
5281 | } | ||
5282 | |||
5283 | /* overlap will be at the last overlapped range */ | ||
5284 | for (overlap = pos + 1; overlap < movablemem_map.nr_map; overlap++) | ||
5285 | if (end_pfn < movablemem_map.map[overlap].start_pfn) | ||
5286 | break; | ||
5287 | |||
5288 | /* | ||
5289 | * If there are more ranges overlapped, we need to merge them, | ||
5290 | * and move the rest elements forward. | ||
5291 | */ | ||
5292 | overlap--; | ||
5293 | movablemem_map.map[pos].start_pfn = min(start_pfn, | ||
5294 | movablemem_map.map[pos].start_pfn); | ||
5295 | movablemem_map.map[pos].end_pfn = max(end_pfn, | ||
5296 | movablemem_map.map[overlap].end_pfn); | ||
5297 | |||
5298 | if (pos != overlap && overlap + 1 != movablemem_map.nr_map) | ||
5299 | memmove(&movablemem_map.map[pos+1], | ||
5300 | &movablemem_map.map[overlap+1], | ||
5301 | sizeof(struct movablemem_entry) * | ||
5302 | (movablemem_map.nr_map - overlap - 1)); | ||
5303 | |||
5304 | movablemem_map.nr_map -= overlap - pos; | ||
5305 | } | ||
5306 | |||
5307 | /** | ||
5308 | * movablemem_map_add_region - Add a memory range into movablemem_map. | ||
5309 | * @start: physical start address of range | ||
5310 | * @size: size of the range in bytes | ||
5311 | * | ||
5312 | * This function transforms the physical address into pfn, and then adds the | ||
5313 | * range into movablemem_map by calling insert_movablemem_map(). | ||
5314 | */ | ||
5315 | static void __init movablemem_map_add_region(u64 start, u64 size) | ||
5316 | { | ||
5317 | unsigned long start_pfn, end_pfn; | ||
5318 | |||
5319 | /* In case size == 0 or start + size overflows */ | ||
5320 | if (start + size <= start) | ||
5321 | return; | ||
5322 | |||
5323 | if (movablemem_map.nr_map >= ARRAY_SIZE(movablemem_map.map)) { | ||
5324 | pr_err("movablemem_map: too many entries;" | ||
5325 | " ignoring [mem %#010llx-%#010llx]\n", | ||
5326 | (unsigned long long) start, | ||
5327 | (unsigned long long) (start + size - 1)); | ||
5328 | return; | ||
5329 | } | ||
5330 | |||
5331 | start_pfn = PFN_DOWN(start); | ||
5332 | end_pfn = PFN_UP(start + size); | ||
5333 | insert_movablemem_map(start_pfn, end_pfn); | ||
5334 | } | ||
5335 | |||
5336 | /* | ||
5337 | * cmdline_parse_movablemem_map - Parse boot option movablemem_map. | ||
5338 | * @p: The boot option of the following format: | ||
5339 | * movablemem_map=nn[KMG]@ss[KMG] | ||
5340 | * | ||
5341 | * This option sets the memory range [ss, ss+nn) to be used as movable memory. | ||
5342 | * | ||
5343 | * Return: 0 on success or -EINVAL on failure. | ||
5344 | */ | ||
5345 | static int __init cmdline_parse_movablemem_map(char *p) | ||
5346 | { | ||
5347 | char *oldp; | ||
5348 | u64 start_at, mem_size; | ||
5349 | |||
5350 | if (!p) | ||
5351 | goto err; | ||
5352 | |||
5353 | if (!strcmp(p, "acpi")) | ||
5354 | movablemem_map.acpi = true; | ||
5355 | |||
5356 | /* | ||
5357 | * If the user decides to use info from BIOS, all the other user-specified | ||
5358 | * ranges will be ignored. | ||
5359 | */ | ||
5360 | if (movablemem_map.acpi) { | ||
5361 | if (movablemem_map.nr_map) { | ||
5362 | memset(movablemem_map.map, 0, | ||
5363 | sizeof(struct movablemem_entry) | ||
5364 | * movablemem_map.nr_map); | ||
5365 | movablemem_map.nr_map = 0; | ||
5366 | } | ||
5367 | return 0; | ||
5368 | } | ||
5369 | |||
5370 | oldp = p; | ||
5371 | mem_size = memparse(p, &p); | ||
5372 | if (p == oldp) | ||
5373 | goto err; | ||
5374 | |||
5375 | if (*p == '@') { | ||
5376 | oldp = ++p; | ||
5377 | start_at = memparse(p, &p); | ||
5378 | if (p == oldp || *p != '\0') | ||
5379 | goto err; | ||
5380 | |||
5381 | movablemem_map_add_region(start_at, mem_size); | ||
5382 | return 0; | ||
5383 | } | ||
5384 | err: | ||
5385 | return -EINVAL; | ||
5386 | } | ||
5387 | early_param("movablemem_map", cmdline_parse_movablemem_map); | ||
5388 | |||
5081 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 5389 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
5082 | 5390 | ||
5083 | /** | 5391 | /** |
@@ -5160,8 +5468,8 @@ static void calculate_totalreserve_pages(void) | |||
5160 | /* we treat the high watermark as reserved pages. */ | 5468 | /* we treat the high watermark as reserved pages. */ |
5161 | max += high_wmark_pages(zone); | 5469 | max += high_wmark_pages(zone); |
5162 | 5470 | ||
5163 | if (max > zone->present_pages) | 5471 | if (max > zone->managed_pages) |
5164 | max = zone->present_pages; | 5472 | max = zone->managed_pages; |
5165 | reserve_pages += max; | 5473 | reserve_pages += max; |
5166 | /* | 5474 | /* |
5167 | * Lowmem reserves are not available to | 5475 | * Lowmem reserves are not available to |
@@ -5193,7 +5501,7 @@ static void setup_per_zone_lowmem_reserve(void) | |||
5193 | for_each_online_pgdat(pgdat) { | 5501 | for_each_online_pgdat(pgdat) { |
5194 | for (j = 0; j < MAX_NR_ZONES; j++) { | 5502 | for (j = 0; j < MAX_NR_ZONES; j++) { |
5195 | struct zone *zone = pgdat->node_zones + j; | 5503 | struct zone *zone = pgdat->node_zones + j; |
5196 | unsigned long present_pages = zone->present_pages; | 5504 | unsigned long managed_pages = zone->managed_pages; |
5197 | 5505 | ||
5198 | zone->lowmem_reserve[j] = 0; | 5506 | zone->lowmem_reserve[j] = 0; |
5199 | 5507 | ||
@@ -5207,9 +5515,9 @@ static void setup_per_zone_lowmem_reserve(void) | |||
5207 | sysctl_lowmem_reserve_ratio[idx] = 1; | 5515 | sysctl_lowmem_reserve_ratio[idx] = 1; |
5208 | 5516 | ||
5209 | lower_zone = pgdat->node_zones + idx; | 5517 | lower_zone = pgdat->node_zones + idx; |
5210 | lower_zone->lowmem_reserve[j] = present_pages / | 5518 | lower_zone->lowmem_reserve[j] = managed_pages / |
5211 | sysctl_lowmem_reserve_ratio[idx]; | 5519 | sysctl_lowmem_reserve_ratio[idx]; |
5212 | present_pages += lower_zone->present_pages; | 5520 | managed_pages += lower_zone->managed_pages; |
5213 | } | 5521 | } |
5214 | } | 5522 | } |
5215 | } | 5523 | } |
@@ -5228,14 +5536,14 @@ static void __setup_per_zone_wmarks(void) | |||
5228 | /* Calculate total number of !ZONE_HIGHMEM pages */ | 5536 | /* Calculate total number of !ZONE_HIGHMEM pages */ |
5229 | for_each_zone(zone) { | 5537 | for_each_zone(zone) { |
5230 | if (!is_highmem(zone)) | 5538 | if (!is_highmem(zone)) |
5231 | lowmem_pages += zone->present_pages; | 5539 | lowmem_pages += zone->managed_pages; |
5232 | } | 5540 | } |
5233 | 5541 | ||
5234 | for_each_zone(zone) { | 5542 | for_each_zone(zone) { |
5235 | u64 tmp; | 5543 | u64 tmp; |
5236 | 5544 | ||
5237 | spin_lock_irqsave(&zone->lock, flags); | 5545 | spin_lock_irqsave(&zone->lock, flags); |
5238 | tmp = (u64)pages_min * zone->present_pages; | 5546 | tmp = (u64)pages_min * zone->managed_pages; |
5239 | do_div(tmp, lowmem_pages); | 5547 | do_div(tmp, lowmem_pages); |
5240 | if (is_highmem(zone)) { | 5548 | if (is_highmem(zone)) { |
5241 | /* | 5549 | /* |
@@ -5247,13 +5555,10 @@ static void __setup_per_zone_wmarks(void) | |||
5247 | * deltas controls asynch page reclaim, and so should | 5555 | * deltas controls asynch page reclaim, and so should |
5248 | * not be capped for highmem. | 5556 | * not be capped for highmem. |
5249 | */ | 5557 | */ |
5250 | int min_pages; | 5558 | unsigned long min_pages; |
5251 | 5559 | ||
5252 | min_pages = zone->present_pages / 1024; | 5560 | min_pages = zone->managed_pages / 1024; |
5253 | if (min_pages < SWAP_CLUSTER_MAX) | 5561 | min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); |
5254 | min_pages = SWAP_CLUSTER_MAX; | ||
5255 | if (min_pages > 128) | ||
5256 | min_pages = 128; | ||
5257 | zone->watermark[WMARK_MIN] = min_pages; | 5562 | zone->watermark[WMARK_MIN] = min_pages; |
5258 | } else { | 5563 | } else { |
5259 | /* | 5564 | /* |
@@ -5314,7 +5619,7 @@ static void __meminit calculate_zone_inactive_ratio(struct zone *zone) | |||
5314 | unsigned int gb, ratio; | 5619 | unsigned int gb, ratio; |
5315 | 5620 | ||
5316 | /* Zone size in gigabytes */ | 5621 | /* Zone size in gigabytes */ |
5317 | gb = zone->present_pages >> (30 - PAGE_SHIFT); | 5622 | gb = zone->managed_pages >> (30 - PAGE_SHIFT); |
5318 | if (gb) | 5623 | if (gb) |
5319 | ratio = int_sqrt(10 * gb); | 5624 | ratio = int_sqrt(10 * gb); |
5320 | else | 5625 | else |
@@ -5400,7 +5705,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, | |||
5400 | return rc; | 5705 | return rc; |
5401 | 5706 | ||
5402 | for_each_zone(zone) | 5707 | for_each_zone(zone) |
5403 | zone->min_unmapped_pages = (zone->present_pages * | 5708 | zone->min_unmapped_pages = (zone->managed_pages * |
5404 | sysctl_min_unmapped_ratio) / 100; | 5709 | sysctl_min_unmapped_ratio) / 100; |
5405 | return 0; | 5710 | return 0; |
5406 | } | 5711 | } |
@@ -5416,7 +5721,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | |||
5416 | return rc; | 5721 | return rc; |
5417 | 5722 | ||
5418 | for_each_zone(zone) | 5723 | for_each_zone(zone) |
5419 | zone->min_slab_pages = (zone->present_pages * | 5724 | zone->min_slab_pages = (zone->managed_pages * |
5420 | sysctl_min_slab_ratio) / 100; | 5725 | sysctl_min_slab_ratio) / 100; |
5421 | return 0; | 5726 | return 0; |
5422 | } | 5727 | } |
@@ -5458,7 +5763,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
5458 | for_each_populated_zone(zone) { | 5763 | for_each_populated_zone(zone) { |
5459 | for_each_possible_cpu(cpu) { | 5764 | for_each_possible_cpu(cpu) { |
5460 | unsigned long high; | 5765 | unsigned long high; |
5461 | high = zone->present_pages / percpu_pagelist_fraction; | 5766 | high = zone->managed_pages / percpu_pagelist_fraction; |
5462 | setup_pagelist_highmark( | 5767 | setup_pagelist_highmark( |
5463 | per_cpu_ptr(zone->pageset, cpu), high); | 5768 | per_cpu_ptr(zone->pageset, cpu), high); |
5464 | } | 5769 | } |
@@ -5645,8 +5950,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5645 | pfn = page_to_pfn(page); | 5950 | pfn = page_to_pfn(page); |
5646 | bitmap = get_pageblock_bitmap(zone, pfn); | 5951 | bitmap = get_pageblock_bitmap(zone, pfn); |
5647 | bitidx = pfn_to_bitidx(zone, pfn); | 5952 | bitidx = pfn_to_bitidx(zone, pfn); |
5648 | VM_BUG_ON(pfn < zone->zone_start_pfn); | 5953 | VM_BUG_ON(!zone_spans_pfn(zone, pfn)); |
5649 | VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); | ||
5650 | 5954 | ||
5651 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 5955 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) |
5652 | if (flags & value) | 5956 | if (flags & value) |
@@ -5744,8 +6048,7 @@ bool is_pageblock_removable_nolock(struct page *page) | |||
5744 | 6048 | ||
5745 | zone = page_zone(page); | 6049 | zone = page_zone(page); |
5746 | pfn = page_to_pfn(page); | 6050 | pfn = page_to_pfn(page); |
5747 | if (zone->zone_start_pfn > pfn || | 6051 | if (!zone_spans_pfn(zone, pfn)) |
5748 | zone->zone_start_pfn + zone->spanned_pages <= pfn) | ||
5749 | return false; | 6052 | return false; |
5750 | 6053 | ||
5751 | return !has_unmovable_pages(zone, page, 0, true); | 6054 | return !has_unmovable_pages(zone, page, 0, true); |
@@ -5801,14 +6104,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
5801 | &cc->migratepages); | 6104 | &cc->migratepages); |
5802 | cc->nr_migratepages -= nr_reclaimed; | 6105 | cc->nr_migratepages -= nr_reclaimed; |
5803 | 6106 | ||
5804 | ret = migrate_pages(&cc->migratepages, | 6107 | ret = migrate_pages(&cc->migratepages, alloc_migrate_target, |
5805 | alloc_migrate_target, | 6108 | 0, MIGRATE_SYNC, MR_CMA); |
5806 | 0, false, MIGRATE_SYNC, | ||
5807 | MR_CMA); | ||
5808 | } | 6109 | } |
5809 | 6110 | if (ret < 0) { | |
5810 | putback_movable_pages(&cc->migratepages); | 6111 | putback_movable_pages(&cc->migratepages); |
5811 | return ret > 0 ? 0 : ret; | 6112 | return ret; |
6113 | } | ||
6114 | return 0; | ||
5812 | } | 6115 | } |
5813 | 6116 | ||
5814 | /** | 6117 | /** |
@@ -105,7 +105,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
105 | */ | 105 | */ |
106 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { | 106 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { |
107 | anon_vma_lock_write(anon_vma); | 107 | anon_vma_lock_write(anon_vma); |
108 | anon_vma_unlock(anon_vma); | 108 | anon_vma_unlock_write(anon_vma); |
109 | } | 109 | } |
110 | 110 | ||
111 | kmem_cache_free(anon_vma_cachep, anon_vma); | 111 | kmem_cache_free(anon_vma_cachep, anon_vma); |
@@ -191,7 +191,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
191 | avc = NULL; | 191 | avc = NULL; |
192 | } | 192 | } |
193 | spin_unlock(&mm->page_table_lock); | 193 | spin_unlock(&mm->page_table_lock); |
194 | anon_vma_unlock(anon_vma); | 194 | anon_vma_unlock_write(anon_vma); |
195 | 195 | ||
196 | if (unlikely(allocated)) | 196 | if (unlikely(allocated)) |
197 | put_anon_vma(allocated); | 197 | put_anon_vma(allocated); |
@@ -308,7 +308,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
308 | vma->anon_vma = anon_vma; | 308 | vma->anon_vma = anon_vma; |
309 | anon_vma_lock_write(anon_vma); | 309 | anon_vma_lock_write(anon_vma); |
310 | anon_vma_chain_link(vma, avc, anon_vma); | 310 | anon_vma_chain_link(vma, avc, anon_vma); |
311 | anon_vma_unlock(anon_vma); | 311 | anon_vma_unlock_write(anon_vma); |
312 | 312 | ||
313 | return 0; | 313 | return 0; |
314 | 314 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index 5dd56f6efdbd..1ad79243cb7b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -335,19 +335,19 @@ static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, | |||
335 | pgoff_t start, unsigned int nr_pages, | 335 | pgoff_t start, unsigned int nr_pages, |
336 | struct page **pages, pgoff_t *indices) | 336 | struct page **pages, pgoff_t *indices) |
337 | { | 337 | { |
338 | unsigned int i; | 338 | void **slot; |
339 | unsigned int ret; | 339 | unsigned int ret = 0; |
340 | unsigned int nr_found; | 340 | struct radix_tree_iter iter; |
341 | |||
342 | if (!nr_pages) | ||
343 | return 0; | ||
341 | 344 | ||
342 | rcu_read_lock(); | 345 | rcu_read_lock(); |
343 | restart: | 346 | restart: |
344 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | 347 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { |
345 | (void ***)pages, indices, start, nr_pages); | ||
346 | ret = 0; | ||
347 | for (i = 0; i < nr_found; i++) { | ||
348 | struct page *page; | 348 | struct page *page; |
349 | repeat: | 349 | repeat: |
350 | page = radix_tree_deref_slot((void **)pages[i]); | 350 | page = radix_tree_deref_slot(slot); |
351 | if (unlikely(!page)) | 351 | if (unlikely(!page)) |
352 | continue; | 352 | continue; |
353 | if (radix_tree_exception(page)) { | 353 | if (radix_tree_exception(page)) { |
@@ -364,17 +364,16 @@ repeat: | |||
364 | goto repeat; | 364 | goto repeat; |
365 | 365 | ||
366 | /* Has the page moved? */ | 366 | /* Has the page moved? */ |
367 | if (unlikely(page != *((void **)pages[i]))) { | 367 | if (unlikely(page != *slot)) { |
368 | page_cache_release(page); | 368 | page_cache_release(page); |
369 | goto repeat; | 369 | goto repeat; |
370 | } | 370 | } |
371 | export: | 371 | export: |
372 | indices[ret] = indices[i]; | 372 | indices[ret] = iter.index; |
373 | pages[ret] = page; | 373 | pages[ret] = page; |
374 | ret++; | 374 | if (++ret == nr_pages) |
375 | break; | ||
375 | } | 376 | } |
376 | if (unlikely(!ret && nr_found)) | ||
377 | goto restart; | ||
378 | rcu_read_unlock(); | 377 | rcu_read_unlock(); |
379 | return ret; | 378 | return ret; |
380 | } | 379 | } |
@@ -2386,6 +2385,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
2386 | bool remount) | 2385 | bool remount) |
2387 | { | 2386 | { |
2388 | char *this_char, *value, *rest; | 2387 | char *this_char, *value, *rest; |
2388 | struct mempolicy *mpol = NULL; | ||
2389 | uid_t uid; | 2389 | uid_t uid; |
2390 | gid_t gid; | 2390 | gid_t gid; |
2391 | 2391 | ||
@@ -2414,7 +2414,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
2414 | printk(KERN_ERR | 2414 | printk(KERN_ERR |
2415 | "tmpfs: No value for mount option '%s'\n", | 2415 | "tmpfs: No value for mount option '%s'\n", |
2416 | this_char); | 2416 | this_char); |
2417 | return 1; | 2417 | goto error; |
2418 | } | 2418 | } |
2419 | 2419 | ||
2420 | if (!strcmp(this_char,"size")) { | 2420 | if (!strcmp(this_char,"size")) { |
@@ -2463,19 +2463,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
2463 | if (!gid_valid(sbinfo->gid)) | 2463 | if (!gid_valid(sbinfo->gid)) |
2464 | goto bad_val; | 2464 | goto bad_val; |
2465 | } else if (!strcmp(this_char,"mpol")) { | 2465 | } else if (!strcmp(this_char,"mpol")) { |
2466 | if (mpol_parse_str(value, &sbinfo->mpol)) | 2466 | mpol_put(mpol); |
2467 | mpol = NULL; | ||
2468 | if (mpol_parse_str(value, &mpol)) | ||
2467 | goto bad_val; | 2469 | goto bad_val; |
2468 | } else { | 2470 | } else { |
2469 | printk(KERN_ERR "tmpfs: Bad mount option %s\n", | 2471 | printk(KERN_ERR "tmpfs: Bad mount option %s\n", |
2470 | this_char); | 2472 | this_char); |
2471 | return 1; | 2473 | goto error; |
2472 | } | 2474 | } |
2473 | } | 2475 | } |
2476 | sbinfo->mpol = mpol; | ||
2474 | return 0; | 2477 | return 0; |
2475 | 2478 | ||
2476 | bad_val: | 2479 | bad_val: |
2477 | printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", | 2480 | printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", |
2478 | value, this_char); | 2481 | value, this_char); |
2482 | error: | ||
2483 | mpol_put(mpol); | ||
2479 | return 1; | 2484 | return 1; |
2480 | 2485 | ||
2481 | } | 2486 | } |
@@ -2487,6 +2492,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) | |||
2487 | unsigned long inodes; | 2492 | unsigned long inodes; |
2488 | int error = -EINVAL; | 2493 | int error = -EINVAL; |
2489 | 2494 | ||
2495 | config.mpol = NULL; | ||
2490 | if (shmem_parse_options(data, &config, true)) | 2496 | if (shmem_parse_options(data, &config, true)) |
2491 | return error; | 2497 | return error; |
2492 | 2498 | ||
@@ -2511,8 +2517,13 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) | |||
2511 | sbinfo->max_inodes = config.max_inodes; | 2517 | sbinfo->max_inodes = config.max_inodes; |
2512 | sbinfo->free_inodes = config.max_inodes - inodes; | 2518 | sbinfo->free_inodes = config.max_inodes - inodes; |
2513 | 2519 | ||
2514 | mpol_put(sbinfo->mpol); | 2520 | /* |
2515 | sbinfo->mpol = config.mpol; /* transfers initial ref */ | 2521 | * Preserve previous mempolicy unless mpol remount option was specified. |
2522 | */ | ||
2523 | if (config.mpol) { | ||
2524 | mpol_put(sbinfo->mpol); | ||
2525 | sbinfo->mpol = config.mpol; /* transfers initial ref */ | ||
2526 | } | ||
2516 | out: | 2527 | out: |
2517 | spin_unlock(&sbinfo->stat_lock); | 2528 | spin_unlock(&sbinfo->stat_lock); |
2518 | return error; | 2529 | return error; |
@@ -2545,6 +2556,7 @@ static void shmem_put_super(struct super_block *sb) | |||
2545 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | 2556 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); |
2546 | 2557 | ||
2547 | percpu_counter_destroy(&sbinfo->used_blocks); | 2558 | percpu_counter_destroy(&sbinfo->used_blocks); |
2559 | mpol_put(sbinfo->mpol); | ||
2548 | kfree(sbinfo); | 2560 | kfree(sbinfo); |
2549 | sb->s_fs_info = NULL; | 2561 | sb->s_fs_info = NULL; |
2550 | } | 2562 | } |
@@ -360,7 +360,7 @@ static void slob_free(void *block, int size) | |||
360 | clear_slob_page_free(sp); | 360 | clear_slob_page_free(sp); |
361 | spin_unlock_irqrestore(&slob_lock, flags); | 361 | spin_unlock_irqrestore(&slob_lock, flags); |
362 | __ClearPageSlab(sp); | 362 | __ClearPageSlab(sp); |
363 | reset_page_mapcount(sp); | 363 | page_mapcount_reset(sp); |
364 | slob_free_pages(b, 0); | 364 | slob_free_pages(b, 0); |
365 | return; | 365 | return; |
366 | } | 366 | } |
@@ -1408,7 +1408,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1408 | __ClearPageSlab(page); | 1408 | __ClearPageSlab(page); |
1409 | 1409 | ||
1410 | memcg_release_pages(s, order); | 1410 | memcg_release_pages(s, order); |
1411 | reset_page_mapcount(page); | 1411 | page_mapcount_reset(page); |
1412 | if (current->reclaim_state) | 1412 | if (current->reclaim_state) |
1413 | current->reclaim_state->reclaimed_slab += pages; | 1413 | current->reclaim_state->reclaimed_slab += pages; |
1414 | __free_memcg_kmem_pages(page, order); | 1414 | __free_memcg_kmem_pages(page, order); |
diff --git a/mm/sparse.c b/mm/sparse.c index 6b5fb762e2ca..7ca6dc847947 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -615,10 +615,11 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | |||
615 | } | 615 | } |
616 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | 616 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) |
617 | { | 617 | { |
618 | return; /* XXX: Not implemented yet */ | 618 | vmemmap_free(memmap, nr_pages); |
619 | } | 619 | } |
620 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) | 620 | static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) |
621 | { | 621 | { |
622 | vmemmap_free(memmap, nr_pages); | ||
622 | } | 623 | } |
623 | #else | 624 | #else |
624 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | 625 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) |
@@ -697,7 +698,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) | |||
697 | /* | 698 | /* |
698 | * Check to see if allocation came from hot-plug-add | 699 | * Check to see if allocation came from hot-plug-add |
699 | */ | 700 | */ |
700 | if (PageSlab(usemap_page)) { | 701 | if (PageSlab(usemap_page) || PageCompound(usemap_page)) { |
701 | kfree(usemap); | 702 | kfree(usemap); |
702 | if (memmap) | 703 | if (memmap) |
703 | __kfree_section_memmap(memmap, PAGES_PER_SECTION); | 704 | __kfree_section_memmap(memmap, PAGES_PER_SECTION); |
@@ -782,7 +783,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | |||
782 | 783 | ||
783 | for (i = 0; i < PAGES_PER_SECTION; i++) { | 784 | for (i = 0; i < PAGES_PER_SECTION; i++) { |
784 | if (PageHWPoison(&memmap[i])) { | 785 | if (PageHWPoison(&memmap[i])) { |
785 | atomic_long_sub(1, &mce_bad_pages); | 786 | atomic_long_sub(1, &num_poisoned_pages); |
786 | ClearPageHWPoison(&memmap[i]); | 787 | ClearPageHWPoison(&memmap[i]); |
787 | } | 788 | } |
788 | } | 789 | } |
@@ -796,8 +797,10 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | |||
796 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | 797 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) |
797 | { | 798 | { |
798 | struct page *memmap = NULL; | 799 | struct page *memmap = NULL; |
799 | unsigned long *usemap = NULL; | 800 | unsigned long *usemap = NULL, flags; |
801 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
800 | 802 | ||
803 | pgdat_resize_lock(pgdat, &flags); | ||
801 | if (ms->section_mem_map) { | 804 | if (ms->section_mem_map) { |
802 | usemap = ms->pageblock_flags; | 805 | usemap = ms->pageblock_flags; |
803 | memmap = sparse_decode_mem_map(ms->section_mem_map, | 806 | memmap = sparse_decode_mem_map(ms->section_mem_map, |
@@ -805,6 +808,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | |||
805 | ms->section_mem_map = 0; | 808 | ms->section_mem_map = 0; |
806 | ms->pageblock_flags = NULL; | 809 | ms->pageblock_flags = NULL; |
807 | } | 810 | } |
811 | pgdat_resize_unlock(pgdat, &flags); | ||
808 | 812 | ||
809 | clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); | 813 | clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); |
810 | free_section_usemap(memmap, usemap); | 814 | free_section_usemap(memmap, usemap); |
@@ -855,9 +855,14 @@ EXPORT_SYMBOL(pagevec_lookup_tag); | |||
855 | void __init swap_setup(void) | 855 | void __init swap_setup(void) |
856 | { | 856 | { |
857 | unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); | 857 | unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); |
858 | |||
859 | #ifdef CONFIG_SWAP | 858 | #ifdef CONFIG_SWAP |
860 | bdi_init(swapper_space.backing_dev_info); | 859 | int i; |
860 | |||
861 | bdi_init(swapper_spaces[0].backing_dev_info); | ||
862 | for (i = 0; i < MAX_SWAPFILES; i++) { | ||
863 | spin_lock_init(&swapper_spaces[i].tree_lock); | ||
864 | INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); | ||
865 | } | ||
861 | #endif | 866 | #endif |
862 | 867 | ||
863 | /* Use a smaller cluster for small-memory machines */ | 868 | /* Use a smaller cluster for small-memory machines */ |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 0cb36fb1f61c..7efcf1525921 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -36,12 +36,12 @@ static struct backing_dev_info swap_backing_dev_info = { | |||
36 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, | 36 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, |
37 | }; | 37 | }; |
38 | 38 | ||
39 | struct address_space swapper_space = { | 39 | struct address_space swapper_spaces[MAX_SWAPFILES] = { |
40 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), | 40 | [0 ... MAX_SWAPFILES - 1] = { |
41 | .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), | 41 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), |
42 | .a_ops = &swap_aops, | 42 | .a_ops = &swap_aops, |
43 | .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), | 43 | .backing_dev_info = &swap_backing_dev_info, |
44 | .backing_dev_info = &swap_backing_dev_info, | 44 | } |
45 | }; | 45 | }; |
46 | 46 | ||
47 | #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) | 47 | #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) |
@@ -53,13 +53,24 @@ static struct { | |||
53 | unsigned long find_total; | 53 | unsigned long find_total; |
54 | } swap_cache_info; | 54 | } swap_cache_info; |
55 | 55 | ||
56 | unsigned long total_swapcache_pages(void) | ||
57 | { | ||
58 | int i; | ||
59 | unsigned long ret = 0; | ||
60 | |||
61 | for (i = 0; i < MAX_SWAPFILES; i++) | ||
62 | ret += swapper_spaces[i].nrpages; | ||
63 | return ret; | ||
64 | } | ||
65 | |||
56 | void show_swap_cache_info(void) | 66 | void show_swap_cache_info(void) |
57 | { | 67 | { |
58 | printk("%lu pages in swap cache\n", total_swapcache_pages); | 68 | printk("%lu pages in swap cache\n", total_swapcache_pages()); |
59 | printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", | 69 | printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", |
60 | swap_cache_info.add_total, swap_cache_info.del_total, | 70 | swap_cache_info.add_total, swap_cache_info.del_total, |
61 | swap_cache_info.find_success, swap_cache_info.find_total); | 71 | swap_cache_info.find_success, swap_cache_info.find_total); |
62 | printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); | 72 | printk("Free swap = %ldkB\n", |
73 | get_nr_swap_pages() << (PAGE_SHIFT - 10)); | ||
63 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); | 74 | printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); |
64 | } | 75 | } |
65 | 76 | ||
@@ -70,6 +81,7 @@ void show_swap_cache_info(void) | |||
70 | static int __add_to_swap_cache(struct page *page, swp_entry_t entry) | 81 | static int __add_to_swap_cache(struct page *page, swp_entry_t entry) |
71 | { | 82 | { |
72 | int error; | 83 | int error; |
84 | struct address_space *address_space; | ||
73 | 85 | ||
74 | VM_BUG_ON(!PageLocked(page)); | 86 | VM_BUG_ON(!PageLocked(page)); |
75 | VM_BUG_ON(PageSwapCache(page)); | 87 | VM_BUG_ON(PageSwapCache(page)); |
@@ -79,14 +91,16 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry) | |||
79 | SetPageSwapCache(page); | 91 | SetPageSwapCache(page); |
80 | set_page_private(page, entry.val); | 92 | set_page_private(page, entry.val); |
81 | 93 | ||
82 | spin_lock_irq(&swapper_space.tree_lock); | 94 | address_space = swap_address_space(entry); |
83 | error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); | 95 | spin_lock_irq(&address_space->tree_lock); |
96 | error = radix_tree_insert(&address_space->page_tree, | ||
97 | entry.val, page); | ||
84 | if (likely(!error)) { | 98 | if (likely(!error)) { |
85 | total_swapcache_pages++; | 99 | address_space->nrpages++; |
86 | __inc_zone_page_state(page, NR_FILE_PAGES); | 100 | __inc_zone_page_state(page, NR_FILE_PAGES); |
87 | INC_CACHE_INFO(add_total); | 101 | INC_CACHE_INFO(add_total); |
88 | } | 102 | } |
89 | spin_unlock_irq(&swapper_space.tree_lock); | 103 | spin_unlock_irq(&address_space->tree_lock); |
90 | 104 | ||
91 | if (unlikely(error)) { | 105 | if (unlikely(error)) { |
92 | /* | 106 | /* |
@@ -122,14 +136,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
122 | */ | 136 | */ |
123 | void __delete_from_swap_cache(struct page *page) | 137 | void __delete_from_swap_cache(struct page *page) |
124 | { | 138 | { |
139 | swp_entry_t entry; | ||
140 | struct address_space *address_space; | ||
141 | |||
125 | VM_BUG_ON(!PageLocked(page)); | 142 | VM_BUG_ON(!PageLocked(page)); |
126 | VM_BUG_ON(!PageSwapCache(page)); | 143 | VM_BUG_ON(!PageSwapCache(page)); |
127 | VM_BUG_ON(PageWriteback(page)); | 144 | VM_BUG_ON(PageWriteback(page)); |
128 | 145 | ||
129 | radix_tree_delete(&swapper_space.page_tree, page_private(page)); | 146 | entry.val = page_private(page); |
147 | address_space = swap_address_space(entry); | ||
148 | radix_tree_delete(&address_space->page_tree, page_private(page)); | ||
130 | set_page_private(page, 0); | 149 | set_page_private(page, 0); |
131 | ClearPageSwapCache(page); | 150 | ClearPageSwapCache(page); |
132 | total_swapcache_pages--; | 151 | address_space->nrpages--; |
133 | __dec_zone_page_state(page, NR_FILE_PAGES); | 152 | __dec_zone_page_state(page, NR_FILE_PAGES); |
134 | INC_CACHE_INFO(del_total); | 153 | INC_CACHE_INFO(del_total); |
135 | } | 154 | } |
@@ -195,12 +214,14 @@ int add_to_swap(struct page *page) | |||
195 | void delete_from_swap_cache(struct page *page) | 214 | void delete_from_swap_cache(struct page *page) |
196 | { | 215 | { |
197 | swp_entry_t entry; | 216 | swp_entry_t entry; |
217 | struct address_space *address_space; | ||
198 | 218 | ||
199 | entry.val = page_private(page); | 219 | entry.val = page_private(page); |
200 | 220 | ||
201 | spin_lock_irq(&swapper_space.tree_lock); | 221 | address_space = swap_address_space(entry); |
222 | spin_lock_irq(&address_space->tree_lock); | ||
202 | __delete_from_swap_cache(page); | 223 | __delete_from_swap_cache(page); |
203 | spin_unlock_irq(&swapper_space.tree_lock); | 224 | spin_unlock_irq(&address_space->tree_lock); |
204 | 225 | ||
205 | swapcache_free(entry, page); | 226 | swapcache_free(entry, page); |
206 | page_cache_release(page); | 227 | page_cache_release(page); |
@@ -263,7 +284,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) | |||
263 | { | 284 | { |
264 | struct page *page; | 285 | struct page *page; |
265 | 286 | ||
266 | page = find_get_page(&swapper_space, entry.val); | 287 | page = find_get_page(swap_address_space(entry), entry.val); |
267 | 288 | ||
268 | if (page) | 289 | if (page) |
269 | INC_CACHE_INFO(find_success); | 290 | INC_CACHE_INFO(find_success); |
@@ -290,7 +311,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
290 | * called after lookup_swap_cache() failed, re-calling | 311 | * called after lookup_swap_cache() failed, re-calling |
291 | * that would confuse statistics. | 312 | * that would confuse statistics. |
292 | */ | 313 | */ |
293 | found_page = find_get_page(&swapper_space, entry.val); | 314 | found_page = find_get_page(swap_address_space(entry), |
315 | entry.val); | ||
294 | if (found_page) | 316 | if (found_page) |
295 | break; | 317 | break; |
296 | 318 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index e97a0e5aea91..c72c648f750c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -47,9 +47,11 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**); | |||
47 | 47 | ||
48 | DEFINE_SPINLOCK(swap_lock); | 48 | DEFINE_SPINLOCK(swap_lock); |
49 | static unsigned int nr_swapfiles; | 49 | static unsigned int nr_swapfiles; |
50 | long nr_swap_pages; | 50 | atomic_long_t nr_swap_pages; |
51 | /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ | ||
51 | long total_swap_pages; | 52 | long total_swap_pages; |
52 | static int least_priority; | 53 | static int least_priority; |
54 | static atomic_t highest_priority_index = ATOMIC_INIT(-1); | ||
53 | 55 | ||
54 | static const char Bad_file[] = "Bad swap file entry "; | 56 | static const char Bad_file[] = "Bad swap file entry "; |
55 | static const char Unused_file[] = "Unused swap file entry "; | 57 | static const char Unused_file[] = "Unused swap file entry "; |
@@ -79,7 +81,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | |||
79 | struct page *page; | 81 | struct page *page; |
80 | int ret = 0; | 82 | int ret = 0; |
81 | 83 | ||
82 | page = find_get_page(&swapper_space, entry.val); | 84 | page = find_get_page(swap_address_space(entry), entry.val); |
83 | if (!page) | 85 | if (!page) |
84 | return 0; | 86 | return 0; |
85 | /* | 87 | /* |
@@ -223,7 +225,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
223 | si->lowest_alloc = si->max; | 225 | si->lowest_alloc = si->max; |
224 | si->highest_alloc = 0; | 226 | si->highest_alloc = 0; |
225 | } | 227 | } |
226 | spin_unlock(&swap_lock); | 228 | spin_unlock(&si->lock); |
227 | 229 | ||
228 | /* | 230 | /* |
229 | * If seek is expensive, start searching for new cluster from | 231 | * If seek is expensive, start searching for new cluster from |
@@ -242,7 +244,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
242 | if (si->swap_map[offset]) | 244 | if (si->swap_map[offset]) |
243 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 245 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
244 | else if (offset == last_in_cluster) { | 246 | else if (offset == last_in_cluster) { |
245 | spin_lock(&swap_lock); | 247 | spin_lock(&si->lock); |
246 | offset -= SWAPFILE_CLUSTER - 1; | 248 | offset -= SWAPFILE_CLUSTER - 1; |
247 | si->cluster_next = offset; | 249 | si->cluster_next = offset; |
248 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 250 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
@@ -263,7 +265,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
263 | if (si->swap_map[offset]) | 265 | if (si->swap_map[offset]) |
264 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 266 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
265 | else if (offset == last_in_cluster) { | 267 | else if (offset == last_in_cluster) { |
266 | spin_lock(&swap_lock); | 268 | spin_lock(&si->lock); |
267 | offset -= SWAPFILE_CLUSTER - 1; | 269 | offset -= SWAPFILE_CLUSTER - 1; |
268 | si->cluster_next = offset; | 270 | si->cluster_next = offset; |
269 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 271 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
@@ -277,7 +279,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
277 | } | 279 | } |
278 | 280 | ||
279 | offset = scan_base; | 281 | offset = scan_base; |
280 | spin_lock(&swap_lock); | 282 | spin_lock(&si->lock); |
281 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 283 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
282 | si->lowest_alloc = 0; | 284 | si->lowest_alloc = 0; |
283 | } | 285 | } |
@@ -293,9 +295,9 @@ checks: | |||
293 | /* reuse swap entry of cache-only swap if not busy. */ | 295 | /* reuse swap entry of cache-only swap if not busy. */ |
294 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 296 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
295 | int swap_was_freed; | 297 | int swap_was_freed; |
296 | spin_unlock(&swap_lock); | 298 | spin_unlock(&si->lock); |
297 | swap_was_freed = __try_to_reclaim_swap(si, offset); | 299 | swap_was_freed = __try_to_reclaim_swap(si, offset); |
298 | spin_lock(&swap_lock); | 300 | spin_lock(&si->lock); |
299 | /* entry was freed successfully, try to use this again */ | 301 | /* entry was freed successfully, try to use this again */ |
300 | if (swap_was_freed) | 302 | if (swap_was_freed) |
301 | goto checks; | 303 | goto checks; |
@@ -335,13 +337,13 @@ checks: | |||
335 | si->lowest_alloc <= last_in_cluster) | 337 | si->lowest_alloc <= last_in_cluster) |
336 | last_in_cluster = si->lowest_alloc - 1; | 338 | last_in_cluster = si->lowest_alloc - 1; |
337 | si->flags |= SWP_DISCARDING; | 339 | si->flags |= SWP_DISCARDING; |
338 | spin_unlock(&swap_lock); | 340 | spin_unlock(&si->lock); |
339 | 341 | ||
340 | if (offset < last_in_cluster) | 342 | if (offset < last_in_cluster) |
341 | discard_swap_cluster(si, offset, | 343 | discard_swap_cluster(si, offset, |
342 | last_in_cluster - offset + 1); | 344 | last_in_cluster - offset + 1); |
343 | 345 | ||
344 | spin_lock(&swap_lock); | 346 | spin_lock(&si->lock); |
345 | si->lowest_alloc = 0; | 347 | si->lowest_alloc = 0; |
346 | si->flags &= ~SWP_DISCARDING; | 348 | si->flags &= ~SWP_DISCARDING; |
347 | 349 | ||
@@ -355,10 +357,10 @@ checks: | |||
355 | * could defer that delay until swap_writepage, | 357 | * could defer that delay until swap_writepage, |
356 | * but it's easier to keep this self-contained. | 358 | * but it's easier to keep this self-contained. |
357 | */ | 359 | */ |
358 | spin_unlock(&swap_lock); | 360 | spin_unlock(&si->lock); |
359 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), | 361 | wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), |
360 | wait_for_discard, TASK_UNINTERRUPTIBLE); | 362 | wait_for_discard, TASK_UNINTERRUPTIBLE); |
361 | spin_lock(&swap_lock); | 363 | spin_lock(&si->lock); |
362 | } else { | 364 | } else { |
363 | /* | 365 | /* |
364 | * Note pages allocated by racing tasks while | 366 | * Note pages allocated by racing tasks while |
@@ -374,14 +376,14 @@ checks: | |||
374 | return offset; | 376 | return offset; |
375 | 377 | ||
376 | scan: | 378 | scan: |
377 | spin_unlock(&swap_lock); | 379 | spin_unlock(&si->lock); |
378 | while (++offset <= si->highest_bit) { | 380 | while (++offset <= si->highest_bit) { |
379 | if (!si->swap_map[offset]) { | 381 | if (!si->swap_map[offset]) { |
380 | spin_lock(&swap_lock); | 382 | spin_lock(&si->lock); |
381 | goto checks; | 383 | goto checks; |
382 | } | 384 | } |
383 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 385 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
384 | spin_lock(&swap_lock); | 386 | spin_lock(&si->lock); |
385 | goto checks; | 387 | goto checks; |
386 | } | 388 | } |
387 | if (unlikely(--latency_ration < 0)) { | 389 | if (unlikely(--latency_ration < 0)) { |
@@ -392,11 +394,11 @@ scan: | |||
392 | offset = si->lowest_bit; | 394 | offset = si->lowest_bit; |
393 | while (++offset < scan_base) { | 395 | while (++offset < scan_base) { |
394 | if (!si->swap_map[offset]) { | 396 | if (!si->swap_map[offset]) { |
395 | spin_lock(&swap_lock); | 397 | spin_lock(&si->lock); |
396 | goto checks; | 398 | goto checks; |
397 | } | 399 | } |
398 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | 400 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { |
399 | spin_lock(&swap_lock); | 401 | spin_lock(&si->lock); |
400 | goto checks; | 402 | goto checks; |
401 | } | 403 | } |
402 | if (unlikely(--latency_ration < 0)) { | 404 | if (unlikely(--latency_ration < 0)) { |
@@ -404,7 +406,7 @@ scan: | |||
404 | latency_ration = LATENCY_LIMIT; | 406 | latency_ration = LATENCY_LIMIT; |
405 | } | 407 | } |
406 | } | 408 | } |
407 | spin_lock(&swap_lock); | 409 | spin_lock(&si->lock); |
408 | 410 | ||
409 | no_page: | 411 | no_page: |
410 | si->flags -= SWP_SCANNING; | 412 | si->flags -= SWP_SCANNING; |
@@ -417,13 +419,34 @@ swp_entry_t get_swap_page(void) | |||
417 | pgoff_t offset; | 419 | pgoff_t offset; |
418 | int type, next; | 420 | int type, next; |
419 | int wrapped = 0; | 421 | int wrapped = 0; |
422 | int hp_index; | ||
420 | 423 | ||
421 | spin_lock(&swap_lock); | 424 | spin_lock(&swap_lock); |
422 | if (nr_swap_pages <= 0) | 425 | if (atomic_long_read(&nr_swap_pages) <= 0) |
423 | goto noswap; | 426 | goto noswap; |
424 | nr_swap_pages--; | 427 | atomic_long_dec(&nr_swap_pages); |
425 | 428 | ||
426 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 429 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { |
430 | hp_index = atomic_xchg(&highest_priority_index, -1); | ||
431 | /* | ||
432 | * highest_priority_index records current highest priority swap | ||
433 | * type which just frees swap entries. If its priority is | ||
434 | * higher than that of swap_list.next swap type, we use it. It | ||
435 | * isn't protected by swap_lock, so it can be an invalid value | ||
436 | * if the corresponding swap type is swapoff. We double check | ||
437 | * the flags here. It's even possible the swap type is swapoff | ||
438 | * and swapon again and its priority is changed. In such rare | ||
440 | * case, low priority swap type might be used, but eventually | ||
440 | * high priority swap will be used after several rounds of | ||
441 | * swap. | ||
442 | */ | ||
443 | if (hp_index != -1 && hp_index != type && | ||
444 | swap_info[type]->prio < swap_info[hp_index]->prio && | ||
445 | (swap_info[hp_index]->flags & SWP_WRITEOK)) { | ||
446 | type = hp_index; | ||
447 | swap_list.next = type; | ||
448 | } | ||
449 | |||
427 | si = swap_info[type]; | 450 | si = swap_info[type]; |
428 | next = si->next; | 451 | next = si->next; |
429 | if (next < 0 || | 452 | if (next < 0 || |
@@ -432,22 +455,29 @@ swp_entry_t get_swap_page(void) | |||
432 | wrapped++; | 455 | wrapped++; |
433 | } | 456 | } |
434 | 457 | ||
435 | if (!si->highest_bit) | 458 | spin_lock(&si->lock); |
459 | if (!si->highest_bit) { | ||
460 | spin_unlock(&si->lock); | ||
436 | continue; | 461 | continue; |
437 | if (!(si->flags & SWP_WRITEOK)) | 462 | } |
463 | if (!(si->flags & SWP_WRITEOK)) { | ||
464 | spin_unlock(&si->lock); | ||
438 | continue; | 465 | continue; |
466 | } | ||
439 | 467 | ||
440 | swap_list.next = next; | 468 | swap_list.next = next; |
469 | |||
470 | spin_unlock(&swap_lock); | ||
441 | /* This is called for allocating swap entry for cache */ | 471 | /* This is called for allocating swap entry for cache */ |
442 | offset = scan_swap_map(si, SWAP_HAS_CACHE); | 472 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
443 | if (offset) { | 473 | spin_unlock(&si->lock); |
444 | spin_unlock(&swap_lock); | 474 | if (offset) |
445 | return swp_entry(type, offset); | 475 | return swp_entry(type, offset); |
446 | } | 476 | spin_lock(&swap_lock); |
447 | next = swap_list.next; | 477 | next = swap_list.next; |
448 | } | 478 | } |
449 | 479 | ||
450 | nr_swap_pages++; | 480 | atomic_long_inc(&nr_swap_pages); |
451 | noswap: | 481 | noswap: |
452 | spin_unlock(&swap_lock); | 482 | spin_unlock(&swap_lock); |
453 | return (swp_entry_t) {0}; | 483 | return (swp_entry_t) {0}; |
@@ -459,19 +489,19 @@ swp_entry_t get_swap_page_of_type(int type) | |||
459 | struct swap_info_struct *si; | 489 | struct swap_info_struct *si; |
460 | pgoff_t offset; | 490 | pgoff_t offset; |
461 | 491 | ||
462 | spin_lock(&swap_lock); | ||
463 | si = swap_info[type]; | 492 | si = swap_info[type]; |
493 | spin_lock(&si->lock); | ||
464 | if (si && (si->flags & SWP_WRITEOK)) { | 494 | if (si && (si->flags & SWP_WRITEOK)) { |
465 | nr_swap_pages--; | 495 | atomic_long_dec(&nr_swap_pages); |
466 | /* This is called for allocating swap entry, not cache */ | 496 | /* This is called for allocating swap entry, not cache */ |
467 | offset = scan_swap_map(si, 1); | 497 | offset = scan_swap_map(si, 1); |
468 | if (offset) { | 498 | if (offset) { |
469 | spin_unlock(&swap_lock); | 499 | spin_unlock(&si->lock); |
470 | return swp_entry(type, offset); | 500 | return swp_entry(type, offset); |
471 | } | 501 | } |
472 | nr_swap_pages++; | 502 | atomic_long_inc(&nr_swap_pages); |
473 | } | 503 | } |
474 | spin_unlock(&swap_lock); | 504 | spin_unlock(&si->lock); |
475 | return (swp_entry_t) {0}; | 505 | return (swp_entry_t) {0}; |
476 | } | 506 | } |
477 | 507 | ||
@@ -493,7 +523,7 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) | |||
493 | goto bad_offset; | 523 | goto bad_offset; |
494 | if (!p->swap_map[offset]) | 524 | if (!p->swap_map[offset]) |
495 | goto bad_free; | 525 | goto bad_free; |
496 | spin_lock(&swap_lock); | 526 | spin_lock(&p->lock); |
497 | return p; | 527 | return p; |
498 | 528 | ||
499 | bad_free: | 529 | bad_free: |
@@ -511,6 +541,27 @@ out: | |||
511 | return NULL; | 541 | return NULL; |
512 | } | 542 | } |
513 | 543 | ||
544 | /* | ||
545 | * This swap type frees swap entry, check if it is the highest priority swap | ||
546 | * type which just frees swap entry. get_swap_page() uses | ||
547 | * highest_priority_index to search highest priority swap type. The | ||
548 | * swap_info_struct.lock can't protect us if there are multiple swap types | ||
549 | * active, so we use atomic_cmpxchg. | ||
550 | */ | ||
551 | static void set_highest_priority_index(int type) | ||
552 | { | ||
553 | int old_hp_index, new_hp_index; | ||
554 | |||
555 | do { | ||
556 | old_hp_index = atomic_read(&highest_priority_index); | ||
557 | if (old_hp_index != -1 && | ||
558 | swap_info[old_hp_index]->prio >= swap_info[type]->prio) | ||
559 | break; | ||
560 | new_hp_index = type; | ||
561 | } while (atomic_cmpxchg(&highest_priority_index, | ||
562 | old_hp_index, new_hp_index) != old_hp_index); | ||
563 | } | ||
564 | |||
514 | static unsigned char swap_entry_free(struct swap_info_struct *p, | 565 | static unsigned char swap_entry_free(struct swap_info_struct *p, |
515 | swp_entry_t entry, unsigned char usage) | 566 | swp_entry_t entry, unsigned char usage) |
516 | { | 567 | { |
@@ -553,10 +604,8 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
553 | p->lowest_bit = offset; | 604 | p->lowest_bit = offset; |
554 | if (offset > p->highest_bit) | 605 | if (offset > p->highest_bit) |
555 | p->highest_bit = offset; | 606 | p->highest_bit = offset; |
556 | if (swap_list.next >= 0 && | 607 | set_highest_priority_index(p->type); |
557 | p->prio > swap_info[swap_list.next]->prio) | 608 | atomic_long_inc(&nr_swap_pages); |
558 | swap_list.next = p->type; | ||
559 | nr_swap_pages++; | ||
560 | p->inuse_pages--; | 609 | p->inuse_pages--; |
561 | frontswap_invalidate_page(p->type, offset); | 610 | frontswap_invalidate_page(p->type, offset); |
562 | if (p->flags & SWP_BLKDEV) { | 611 | if (p->flags & SWP_BLKDEV) { |
@@ -581,7 +630,7 @@ void swap_free(swp_entry_t entry) | |||
581 | p = swap_info_get(entry); | 630 | p = swap_info_get(entry); |
582 | if (p) { | 631 | if (p) { |
583 | swap_entry_free(p, entry, 1); | 632 | swap_entry_free(p, entry, 1); |
584 | spin_unlock(&swap_lock); | 633 | spin_unlock(&p->lock); |
585 | } | 634 | } |
586 | } | 635 | } |
587 | 636 | ||
@@ -598,7 +647,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) | |||
598 | count = swap_entry_free(p, entry, SWAP_HAS_CACHE); | 647 | count = swap_entry_free(p, entry, SWAP_HAS_CACHE); |
599 | if (page) | 648 | if (page) |
600 | mem_cgroup_uncharge_swapcache(page, entry, count != 0); | 649 | mem_cgroup_uncharge_swapcache(page, entry, count != 0); |
601 | spin_unlock(&swap_lock); | 650 | spin_unlock(&p->lock); |
602 | } | 651 | } |
603 | } | 652 | } |
604 | 653 | ||
@@ -617,7 +666,7 @@ int page_swapcount(struct page *page) | |||
617 | p = swap_info_get(entry); | 666 | p = swap_info_get(entry); |
618 | if (p) { | 667 | if (p) { |
619 | count = swap_count(p->swap_map[swp_offset(entry)]); | 668 | count = swap_count(p->swap_map[swp_offset(entry)]); |
620 | spin_unlock(&swap_lock); | 669 | spin_unlock(&p->lock); |
621 | } | 670 | } |
622 | return count; | 671 | return count; |
623 | } | 672 | } |
@@ -699,13 +748,14 @@ int free_swap_and_cache(swp_entry_t entry) | |||
699 | p = swap_info_get(entry); | 748 | p = swap_info_get(entry); |
700 | if (p) { | 749 | if (p) { |
701 | if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { | 750 | if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { |
702 | page = find_get_page(&swapper_space, entry.val); | 751 | page = find_get_page(swap_address_space(entry), |
752 | entry.val); | ||
703 | if (page && !trylock_page(page)) { | 753 | if (page && !trylock_page(page)) { |
704 | page_cache_release(page); | 754 | page_cache_release(page); |
705 | page = NULL; | 755 | page = NULL; |
706 | } | 756 | } |
707 | } | 757 | } |
708 | spin_unlock(&swap_lock); | 758 | spin_unlock(&p->lock); |
709 | } | 759 | } |
710 | if (page) { | 760 | if (page) { |
711 | /* | 761 | /* |
@@ -803,11 +853,13 @@ unsigned int count_swap_pages(int type, int free) | |||
803 | if ((unsigned int)type < nr_swapfiles) { | 853 | if ((unsigned int)type < nr_swapfiles) { |
804 | struct swap_info_struct *sis = swap_info[type]; | 854 | struct swap_info_struct *sis = swap_info[type]; |
805 | 855 | ||
856 | spin_lock(&sis->lock); | ||
806 | if (sis->flags & SWP_WRITEOK) { | 857 | if (sis->flags & SWP_WRITEOK) { |
807 | n = sis->pages; | 858 | n = sis->pages; |
808 | if (free) | 859 | if (free) |
809 | n -= sis->inuse_pages; | 860 | n -= sis->inuse_pages; |
810 | } | 861 | } |
862 | spin_unlock(&sis->lock); | ||
811 | } | 863 | } |
812 | spin_unlock(&swap_lock); | 864 | spin_unlock(&swap_lock); |
813 | return n; | 865 | return n; |
@@ -822,11 +874,17 @@ unsigned int count_swap_pages(int type, int free) | |||
822 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | 874 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, |
823 | unsigned long addr, swp_entry_t entry, struct page *page) | 875 | unsigned long addr, swp_entry_t entry, struct page *page) |
824 | { | 876 | { |
877 | struct page *swapcache; | ||
825 | struct mem_cgroup *memcg; | 878 | struct mem_cgroup *memcg; |
826 | spinlock_t *ptl; | 879 | spinlock_t *ptl; |
827 | pte_t *pte; | 880 | pte_t *pte; |
828 | int ret = 1; | 881 | int ret = 1; |
829 | 882 | ||
883 | swapcache = page; | ||
884 | page = ksm_might_need_to_copy(page, vma, addr); | ||
885 | if (unlikely(!page)) | ||
886 | return -ENOMEM; | ||
887 | |||
830 | if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, | 888 | if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, |
831 | GFP_KERNEL, &memcg)) { | 889 | GFP_KERNEL, &memcg)) { |
832 | ret = -ENOMEM; | 890 | ret = -ENOMEM; |
@@ -845,7 +903,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
845 | get_page(page); | 903 | get_page(page); |
846 | set_pte_at(vma->vm_mm, addr, pte, | 904 | set_pte_at(vma->vm_mm, addr, pte, |
847 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 905 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
848 | page_add_anon_rmap(page, vma, addr); | 906 | if (page == swapcache) |
907 | page_add_anon_rmap(page, vma, addr); | ||
908 | else /* ksm created a completely new copy */ | ||
909 | page_add_new_anon_rmap(page, vma, addr); | ||
849 | mem_cgroup_commit_charge_swapin(page, memcg); | 910 | mem_cgroup_commit_charge_swapin(page, memcg); |
850 | swap_free(entry); | 911 | swap_free(entry); |
851 | /* | 912 | /* |
@@ -856,6 +917,10 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
856 | out: | 917 | out: |
857 | pte_unmap_unlock(pte, ptl); | 918 | pte_unmap_unlock(pte, ptl); |
858 | out_nolock: | 919 | out_nolock: |
920 | if (page != swapcache) { | ||
921 | unlock_page(page); | ||
922 | put_page(page); | ||
923 | } | ||
859 | return ret; | 924 | return ret; |
860 | } | 925 | } |
861 | 926 | ||
@@ -1456,7 +1521,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, | |||
1456 | p->swap_map = swap_map; | 1521 | p->swap_map = swap_map; |
1457 | frontswap_map_set(p, frontswap_map); | 1522 | frontswap_map_set(p, frontswap_map); |
1458 | p->flags |= SWP_WRITEOK; | 1523 | p->flags |= SWP_WRITEOK; |
1459 | nr_swap_pages += p->pages; | 1524 | atomic_long_add(p->pages, &nr_swap_pages); |
1460 | total_swap_pages += p->pages; | 1525 | total_swap_pages += p->pages; |
1461 | 1526 | ||
1462 | /* insert swap space into swap_list: */ | 1527 | /* insert swap space into swap_list: */ |
@@ -1478,15 +1543,19 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
1478 | unsigned long *frontswap_map) | 1543 | unsigned long *frontswap_map) |
1479 | { | 1544 | { |
1480 | spin_lock(&swap_lock); | 1545 | spin_lock(&swap_lock); |
1546 | spin_lock(&p->lock); | ||
1481 | _enable_swap_info(p, prio, swap_map, frontswap_map); | 1547 | _enable_swap_info(p, prio, swap_map, frontswap_map); |
1482 | frontswap_init(p->type); | 1548 | frontswap_init(p->type); |
1549 | spin_unlock(&p->lock); | ||
1483 | spin_unlock(&swap_lock); | 1550 | spin_unlock(&swap_lock); |
1484 | } | 1551 | } |
1485 | 1552 | ||
1486 | static void reinsert_swap_info(struct swap_info_struct *p) | 1553 | static void reinsert_swap_info(struct swap_info_struct *p) |
1487 | { | 1554 | { |
1488 | spin_lock(&swap_lock); | 1555 | spin_lock(&swap_lock); |
1556 | spin_lock(&p->lock); | ||
1489 | _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); | 1557 | _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); |
1558 | spin_unlock(&p->lock); | ||
1490 | spin_unlock(&swap_lock); | 1559 | spin_unlock(&swap_lock); |
1491 | } | 1560 | } |
1492 | 1561 | ||
@@ -1546,14 +1615,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1546 | /* just pick something that's safe... */ | 1615 | /* just pick something that's safe... */ |
1547 | swap_list.next = swap_list.head; | 1616 | swap_list.next = swap_list.head; |
1548 | } | 1617 | } |
1618 | spin_lock(&p->lock); | ||
1549 | if (p->prio < 0) { | 1619 | if (p->prio < 0) { |
1550 | for (i = p->next; i >= 0; i = swap_info[i]->next) | 1620 | for (i = p->next; i >= 0; i = swap_info[i]->next) |
1551 | swap_info[i]->prio = p->prio--; | 1621 | swap_info[i]->prio = p->prio--; |
1552 | least_priority++; | 1622 | least_priority++; |
1553 | } | 1623 | } |
1554 | nr_swap_pages -= p->pages; | 1624 | atomic_long_sub(p->pages, &nr_swap_pages); |
1555 | total_swap_pages -= p->pages; | 1625 | total_swap_pages -= p->pages; |
1556 | p->flags &= ~SWP_WRITEOK; | 1626 | p->flags &= ~SWP_WRITEOK; |
1627 | spin_unlock(&p->lock); | ||
1557 | spin_unlock(&swap_lock); | 1628 | spin_unlock(&swap_lock); |
1558 | 1629 | ||
1559 | set_current_oom_origin(); | 1630 | set_current_oom_origin(); |
@@ -1572,14 +1643,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1572 | 1643 | ||
1573 | mutex_lock(&swapon_mutex); | 1644 | mutex_lock(&swapon_mutex); |
1574 | spin_lock(&swap_lock); | 1645 | spin_lock(&swap_lock); |
1646 | spin_lock(&p->lock); | ||
1575 | drain_mmlist(); | 1647 | drain_mmlist(); |
1576 | 1648 | ||
1577 | /* wait for anyone still in scan_swap_map */ | 1649 | /* wait for anyone still in scan_swap_map */ |
1578 | p->highest_bit = 0; /* cuts scans short */ | 1650 | p->highest_bit = 0; /* cuts scans short */ |
1579 | while (p->flags >= SWP_SCANNING) { | 1651 | while (p->flags >= SWP_SCANNING) { |
1652 | spin_unlock(&p->lock); | ||
1580 | spin_unlock(&swap_lock); | 1653 | spin_unlock(&swap_lock); |
1581 | schedule_timeout_uninterruptible(1); | 1654 | schedule_timeout_uninterruptible(1); |
1582 | spin_lock(&swap_lock); | 1655 | spin_lock(&swap_lock); |
1656 | spin_lock(&p->lock); | ||
1583 | } | 1657 | } |
1584 | 1658 | ||
1585 | swap_file = p->swap_file; | 1659 | swap_file = p->swap_file; |
@@ -1589,6 +1663,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1589 | p->swap_map = NULL; | 1663 | p->swap_map = NULL; |
1590 | p->flags = 0; | 1664 | p->flags = 0; |
1591 | frontswap_invalidate_area(type); | 1665 | frontswap_invalidate_area(type); |
1666 | spin_unlock(&p->lock); | ||
1592 | spin_unlock(&swap_lock); | 1667 | spin_unlock(&swap_lock); |
1593 | mutex_unlock(&swapon_mutex); | 1668 | mutex_unlock(&swapon_mutex); |
1594 | vfree(swap_map); | 1669 | vfree(swap_map); |
@@ -1794,6 +1869,7 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
1794 | p->flags = SWP_USED; | 1869 | p->flags = SWP_USED; |
1795 | p->next = -1; | 1870 | p->next = -1; |
1796 | spin_unlock(&swap_lock); | 1871 | spin_unlock(&swap_lock); |
1872 | spin_lock_init(&p->lock); | ||
1797 | 1873 | ||
1798 | return p; | 1874 | return p; |
1799 | } | 1875 | } |
@@ -2116,7 +2192,7 @@ void si_swapinfo(struct sysinfo *val) | |||
2116 | if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) | 2192 | if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) |
2117 | nr_to_be_unused += si->inuse_pages; | 2193 | nr_to_be_unused += si->inuse_pages; |
2118 | } | 2194 | } |
2119 | val->freeswap = nr_swap_pages + nr_to_be_unused; | 2195 | val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; |
2120 | val->totalswap = total_swap_pages + nr_to_be_unused; | 2196 | val->totalswap = total_swap_pages + nr_to_be_unused; |
2121 | spin_unlock(&swap_lock); | 2197 | spin_unlock(&swap_lock); |
2122 | } | 2198 | } |
@@ -2149,7 +2225,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) | |||
2149 | p = swap_info[type]; | 2225 | p = swap_info[type]; |
2150 | offset = swp_offset(entry); | 2226 | offset = swp_offset(entry); |
2151 | 2227 | ||
2152 | spin_lock(&swap_lock); | 2228 | spin_lock(&p->lock); |
2153 | if (unlikely(offset >= p->max)) | 2229 | if (unlikely(offset >= p->max)) |
2154 | goto unlock_out; | 2230 | goto unlock_out; |
2155 | 2231 | ||
@@ -2184,7 +2260,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) | |||
2184 | p->swap_map[offset] = count | has_cache; | 2260 | p->swap_map[offset] = count | has_cache; |
2185 | 2261 | ||
2186 | unlock_out: | 2262 | unlock_out: |
2187 | spin_unlock(&swap_lock); | 2263 | spin_unlock(&p->lock); |
2188 | out: | 2264 | out: |
2189 | return err; | 2265 | return err; |
2190 | 2266 | ||
@@ -2309,7 +2385,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) | |||
2309 | } | 2385 | } |
2310 | 2386 | ||
2311 | if (!page) { | 2387 | if (!page) { |
2312 | spin_unlock(&swap_lock); | 2388 | spin_unlock(&si->lock); |
2313 | return -ENOMEM; | 2389 | return -ENOMEM; |
2314 | } | 2390 | } |
2315 | 2391 | ||
@@ -2357,7 +2433,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) | |||
2357 | list_add_tail(&page->lru, &head->lru); | 2433 | list_add_tail(&page->lru, &head->lru); |
2358 | page = NULL; /* now it's attached, don't free it */ | 2434 | page = NULL; /* now it's attached, don't free it */ |
2359 | out: | 2435 | out: |
2360 | spin_unlock(&swap_lock); | 2436 | spin_unlock(&si->lock); |
2361 | outer: | 2437 | outer: |
2362 | if (page) | 2438 | if (page) |
2363 | __free_page(page); | 2439 | __free_page(page); |
@@ -5,6 +5,8 @@ | |||
5 | #include <linux/err.h> | 5 | #include <linux/err.h> |
6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
7 | #include <linux/security.h> | 7 | #include <linux/security.h> |
8 | #include <linux/swap.h> | ||
9 | #include <linux/swapops.h> | ||
8 | #include <asm/uaccess.h> | 10 | #include <asm/uaccess.h> |
9 | 11 | ||
10 | #include "internal.h" | 12 | #include "internal.h" |
@@ -355,12 +357,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, | |||
355 | { | 357 | { |
356 | unsigned long ret; | 358 | unsigned long ret; |
357 | struct mm_struct *mm = current->mm; | 359 | struct mm_struct *mm = current->mm; |
360 | unsigned long populate; | ||
358 | 361 | ||
359 | ret = security_mmap_file(file, prot, flag); | 362 | ret = security_mmap_file(file, prot, flag); |
360 | if (!ret) { | 363 | if (!ret) { |
361 | down_write(&mm->mmap_sem); | 364 | down_write(&mm->mmap_sem); |
362 | ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff); | 365 | ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, |
366 | &populate); | ||
363 | up_write(&mm->mmap_sem); | 367 | up_write(&mm->mmap_sem); |
368 | if (populate) | ||
369 | mm_populate(ret, populate); | ||
364 | } | 370 | } |
365 | return ret; | 371 | return ret; |
366 | } | 372 | } |
@@ -378,6 +384,24 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, | |||
378 | } | 384 | } |
379 | EXPORT_SYMBOL(vm_mmap); | 385 | EXPORT_SYMBOL(vm_mmap); |
380 | 386 | ||
387 | struct address_space *page_mapping(struct page *page) | ||
388 | { | ||
389 | struct address_space *mapping = page->mapping; | ||
390 | |||
391 | VM_BUG_ON(PageSlab(page)); | ||
392 | #ifdef CONFIG_SWAP | ||
393 | if (unlikely(PageSwapCache(page))) { | ||
394 | swp_entry_t entry; | ||
395 | |||
396 | entry.val = page_private(page); | ||
397 | mapping = swap_address_space(entry); | ||
398 | } else | ||
399 | #endif | ||
400 | if ((unsigned long)mapping & PAGE_MAPPING_ANON) | ||
401 | mapping = NULL; | ||
402 | return mapping; | ||
403 | } | ||
404 | |||
381 | /* Tracepoints definitions. */ | 405 | /* Tracepoints definitions. */ |
382 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); | 406 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); |
383 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); | 407 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5123a169ab7b..0f751f2068c3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -1376,8 +1376,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
1376 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 1376 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, |
1377 | unsigned long start, unsigned long end) | 1377 | unsigned long start, unsigned long end) |
1378 | { | 1378 | { |
1379 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, | 1379 | return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, |
1380 | __builtin_return_address(0)); | 1380 | GFP_KERNEL, __builtin_return_address(0)); |
1381 | } | 1381 | } |
1382 | EXPORT_SYMBOL_GPL(__get_vm_area); | 1382 | EXPORT_SYMBOL_GPL(__get_vm_area); |
1383 | 1383 | ||
@@ -1385,8 +1385,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | |||
1385 | unsigned long start, unsigned long end, | 1385 | unsigned long start, unsigned long end, |
1386 | const void *caller) | 1386 | const void *caller) |
1387 | { | 1387 | { |
1388 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, | 1388 | return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, |
1389 | caller); | 1389 | GFP_KERNEL, caller); |
1390 | } | 1390 | } |
1391 | 1391 | ||
1392 | /** | 1392 | /** |
@@ -1401,14 +1401,15 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | |||
1401 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) | 1401 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) |
1402 | { | 1402 | { |
1403 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, | 1403 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
1404 | -1, GFP_KERNEL, __builtin_return_address(0)); | 1404 | NUMA_NO_NODE, GFP_KERNEL, |
1405 | __builtin_return_address(0)); | ||
1405 | } | 1406 | } |
1406 | 1407 | ||
1407 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, | 1408 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, |
1408 | const void *caller) | 1409 | const void *caller) |
1409 | { | 1410 | { |
1410 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, | 1411 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
1411 | -1, GFP_KERNEL, caller); | 1412 | NUMA_NO_NODE, GFP_KERNEL, caller); |
1412 | } | 1413 | } |
1413 | 1414 | ||
1414 | /** | 1415 | /** |
@@ -1650,7 +1651,7 @@ fail: | |||
1650 | * @end: vm area range end | 1651 | * @end: vm area range end |
1651 | * @gfp_mask: flags for the page level allocator | 1652 | * @gfp_mask: flags for the page level allocator |
1652 | * @prot: protection mask for the allocated pages | 1653 | * @prot: protection mask for the allocated pages |
1653 | * @node: node to use for allocation or -1 | 1654 | * @node: node to use for allocation or NUMA_NO_NODE |
1654 | * @caller: caller's return address | 1655 | * @caller: caller's return address |
1655 | * | 1656 | * |
1656 | * Allocate enough pages to cover @size from the page level | 1657 | * Allocate enough pages to cover @size from the page level |
@@ -1706,7 +1707,7 @@ fail: | |||
1706 | * @align: desired alignment | 1707 | * @align: desired alignment |
1707 | * @gfp_mask: flags for the page level allocator | 1708 | * @gfp_mask: flags for the page level allocator |
1708 | * @prot: protection mask for the allocated pages | 1709 | * @prot: protection mask for the allocated pages |
1709 | * @node: node to use for allocation or -1 | 1710 | * @node: node to use for allocation or NUMA_NO_NODE |
1710 | * @caller: caller's return address | 1711 | * @caller: caller's return address |
1711 | * | 1712 | * |
1712 | * Allocate enough pages to cover @size from the page level | 1713 | * Allocate enough pages to cover @size from the page level |
@@ -1723,7 +1724,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
1723 | 1724 | ||
1724 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 1725 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
1725 | { | 1726 | { |
1726 | return __vmalloc_node(size, 1, gfp_mask, prot, -1, | 1727 | return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, |
1727 | __builtin_return_address(0)); | 1728 | __builtin_return_address(0)); |
1728 | } | 1729 | } |
1729 | EXPORT_SYMBOL(__vmalloc); | 1730 | EXPORT_SYMBOL(__vmalloc); |
@@ -1746,7 +1747,8 @@ static inline void *__vmalloc_node_flags(unsigned long size, | |||
1746 | */ | 1747 | */ |
1747 | void *vmalloc(unsigned long size) | 1748 | void *vmalloc(unsigned long size) |
1748 | { | 1749 | { |
1749 | return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); | 1750 | return __vmalloc_node_flags(size, NUMA_NO_NODE, |
1751 | GFP_KERNEL | __GFP_HIGHMEM); | ||
1750 | } | 1752 | } |
1751 | EXPORT_SYMBOL(vmalloc); | 1753 | EXPORT_SYMBOL(vmalloc); |
1752 | 1754 | ||
@@ -1762,7 +1764,7 @@ EXPORT_SYMBOL(vmalloc); | |||
1762 | */ | 1764 | */ |
1763 | void *vzalloc(unsigned long size) | 1765 | void *vzalloc(unsigned long size) |
1764 | { | 1766 | { |
1765 | return __vmalloc_node_flags(size, -1, | 1767 | return __vmalloc_node_flags(size, NUMA_NO_NODE, |
1766 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); | 1768 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); |
1767 | } | 1769 | } |
1768 | EXPORT_SYMBOL(vzalloc); | 1770 | EXPORT_SYMBOL(vzalloc); |
@@ -1781,7 +1783,8 @@ void *vmalloc_user(unsigned long size) | |||
1781 | 1783 | ||
1782 | ret = __vmalloc_node(size, SHMLBA, | 1784 | ret = __vmalloc_node(size, SHMLBA, |
1783 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | 1785 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, |
1784 | PAGE_KERNEL, -1, __builtin_return_address(0)); | 1786 | PAGE_KERNEL, NUMA_NO_NODE, |
1787 | __builtin_return_address(0)); | ||
1785 | if (ret) { | 1788 | if (ret) { |
1786 | area = find_vm_area(ret); | 1789 | area = find_vm_area(ret); |
1787 | area->flags |= VM_USERMAP; | 1790 | area->flags |= VM_USERMAP; |
@@ -1846,7 +1849,7 @@ EXPORT_SYMBOL(vzalloc_node); | |||
1846 | void *vmalloc_exec(unsigned long size) | 1849 | void *vmalloc_exec(unsigned long size) |
1847 | { | 1850 | { |
1848 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, | 1851 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, |
1849 | -1, __builtin_return_address(0)); | 1852 | NUMA_NO_NODE, __builtin_return_address(0)); |
1850 | } | 1853 | } |
1851 | 1854 | ||
1852 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) | 1855 | #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) |
@@ -1867,7 +1870,7 @@ void *vmalloc_exec(unsigned long size) | |||
1867 | void *vmalloc_32(unsigned long size) | 1870 | void *vmalloc_32(unsigned long size) |
1868 | { | 1871 | { |
1869 | return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, | 1872 | return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, |
1870 | -1, __builtin_return_address(0)); | 1873 | NUMA_NO_NODE, __builtin_return_address(0)); |
1871 | } | 1874 | } |
1872 | EXPORT_SYMBOL(vmalloc_32); | 1875 | EXPORT_SYMBOL(vmalloc_32); |
1873 | 1876 | ||
@@ -1884,7 +1887,7 @@ void *vmalloc_32_user(unsigned long size) | |||
1884 | void *ret; | 1887 | void *ret; |
1885 | 1888 | ||
1886 | ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, | 1889 | ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, |
1887 | -1, __builtin_return_address(0)); | 1890 | NUMA_NO_NODE, __builtin_return_address(0)); |
1888 | if (ret) { | 1891 | if (ret) { |
1889 | area = find_vm_area(ret); | 1892 | area = find_vm_area(ret); |
1890 | area->flags |= VM_USERMAP; | 1893 | area->flags |= VM_USERMAP; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 196709f5ee58..88c5fed8b9a4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -128,7 +128,7 @@ struct scan_control { | |||
128 | * From 0 .. 100. Higher means more swappy. | 128 | * From 0 .. 100. Higher means more swappy. |
129 | */ | 129 | */ |
130 | int vm_swappiness = 60; | 130 | int vm_swappiness = 60; |
131 | long vm_total_pages; /* The total number of pages which the VM controls */ | 131 | unsigned long vm_total_pages; /* The total number of pages which the VM controls */ |
132 | 132 | ||
133 | static LIST_HEAD(shrinker_list); | 133 | static LIST_HEAD(shrinker_list); |
134 | static DECLARE_RWSEM(shrinker_rwsem); | 134 | static DECLARE_RWSEM(shrinker_rwsem); |
@@ -1579,16 +1579,6 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec) | |||
1579 | } | 1579 | } |
1580 | #endif | 1580 | #endif |
1581 | 1581 | ||
1582 | static int inactive_file_is_low_global(struct zone *zone) | ||
1583 | { | ||
1584 | unsigned long active, inactive; | ||
1585 | |||
1586 | active = zone_page_state(zone, NR_ACTIVE_FILE); | ||
1587 | inactive = zone_page_state(zone, NR_INACTIVE_FILE); | ||
1588 | |||
1589 | return (active > inactive); | ||
1590 | } | ||
1591 | |||
1592 | /** | 1582 | /** |
1593 | * inactive_file_is_low - check if file pages need to be deactivated | 1583 | * inactive_file_is_low - check if file pages need to be deactivated |
1594 | * @lruvec: LRU vector to check | 1584 | * @lruvec: LRU vector to check |
@@ -1605,10 +1595,13 @@ static int inactive_file_is_low_global(struct zone *zone) | |||
1605 | */ | 1595 | */ |
1606 | static int inactive_file_is_low(struct lruvec *lruvec) | 1596 | static int inactive_file_is_low(struct lruvec *lruvec) |
1607 | { | 1597 | { |
1608 | if (!mem_cgroup_disabled()) | 1598 | unsigned long inactive; |
1609 | return mem_cgroup_inactive_file_is_low(lruvec); | 1599 | unsigned long active; |
1600 | |||
1601 | inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); | ||
1602 | active = get_lru_size(lruvec, LRU_ACTIVE_FILE); | ||
1610 | 1603 | ||
1611 | return inactive_file_is_low_global(lruvec_zone(lruvec)); | 1604 | return active > inactive; |
1612 | } | 1605 | } |
1613 | 1606 | ||
1614 | static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) | 1607 | static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) |
@@ -1638,6 +1631,13 @@ static int vmscan_swappiness(struct scan_control *sc) | |||
1638 | return mem_cgroup_swappiness(sc->target_mem_cgroup); | 1631 | return mem_cgroup_swappiness(sc->target_mem_cgroup); |
1639 | } | 1632 | } |
1640 | 1633 | ||
1634 | enum scan_balance { | ||
1635 | SCAN_EQUAL, | ||
1636 | SCAN_FRACT, | ||
1637 | SCAN_ANON, | ||
1638 | SCAN_FILE, | ||
1639 | }; | ||
1640 | |||
1641 | /* | 1641 | /* |
1642 | * Determine how aggressively the anon and file LRU lists should be | 1642 | * Determine how aggressively the anon and file LRU lists should be |
1643 | * scanned. The relative value of each set of LRU lists is determined | 1643 | * scanned. The relative value of each set of LRU lists is determined |
@@ -1650,15 +1650,16 @@ static int vmscan_swappiness(struct scan_control *sc) | |||
1650 | static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | 1650 | static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, |
1651 | unsigned long *nr) | 1651 | unsigned long *nr) |
1652 | { | 1652 | { |
1653 | unsigned long anon, file, free; | 1653 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
1654 | u64 fraction[2]; | ||
1655 | u64 denominator = 0; /* gcc */ | ||
1656 | struct zone *zone = lruvec_zone(lruvec); | ||
1654 | unsigned long anon_prio, file_prio; | 1657 | unsigned long anon_prio, file_prio; |
1658 | enum scan_balance scan_balance; | ||
1659 | unsigned long anon, file, free; | ||
1660 | bool force_scan = false; | ||
1655 | unsigned long ap, fp; | 1661 | unsigned long ap, fp; |
1656 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; | ||
1657 | u64 fraction[2], denominator; | ||
1658 | enum lru_list lru; | 1662 | enum lru_list lru; |
1659 | int noswap = 0; | ||
1660 | bool force_scan = false; | ||
1661 | struct zone *zone = lruvec_zone(lruvec); | ||
1662 | 1663 | ||
1663 | /* | 1664 | /* |
1664 | * If the zone or memcg is small, nr[l] can be 0. This | 1665 | * If the zone or memcg is small, nr[l] can be 0. This |
@@ -1676,11 +1677,30 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1676 | force_scan = true; | 1677 | force_scan = true; |
1677 | 1678 | ||
1678 | /* If we have no swap space, do not bother scanning anon pages. */ | 1679 | /* If we have no swap space, do not bother scanning anon pages. */ |
1679 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | 1680 | if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { |
1680 | noswap = 1; | 1681 | scan_balance = SCAN_FILE; |
1681 | fraction[0] = 0; | 1682 | goto out; |
1682 | fraction[1] = 1; | 1683 | } |
1683 | denominator = 1; | 1684 | |
1685 | /* | ||
1686 | * Global reclaim will swap to prevent OOM even with no | ||
1687 | * swappiness, but memcg users want to use this knob to | ||
1688 | * disable swapping for individual groups completely when | ||
1689 | * using the memory controller's swap limit feature would be | ||
1690 | * too expensive. | ||
1691 | */ | ||
1692 | if (!global_reclaim(sc) && !vmscan_swappiness(sc)) { | ||
1693 | scan_balance = SCAN_FILE; | ||
1694 | goto out; | ||
1695 | } | ||
1696 | |||
1697 | /* | ||
1698 | * Do not apply any pressure balancing cleverness when the | ||
1699 | * system is close to OOM, scan both anon and file equally | ||
1700 | * (unless the swappiness setting disagrees with swapping). | ||
1701 | */ | ||
1702 | if (!sc->priority && vmscan_swappiness(sc)) { | ||
1703 | scan_balance = SCAN_EQUAL; | ||
1684 | goto out; | 1704 | goto out; |
1685 | } | 1705 | } |
1686 | 1706 | ||
@@ -1689,30 +1709,32 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1689 | file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + | 1709 | file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + |
1690 | get_lru_size(lruvec, LRU_INACTIVE_FILE); | 1710 | get_lru_size(lruvec, LRU_INACTIVE_FILE); |
1691 | 1711 | ||
1712 | /* | ||
1713 | * If it's foreseeable that reclaiming the file cache won't be | ||
1714 | * enough to get the zone back into a desirable shape, we have | ||
1715 | * to swap. Better start now and leave the - probably heavily | ||
1716 | * thrashing - remaining file pages alone. | ||
1717 | */ | ||
1692 | if (global_reclaim(sc)) { | 1718 | if (global_reclaim(sc)) { |
1693 | free = zone_page_state(zone, NR_FREE_PAGES); | 1719 | free = zone_page_state(zone, NR_FREE_PAGES); |
1694 | if (unlikely(file + free <= high_wmark_pages(zone))) { | 1720 | if (unlikely(file + free <= high_wmark_pages(zone))) { |
1695 | /* | 1721 | scan_balance = SCAN_ANON; |
1696 | * If we have very few page cache pages, force-scan | ||
1697 | * anon pages. | ||
1698 | */ | ||
1699 | fraction[0] = 1; | ||
1700 | fraction[1] = 0; | ||
1701 | denominator = 1; | ||
1702 | goto out; | ||
1703 | } else if (!inactive_file_is_low_global(zone)) { | ||
1704 | /* | ||
1705 | * There is enough inactive page cache, do not | ||
1706 | * reclaim anything from the working set right now. | ||
1707 | */ | ||
1708 | fraction[0] = 0; | ||
1709 | fraction[1] = 1; | ||
1710 | denominator = 1; | ||
1711 | goto out; | 1722 | goto out; |
1712 | } | 1723 | } |
1713 | } | 1724 | } |
1714 | 1725 | ||
1715 | /* | 1726 | /* |
1727 | * There is enough inactive page cache, do not reclaim | ||
1728 | * anything from the anonymous working set right now. | ||
1729 | */ | ||
1730 | if (!inactive_file_is_low(lruvec)) { | ||
1731 | scan_balance = SCAN_FILE; | ||
1732 | goto out; | ||
1733 | } | ||
1734 | |||
1735 | scan_balance = SCAN_FRACT; | ||
1736 | |||
1737 | /* | ||
1716 | * With swappiness at 100, anonymous and file have the same priority. | 1738 | * With swappiness at 100, anonymous and file have the same priority. |
1717 | * This scanning priority is essentially the inverse of IO cost. | 1739 | * This scanning priority is essentially the inverse of IO cost. |
1718 | */ | 1740 | */ |
@@ -1759,19 +1781,92 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | |||
1759 | out: | 1781 | out: |
1760 | for_each_evictable_lru(lru) { | 1782 | for_each_evictable_lru(lru) { |
1761 | int file = is_file_lru(lru); | 1783 | int file = is_file_lru(lru); |
1784 | unsigned long size; | ||
1762 | unsigned long scan; | 1785 | unsigned long scan; |
1763 | 1786 | ||
1764 | scan = get_lru_size(lruvec, lru); | 1787 | size = get_lru_size(lruvec, lru); |
1765 | if (sc->priority || noswap || !vmscan_swappiness(sc)) { | 1788 | scan = size >> sc->priority; |
1766 | scan >>= sc->priority; | 1789 | |
1767 | if (!scan && force_scan) | 1790 | if (!scan && force_scan) |
1768 | scan = SWAP_CLUSTER_MAX; | 1791 | scan = min(size, SWAP_CLUSTER_MAX); |
1792 | |||
1793 | switch (scan_balance) { | ||
1794 | case SCAN_EQUAL: | ||
1795 | /* Scan lists relative to size */ | ||
1796 | break; | ||
1797 | case SCAN_FRACT: | ||
1798 | /* | ||
1799 | * Scan types proportional to swappiness and | ||
1800 | * their relative recent reclaim efficiency. | ||
1801 | */ | ||
1769 | scan = div64_u64(scan * fraction[file], denominator); | 1802 | scan = div64_u64(scan * fraction[file], denominator); |
1803 | break; | ||
1804 | case SCAN_FILE: | ||
1805 | case SCAN_ANON: | ||
1806 | /* Scan one type exclusively */ | ||
1807 | if ((scan_balance == SCAN_FILE) != file) | ||
1808 | scan = 0; | ||
1809 | break; | ||
1810 | default: | ||
1811 | /* Look ma, no brain */ | ||
1812 | BUG(); | ||
1770 | } | 1813 | } |
1771 | nr[lru] = scan; | 1814 | nr[lru] = scan; |
1772 | } | 1815 | } |
1773 | } | 1816 | } |
1774 | 1817 | ||
1818 | /* | ||
1819 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | ||
1820 | */ | ||
1821 | static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | ||
1822 | { | ||
1823 | unsigned long nr[NR_LRU_LISTS]; | ||
1824 | unsigned long nr_to_scan; | ||
1825 | enum lru_list lru; | ||
1826 | unsigned long nr_reclaimed = 0; | ||
1827 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | ||
1828 | struct blk_plug plug; | ||
1829 | |||
1830 | get_scan_count(lruvec, sc, nr); | ||
1831 | |||
1832 | blk_start_plug(&plug); | ||
1833 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | ||
1834 | nr[LRU_INACTIVE_FILE]) { | ||
1835 | for_each_evictable_lru(lru) { | ||
1836 | if (nr[lru]) { | ||
1837 | nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); | ||
1838 | nr[lru] -= nr_to_scan; | ||
1839 | |||
1840 | nr_reclaimed += shrink_list(lru, nr_to_scan, | ||
1841 | lruvec, sc); | ||
1842 | } | ||
1843 | } | ||
1844 | /* | ||
1845 | * On large memory systems, scan >> priority can become | ||
1846 | * really large. This is fine for the starting priority; | ||
1847 | * we want to put equal scanning pressure on each zone. | ||
1848 | * However, if the VM has a harder time of freeing pages, | ||
1849 | * with multiple processes reclaiming pages, the total | ||
1850 | * freeing target can get unreasonably large. | ||
1851 | */ | ||
1852 | if (nr_reclaimed >= nr_to_reclaim && | ||
1853 | sc->priority < DEF_PRIORITY) | ||
1854 | break; | ||
1855 | } | ||
1856 | blk_finish_plug(&plug); | ||
1857 | sc->nr_reclaimed += nr_reclaimed; | ||
1858 | |||
1859 | /* | ||
1860 | * Even if we did not try to evict anon pages at all, we want to | ||
1861 | * rebalance the anon lru active/inactive ratio. | ||
1862 | */ | ||
1863 | if (inactive_anon_is_low(lruvec)) | ||
1864 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, | ||
1865 | sc, LRU_ACTIVE_ANON); | ||
1866 | |||
1867 | throttle_vm_writeout(sc->gfp_mask); | ||
1868 | } | ||
1869 | |||
1775 | /* Use reclaim/compaction for costly allocs or under memory pressure */ | 1870 | /* Use reclaim/compaction for costly allocs or under memory pressure */ |
1776 | static bool in_reclaim_compaction(struct scan_control *sc) | 1871 | static bool in_reclaim_compaction(struct scan_control *sc) |
1777 | { | 1872 | { |
@@ -1790,7 +1885,7 @@ static bool in_reclaim_compaction(struct scan_control *sc) | |||
1790 | * calls try_to_compact_zone() that it will have enough free pages to succeed. | 1885 | * calls try_to_compact_zone() that it will have enough free pages to succeed. |
1791 | * It will give up earlier than that if there is difficulty reclaiming pages. | 1886 | * It will give up earlier than that if there is difficulty reclaiming pages. |
1792 | */ | 1887 | */ |
1793 | static inline bool should_continue_reclaim(struct lruvec *lruvec, | 1888 | static inline bool should_continue_reclaim(struct zone *zone, |
1794 | unsigned long nr_reclaimed, | 1889 | unsigned long nr_reclaimed, |
1795 | unsigned long nr_scanned, | 1890 | unsigned long nr_scanned, |
1796 | struct scan_control *sc) | 1891 | struct scan_control *sc) |
@@ -1830,15 +1925,15 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, | |||
1830 | * inactive lists are large enough, continue reclaiming | 1925 | * inactive lists are large enough, continue reclaiming |
1831 | */ | 1926 | */ |
1832 | pages_for_compaction = (2UL << sc->order); | 1927 | pages_for_compaction = (2UL << sc->order); |
1833 | inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); | 1928 | inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE); |
1834 | if (nr_swap_pages > 0) | 1929 | if (get_nr_swap_pages() > 0) |
1835 | inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); | 1930 | inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON); |
1836 | if (sc->nr_reclaimed < pages_for_compaction && | 1931 | if (sc->nr_reclaimed < pages_for_compaction && |
1837 | inactive_lru_pages > pages_for_compaction) | 1932 | inactive_lru_pages > pages_for_compaction) |
1838 | return true; | 1933 | return true; |
1839 | 1934 | ||
1840 | /* If compaction would go ahead or the allocation would succeed, stop */ | 1935 | /* If compaction would go ahead or the allocation would succeed, stop */ |
1841 | switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) { | 1936 | switch (compaction_suitable(zone, sc->order)) { |
1842 | case COMPACT_PARTIAL: | 1937 | case COMPACT_PARTIAL: |
1843 | case COMPACT_CONTINUE: | 1938 | case COMPACT_CONTINUE: |
1844 | return false; | 1939 | return false; |
@@ -1847,98 +1942,48 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, | |||
1847 | } | 1942 | } |
1848 | } | 1943 | } |
1849 | 1944 | ||
1850 | /* | 1945 | static void shrink_zone(struct zone *zone, struct scan_control *sc) |
1851 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | ||
1852 | */ | ||
1853 | static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | ||
1854 | { | 1946 | { |
1855 | unsigned long nr[NR_LRU_LISTS]; | ||
1856 | unsigned long nr_to_scan; | ||
1857 | enum lru_list lru; | ||
1858 | unsigned long nr_reclaimed, nr_scanned; | 1947 | unsigned long nr_reclaimed, nr_scanned; |
1859 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | ||
1860 | struct blk_plug plug; | ||
1861 | |||
1862 | restart: | ||
1863 | nr_reclaimed = 0; | ||
1864 | nr_scanned = sc->nr_scanned; | ||
1865 | get_scan_count(lruvec, sc, nr); | ||
1866 | |||
1867 | blk_start_plug(&plug); | ||
1868 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | ||
1869 | nr[LRU_INACTIVE_FILE]) { | ||
1870 | for_each_evictable_lru(lru) { | ||
1871 | if (nr[lru]) { | ||
1872 | nr_to_scan = min_t(unsigned long, | ||
1873 | nr[lru], SWAP_CLUSTER_MAX); | ||
1874 | nr[lru] -= nr_to_scan; | ||
1875 | |||
1876 | nr_reclaimed += shrink_list(lru, nr_to_scan, | ||
1877 | lruvec, sc); | ||
1878 | } | ||
1879 | } | ||
1880 | /* | ||
1881 | * On large memory systems, scan >> priority can become | ||
1882 | * really large. This is fine for the starting priority; | ||
1883 | * we want to put equal scanning pressure on each zone. | ||
1884 | * However, if the VM has a harder time of freeing pages, | ||
1885 | * with multiple processes reclaiming pages, the total | ||
1886 | * freeing target can get unreasonably large. | ||
1887 | */ | ||
1888 | if (nr_reclaimed >= nr_to_reclaim && | ||
1889 | sc->priority < DEF_PRIORITY) | ||
1890 | break; | ||
1891 | } | ||
1892 | blk_finish_plug(&plug); | ||
1893 | sc->nr_reclaimed += nr_reclaimed; | ||
1894 | 1948 | ||
1895 | /* | 1949 | do { |
1896 | * Even if we did not try to evict anon pages at all, we want to | 1950 | struct mem_cgroup *root = sc->target_mem_cgroup; |
1897 | * rebalance the anon lru active/inactive ratio. | 1951 | struct mem_cgroup_reclaim_cookie reclaim = { |
1898 | */ | 1952 | .zone = zone, |
1899 | if (inactive_anon_is_low(lruvec)) | 1953 | .priority = sc->priority, |
1900 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, | 1954 | }; |
1901 | sc, LRU_ACTIVE_ANON); | 1955 | struct mem_cgroup *memcg; |
1902 | |||
1903 | /* reclaim/compaction might need reclaim to continue */ | ||
1904 | if (should_continue_reclaim(lruvec, nr_reclaimed, | ||
1905 | sc->nr_scanned - nr_scanned, sc)) | ||
1906 | goto restart; | ||
1907 | 1956 | ||
1908 | throttle_vm_writeout(sc->gfp_mask); | 1957 | nr_reclaimed = sc->nr_reclaimed; |
1909 | } | 1958 | nr_scanned = sc->nr_scanned; |
1910 | 1959 | ||
1911 | static void shrink_zone(struct zone *zone, struct scan_control *sc) | 1960 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
1912 | { | 1961 | do { |
1913 | struct mem_cgroup *root = sc->target_mem_cgroup; | 1962 | struct lruvec *lruvec; |
1914 | struct mem_cgroup_reclaim_cookie reclaim = { | ||
1915 | .zone = zone, | ||
1916 | .priority = sc->priority, | ||
1917 | }; | ||
1918 | struct mem_cgroup *memcg; | ||
1919 | 1963 | ||
1920 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 1964 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
1921 | do { | ||
1922 | struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); | ||
1923 | 1965 | ||
1924 | shrink_lruvec(lruvec, sc); | 1966 | shrink_lruvec(lruvec, sc); |
1925 | 1967 | ||
1926 | /* | 1968 | /* |
1927 | * Limit reclaim has historically picked one memcg and | 1969 | * Direct reclaim and kswapd have to scan all memory |
1928 | * scanned it with decreasing priority levels until | 1970 | * cgroups to fulfill the overall scan target for the |
1929 | * nr_to_reclaim had been reclaimed. This priority | 1971 | * zone. |
1930 | * cycle is thus over after a single memcg. | 1972 | * |
1931 | * | 1973 | * Limit reclaim, on the other hand, only cares about |
1932 | * Direct reclaim and kswapd, on the other hand, have | 1974 | * nr_to_reclaim pages to be reclaimed and it will |
1933 | * to scan all memory cgroups to fulfill the overall | 1975 | * retry with decreasing priority if one round over the |
1934 | * scan target for the zone. | 1976 | * whole hierarchy is not sufficient. |
1935 | */ | 1977 | */ |
1936 | if (!global_reclaim(sc)) { | 1978 | if (!global_reclaim(sc) && |
1937 | mem_cgroup_iter_break(root, memcg); | 1979 | sc->nr_reclaimed >= sc->nr_to_reclaim) { |
1938 | break; | 1980 | mem_cgroup_iter_break(root, memcg); |
1939 | } | 1981 | break; |
1940 | memcg = mem_cgroup_iter(root, memcg, &reclaim); | 1982 | } |
1941 | } while (memcg); | 1983 | memcg = mem_cgroup_iter(root, memcg, &reclaim); |
1984 | } while (memcg); | ||
1985 | } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, | ||
1986 | sc->nr_scanned - nr_scanned, sc)); | ||
1942 | } | 1987 | } |
1943 | 1988 | ||
1944 | /* Returns true if compaction should go ahead for a high-order request */ | 1989 | /* Returns true if compaction should go ahead for a high-order request */ |
@@ -1958,7 +2003,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | |||
1958 | * a reasonable chance of completing and allocating the page | 2003 | * a reasonable chance of completing and allocating the page |
1959 | */ | 2004 | */ |
1960 | balance_gap = min(low_wmark_pages(zone), | 2005 | balance_gap = min(low_wmark_pages(zone), |
1961 | (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2006 | (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / |
1962 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | 2007 | KSWAPD_ZONE_BALANCE_GAP_RATIO); |
1963 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); | 2008 | watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); |
1964 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); | 2009 | watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); |
@@ -2150,6 +2195,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2150 | goto out; | 2195 | goto out; |
2151 | 2196 | ||
2152 | /* | 2197 | /* |
2198 | * If we're getting trouble reclaiming, start doing | ||
2199 | * writepage even in laptop mode. | ||
2200 | */ | ||
2201 | if (sc->priority < DEF_PRIORITY - 2) | ||
2202 | sc->may_writepage = 1; | ||
2203 | |||
2204 | /* | ||
2153 | * Try to write back as many pages as we just scanned. This | 2205 | * Try to write back as many pages as we just scanned. This |
2154 | * tends to cause slow streaming writers to write data to the | 2206 | * tends to cause slow streaming writers to write data to the |
2155 | * disk smoothly, at the dirtying rate, which is nice. But | 2207 | * disk smoothly, at the dirtying rate, which is nice. But |
@@ -2300,7 +2352,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2300 | { | 2352 | { |
2301 | unsigned long nr_reclaimed; | 2353 | unsigned long nr_reclaimed; |
2302 | struct scan_control sc = { | 2354 | struct scan_control sc = { |
2303 | .gfp_mask = gfp_mask, | 2355 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), |
2304 | .may_writepage = !laptop_mode, | 2356 | .may_writepage = !laptop_mode, |
2305 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2357 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2306 | .may_unmap = 1, | 2358 | .may_unmap = 1, |
@@ -2473,7 +2525,7 @@ static bool zone_balanced(struct zone *zone, int order, | |||
2473 | */ | 2525 | */ |
2474 | static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) | 2526 | static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) |
2475 | { | 2527 | { |
2476 | unsigned long present_pages = 0; | 2528 | unsigned long managed_pages = 0; |
2477 | unsigned long balanced_pages = 0; | 2529 | unsigned long balanced_pages = 0; |
2478 | int i; | 2530 | int i; |
2479 | 2531 | ||
@@ -2484,7 +2536,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) | |||
2484 | if (!populated_zone(zone)) | 2536 | if (!populated_zone(zone)) |
2485 | continue; | 2537 | continue; |
2486 | 2538 | ||
2487 | present_pages += zone->present_pages; | 2539 | managed_pages += zone->managed_pages; |
2488 | 2540 | ||
2489 | /* | 2541 | /* |
2490 | * A special case here: | 2542 | * A special case here: |
@@ -2494,18 +2546,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) | |||
2494 | * they must be considered balanced here as well! | 2546 | * they must be considered balanced here as well! |
2495 | */ | 2547 | */ |
2496 | if (zone->all_unreclaimable) { | 2548 | if (zone->all_unreclaimable) { |
2497 | balanced_pages += zone->present_pages; | 2549 | balanced_pages += zone->managed_pages; |
2498 | continue; | 2550 | continue; |
2499 | } | 2551 | } |
2500 | 2552 | ||
2501 | if (zone_balanced(zone, order, 0, i)) | 2553 | if (zone_balanced(zone, order, 0, i)) |
2502 | balanced_pages += zone->present_pages; | 2554 | balanced_pages += zone->managed_pages; |
2503 | else if (!order) | 2555 | else if (!order) |
2504 | return false; | 2556 | return false; |
2505 | } | 2557 | } |
2506 | 2558 | ||
2507 | if (order) | 2559 | if (order) |
2508 | return balanced_pages >= (present_pages >> 2); | 2560 | return balanced_pages >= (managed_pages >> 2); |
2509 | else | 2561 | else |
2510 | return true; | 2562 | return true; |
2511 | } | 2563 | } |
@@ -2564,7 +2616,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |||
2564 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | 2616 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, |
2565 | int *classzone_idx) | 2617 | int *classzone_idx) |
2566 | { | 2618 | { |
2567 | struct zone *unbalanced_zone; | 2619 | bool pgdat_is_balanced = false; |
2568 | int i; | 2620 | int i; |
2569 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2621 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
2570 | unsigned long total_scanned; | 2622 | unsigned long total_scanned; |
@@ -2595,9 +2647,6 @@ loop_again: | |||
2595 | 2647 | ||
2596 | do { | 2648 | do { |
2597 | unsigned long lru_pages = 0; | 2649 | unsigned long lru_pages = 0; |
2598 | int has_under_min_watermark_zone = 0; | ||
2599 | |||
2600 | unbalanced_zone = NULL; | ||
2601 | 2650 | ||
2602 | /* | 2651 | /* |
2603 | * Scan in the highmem->dma direction for the highest | 2652 | * Scan in the highmem->dma direction for the highest |
@@ -2638,8 +2687,11 @@ loop_again: | |||
2638 | zone_clear_flag(zone, ZONE_CONGESTED); | 2687 | zone_clear_flag(zone, ZONE_CONGESTED); |
2639 | } | 2688 | } |
2640 | } | 2689 | } |
2641 | if (i < 0) | 2690 | |
2691 | if (i < 0) { | ||
2692 | pgdat_is_balanced = true; | ||
2642 | goto out; | 2693 | goto out; |
2694 | } | ||
2643 | 2695 | ||
2644 | for (i = 0; i <= end_zone; i++) { | 2696 | for (i = 0; i <= end_zone; i++) { |
2645 | struct zone *zone = pgdat->node_zones + i; | 2697 | struct zone *zone = pgdat->node_zones + i; |
@@ -2689,7 +2741,7 @@ loop_again: | |||
2689 | * of the zone, whichever is smaller. | 2741 | * of the zone, whichever is smaller. |
2690 | */ | 2742 | */ |
2691 | balance_gap = min(low_wmark_pages(zone), | 2743 | balance_gap = min(low_wmark_pages(zone), |
2692 | (zone->present_pages + | 2744 | (zone->managed_pages + |
2693 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2745 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / |
2694 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | 2746 | KSWAPD_ZONE_BALANCE_GAP_RATIO); |
2695 | /* | 2747 | /* |
@@ -2720,12 +2772,10 @@ loop_again: | |||
2720 | } | 2772 | } |
2721 | 2773 | ||
2722 | /* | 2774 | /* |
2723 | * If we've done a decent amount of scanning and | 2775 | * If we're getting trouble reclaiming, start doing |
2724 | * the reclaim ratio is low, start doing writepage | 2776 | * writepage even in laptop mode. |
2725 | * even in laptop mode | ||
2726 | */ | 2777 | */ |
2727 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 2778 | if (sc.priority < DEF_PRIORITY - 2) |
2728 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | ||
2729 | sc.may_writepage = 1; | 2779 | sc.may_writepage = 1; |
2730 | 2780 | ||
2731 | if (zone->all_unreclaimable) { | 2781 | if (zone->all_unreclaimable) { |
@@ -2734,17 +2784,7 @@ loop_again: | |||
2734 | continue; | 2784 | continue; |
2735 | } | 2785 | } |
2736 | 2786 | ||
2737 | if (!zone_balanced(zone, testorder, 0, end_zone)) { | 2787 | if (zone_balanced(zone, testorder, 0, end_zone)) |
2738 | unbalanced_zone = zone; | ||
2739 | /* | ||
2740 | * We are still under min water mark. This | ||
2741 | * means that we have a GFP_ATOMIC allocation | ||
2742 | * failure risk. Hurry up! | ||
2743 | */ | ||
2744 | if (!zone_watermark_ok_safe(zone, order, | ||
2745 | min_wmark_pages(zone), end_zone, 0)) | ||
2746 | has_under_min_watermark_zone = 1; | ||
2747 | } else { | ||
2748 | /* | 2788 | /* |
2749 | * If a zone reaches its high watermark, | 2789 | * If a zone reaches its high watermark, |
2750 | * consider it to be no longer congested. It's | 2790 | * consider it to be no longer congested. It's |
@@ -2753,8 +2793,6 @@ loop_again: | |||
2753 | * speculatively avoid congestion waits | 2793 | * speculatively avoid congestion waits |
2754 | */ | 2794 | */ |
2755 | zone_clear_flag(zone, ZONE_CONGESTED); | 2795 | zone_clear_flag(zone, ZONE_CONGESTED); |
2756 | } | ||
2757 | |||
2758 | } | 2796 | } |
2759 | 2797 | ||
2760 | /* | 2798 | /* |
@@ -2766,17 +2804,9 @@ loop_again: | |||
2766 | pfmemalloc_watermark_ok(pgdat)) | 2804 | pfmemalloc_watermark_ok(pgdat)) |
2767 | wake_up(&pgdat->pfmemalloc_wait); | 2805 | wake_up(&pgdat->pfmemalloc_wait); |
2768 | 2806 | ||
2769 | if (pgdat_balanced(pgdat, order, *classzone_idx)) | 2807 | if (pgdat_balanced(pgdat, order, *classzone_idx)) { |
2808 | pgdat_is_balanced = true; | ||
2770 | break; /* kswapd: all done */ | 2809 | break; /* kswapd: all done */ |
2771 | /* | ||
2772 | * OK, kswapd is getting into trouble. Take a nap, then take | ||
2773 | * another pass across the zones. | ||
2774 | */ | ||
2775 | if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) { | ||
2776 | if (has_under_min_watermark_zone) | ||
2777 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); | ||
2778 | else if (unbalanced_zone) | ||
2779 | wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10); | ||
2780 | } | 2810 | } |
2781 | 2811 | ||
2782 | /* | 2812 | /* |
@@ -2788,9 +2818,9 @@ loop_again: | |||
2788 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) | 2818 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) |
2789 | break; | 2819 | break; |
2790 | } while (--sc.priority >= 0); | 2820 | } while (--sc.priority >= 0); |
2791 | out: | ||
2792 | 2821 | ||
2793 | if (!pgdat_balanced(pgdat, order, *classzone_idx)) { | 2822 | out: |
2823 | if (!pgdat_is_balanced) { | ||
2794 | cond_resched(); | 2824 | cond_resched(); |
2795 | 2825 | ||
2796 | try_to_freeze(); | 2826 | try_to_freeze(); |
@@ -3053,7 +3083,7 @@ unsigned long global_reclaimable_pages(void) | |||
3053 | nr = global_page_state(NR_ACTIVE_FILE) + | 3083 | nr = global_page_state(NR_ACTIVE_FILE) + |
3054 | global_page_state(NR_INACTIVE_FILE); | 3084 | global_page_state(NR_INACTIVE_FILE); |
3055 | 3085 | ||
3056 | if (nr_swap_pages > 0) | 3086 | if (get_nr_swap_pages() > 0) |
3057 | nr += global_page_state(NR_ACTIVE_ANON) + | 3087 | nr += global_page_state(NR_ACTIVE_ANON) + |
3058 | global_page_state(NR_INACTIVE_ANON); | 3088 | global_page_state(NR_INACTIVE_ANON); |
3059 | 3089 | ||
@@ -3067,7 +3097,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone) | |||
3067 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + | 3097 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + |
3068 | zone_page_state(zone, NR_INACTIVE_FILE); | 3098 | zone_page_state(zone, NR_INACTIVE_FILE); |
3069 | 3099 | ||
3070 | if (nr_swap_pages > 0) | 3100 | if (get_nr_swap_pages() > 0) |
3071 | nr += zone_page_state(zone, NR_ACTIVE_ANON) + | 3101 | nr += zone_page_state(zone, NR_ACTIVE_ANON) + |
3072 | zone_page_state(zone, NR_INACTIVE_ANON); | 3102 | zone_page_state(zone, NR_INACTIVE_ANON); |
3073 | 3103 | ||
@@ -3280,9 +3310,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3280 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 3310 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
3281 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 3311 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
3282 | .may_swap = 1, | 3312 | .may_swap = 1, |
3283 | .nr_to_reclaim = max_t(unsigned long, nr_pages, | 3313 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), |
3284 | SWAP_CLUSTER_MAX), | 3314 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), |
3285 | .gfp_mask = gfp_mask, | ||
3286 | .order = order, | 3315 | .order = order, |
3287 | .priority = ZONE_RECLAIM_PRIORITY, | 3316 | .priority = ZONE_RECLAIM_PRIORITY, |
3288 | }; | 3317 | }; |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 9800306c8195..e1d8ed172c42 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -142,7 +142,7 @@ int calculate_normal_threshold(struct zone *zone) | |||
142 | * 125 1024 10 16-32 GB 9 | 142 | * 125 1024 10 16-32 GB 9 |
143 | */ | 143 | */ |
144 | 144 | ||
145 | mem = zone->present_pages >> (27 - PAGE_SHIFT); | 145 | mem = zone->managed_pages >> (27 - PAGE_SHIFT); |
146 | 146 | ||
147 | threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); | 147 | threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); |
148 | 148 | ||
@@ -628,7 +628,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = { | |||
628 | #ifdef CONFIG_CMA | 628 | #ifdef CONFIG_CMA |
629 | "CMA", | 629 | "CMA", |
630 | #endif | 630 | #endif |
631 | #ifdef CONFIG_MEMORY_ISOLATION | ||
631 | "Isolate", | 632 | "Isolate", |
633 | #endif | ||
632 | }; | 634 | }; |
633 | 635 | ||
634 | static void *frag_start(struct seq_file *m, loff_t *pos) | 636 | static void *frag_start(struct seq_file *m, loff_t *pos) |
@@ -768,7 +770,6 @@ const char * const vmstat_text[] = { | |||
768 | "kswapd_inodesteal", | 770 | "kswapd_inodesteal", |
769 | "kswapd_low_wmark_hit_quickly", | 771 | "kswapd_low_wmark_hit_quickly", |
770 | "kswapd_high_wmark_hit_quickly", | 772 | "kswapd_high_wmark_hit_quickly", |
771 | "kswapd_skip_congestion_wait", | ||
772 | "pageoutrun", | 773 | "pageoutrun", |
773 | "allocstall", | 774 | "allocstall", |
774 | 775 | ||
@@ -890,7 +891,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, | |||
890 | int mtype; | 891 | int mtype; |
891 | unsigned long pfn; | 892 | unsigned long pfn; |
892 | unsigned long start_pfn = zone->zone_start_pfn; | 893 | unsigned long start_pfn = zone->zone_start_pfn; |
893 | unsigned long end_pfn = start_pfn + zone->spanned_pages; | 894 | unsigned long end_pfn = zone_end_pfn(zone); |
894 | unsigned long count[MIGRATE_TYPES] = { 0, }; | 895 | unsigned long count[MIGRATE_TYPES] = { 0, }; |
895 | 896 | ||
896 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 897 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index fd05c81cb348..de2e950a0a7a 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c | |||
@@ -87,7 +87,7 @@ struct virtio_chan { | |||
87 | /* This is global limit. Since we don't have a global structure, | 87 | /* This is global limit. Since we don't have a global structure, |
88 | * will be placing it in each channel. | 88 | * will be placing it in each channel. |
89 | */ | 89 | */ |
90 | int p9_max_pages; | 90 | unsigned long p9_max_pages; |
91 | /* Scatterlist: can be too big for stack. */ | 91 | /* Scatterlist: can be too big for stack. */ |
92 | struct scatterlist sg[VIRTQUEUE_NUM]; | 92 | struct scatterlist sg[VIRTQUEUE_NUM]; |
93 | 93 | ||
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a5b89a6fec6d..7427ab5e27d8 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/vmalloc.h> | 21 | #include <linux/vmalloc.h> |
22 | #include <linux/export.h> | 22 | #include <linux/export.h> |
23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
24 | #include <linux/pm_runtime.h> | ||
24 | 25 | ||
25 | #include "net-sysfs.h" | 26 | #include "net-sysfs.h" |
26 | 27 | ||
@@ -1257,6 +1258,8 @@ void netdev_unregister_kobject(struct net_device * net) | |||
1257 | 1258 | ||
1258 | remove_queue_kobjects(net); | 1259 | remove_queue_kobjects(net); |
1259 | 1260 | ||
1261 | pm_runtime_set_memalloc_noio(dev, false); | ||
1262 | |||
1260 | device_del(dev); | 1263 | device_del(dev); |
1261 | } | 1264 | } |
1262 | 1265 | ||
@@ -1301,6 +1304,8 @@ int netdev_register_kobject(struct net_device *net) | |||
1301 | return error; | 1304 | return error; |
1302 | } | 1305 | } |
1303 | 1306 | ||
1307 | pm_runtime_set_memalloc_noio(dev, true); | ||
1308 | |||
1304 | return error; | 1309 | return error; |
1305 | } | 1310 | } |
1306 | 1311 | ||